This adds, incrementally ontop of moving VEC_PERM_EXPR folding to match.pd (but not incremental patch - sorry...), folding of single-element insert permutations to BIT_INSERT_EXPR.
Things are quite awkward with the new poly-int vec-perm stuff so effectively this doesn't work for SVE and is still very ugly. I wonder how to make it generic enough so SVE would be happy and / or how to make the code prettier. I also can't find a helper to read a specific vector element from a VECTOR_CST/CONSTRUCTOR (can I even do that "generally" with a poly-int index?!), but there surely must be one. Richard - any comments / hints? Look at the match.pd change next to /* See if the permutation is performing a single elem Thanks, Richard. Index: gcc/gimple-match-head.c =================================================================== --- gcc/gimple-match-head.c (revision 271321) +++ gcc/gimple-match-head.c (working copy) @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. #include "gimple.h" #include "ssa.h" #include "cgraph.h" +#include "vec-perm-indices.h" #include "fold-const.h" #include "fold-const-call.h" #include "stor-layout.h" Index: gcc/generic-match-head.c =================================================================== --- gcc/generic-match-head.c (revision 271321) +++ gcc/generic-match-head.c (working copy) @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. #include "gimple.h" #include "ssa.h" #include "cgraph.h" +#include "vec-perm-indices.h" #include "fold-const.h" #include "stor-layout.h" #include "tree-dfa.h" Index: gcc/fold-const.c =================================================================== --- gcc/fold-const.c (revision 271321) +++ gcc/fold-const.c (working copy) @@ -9015,7 +9015,7 @@ vec_cst_ctor_to_array (tree arg, unsigne selector. Return the folded VECTOR_CST or CONSTRUCTOR if successful, NULL_TREE otherwise. */ -static tree +tree fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel) { unsigned int i; @@ -11763,7 +11763,10 @@ fold_ternary_loc (location_t loc, enum t return NULL_TREE; case VEC_PERM_EXPR: - if (TREE_CODE (arg2) == VECTOR_CST) + /* Perform constant folding of BIT_INSERT_EXPR. */ + if (TREE_CODE (arg2) == VECTOR_CST + && TREE_CODE (op0) == VECTOR_CST + && TREE_CODE (op1) == VECTOR_CST) { /* Build a vector of integers from the tree mask. */ vec_perm_builder builder; @@ -11774,61 +11777,7 @@ fold_ternary_loc (location_t loc, enum t poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); bool single_arg = (op0 == op1); vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); - - /* Check for cases that fold to OP0 or OP1 in their original - element order. */ - if (sel.series_p (0, 1, 0, 1)) - return op0; - if (sel.series_p (0, 1, nelts, 1)) - return op1; - - if (!single_arg) - { - if (sel.all_from_input_p (0)) - op1 = op0; - else if (sel.all_from_input_p (1)) - { - op0 = op1; - sel.rotate_inputs (1); - } - } - - if ((TREE_CODE (op0) == VECTOR_CST - || TREE_CODE (op0) == CONSTRUCTOR) - && (TREE_CODE (op1) == VECTOR_CST - || TREE_CODE (op1) == CONSTRUCTOR)) - { - tree t = fold_vec_perm (type, op0, op1, sel); - if (t != NULL_TREE) - return t; - } - - bool changed = (op0 == op1 && !single_arg); - - /* Generate a canonical form of the selector. */ - if (arg2 == op2 && sel.encoding () != builder) - { - /* Some targets are deficient and fail to expand a single - argument permutation while still allowing an equivalent - 2-argument version. */ - if (sel.ninputs () == 2 - || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); - else - { - vec_perm_indices sel2 (builder, 2, nelts); - if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel2); - else - /* Not directly supported with either encoding, - so use the preferred form. */ - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); - } - changed = true; - } - - if (changed) - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); + return fold_vec_perm (type, op0, op1, sel); } return NULL_TREE; Index: gcc/fold-const.h =================================================================== --- gcc/fold-const.h (revision 271321) +++ gcc/fold-const.h (working copy) @@ -100,6 +100,9 @@ extern tree fold_bit_and_mask (tree, tre tree, enum tree_code, tree, tree, tree, enum tree_code, tree, tree, tree *); extern tree fold_read_from_constant_string (tree); +#if GCC_VEC_PERN_INDICES_H +extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &); +#endif extern bool wide_int_binop (wide_int &res, enum tree_code, const wide_int &arg1, const wide_int &arg2, signop, wi::overflow_type *); Index: gcc/match.pd =================================================================== --- gcc/match.pd (revision 271321) +++ gcc/match.pd (working copy) @@ -5374,3 +5374,156 @@ (define_operator_list COND_TERNARY (bit_and:elt_type (BIT_FIELD_REF:elt_type @0 { size; } { pos; }) { elt; }))))))) + +(simplify + (vec_perm @0 @1 VECTOR_CST@2) + (with + { + tree op0 = @0, op1 = @1, op2 = @2; + + /* Build a vector of integers from the tree mask. */ + vec_perm_builder builder; + if (!tree_to_vec_perm_builder (&builder, op2)) + return NULL_TREE; + + /* Create a vec_perm_indices for the integer vector. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + bool single_arg = (op0 == op1); + vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); + } + (if (sel.series_p (0, 1, 0, 1)) + { op0; } + (if (sel.series_p (0, 1, nelts, 1)) + { op1; } + (with + { + if (!single_arg) + { + if (sel.all_from_input_p (0)) + op1 = op0; + else if (sel.all_from_input_p (1)) + { + op0 = op1; + sel.rotate_inputs (1); + } + } + gassign *def; + tree cop0 = op0, cop1 = op1; + if (TREE_CODE (op0) == SSA_NAME + && (def = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op0))) + && gimple_assign_rhs_code (def) == CONSTRUCTOR) + cop0 = gimple_assign_rhs1 (def); + if (TREE_CODE (op1) == SSA_NAME + && (def = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op1))) + && gimple_assign_rhs_code (def) == CONSTRUCTOR) + cop1 = gimple_assign_rhs1 (def); + + tree t; + } + (if ((TREE_CODE (cop0) == VECTOR_CST + || TREE_CODE (cop0) == CONSTRUCTOR) + && (TREE_CODE (cop1) == VECTOR_CST + || TREE_CODE (cop1) == CONSTRUCTOR) + && (t = fold_vec_perm (type, cop0, cop1, sel))) + { t; } + (with + { + bool changed = (op0 == op1 && !single_arg); + tree ins = NULL_TREE; + unsigned at = 0; + + /* See if the permutation is performing a single element + insert from a CONSTRUCTOR or constant and use a BIT_INSERT_EXPR + in that case. */ + unsigned HOST_WIDE_INT cnelts; + if ((TREE_CODE (cop0) == VECTOR_CST + || TREE_CODE (cop0) == CONSTRUCTOR + || TREE_CODE (cop1) == VECTOR_CST + || TREE_CODE (cop1) == CONSTRUCTOR) + && nelts.is_constant (&cnelts)) + { + unsigned first = 0, first_oo = 0, first_i; + unsigned second = 0, second_oo = 0, second_i; + HOST_WIDE_INT idx; + for (unsigned HOST_WIDE_INT i = 0; i < cnelts; ++i) + if (!sel[i].is_constant (&idx)) + { + first = second = 0; + break; + } + else if ((unsigned HOST_WIDE_INT)idx < cnelts) + { + first_i = i; + first++; + first_oo += (unsigned HOST_WIDE_INT)idx != i; + } + else + { + second_i = i; + second++; + second_oo += (unsigned HOST_WIDE_INT)idx != i + cnelts; + } + if (first_oo == 0 + && second == 1 + && (TREE_CODE (cop1) == VECTOR_CST + || TREE_CODE (cop1) == CONSTRUCTOR)) + { + unsigned idx = sel[second_i].to_constant () - cnelts; + at = second_i; + if (TREE_CODE (cop1) == VECTOR_CST) + ins = VECTOR_CST_ELT (cop1, idx); + else if (TREE_CODE (cop1) == CONSTRUCTOR) + { + if (CONSTRUCTOR_NELTS (cop1) <= idx) + ins = build_zero_cst (TREE_TYPE (type)); + else + ins = CONSTRUCTOR_ELT (cop1, idx)->value; + } + } + else if (second_oo == 0 + && first == 1 + && (TREE_CODE (cop0) == VECTOR_CST + || TREE_CODE (cop0) == CONSTRUCTOR)) + { + unsigned idx = sel[first_i].to_constant (); + at = first_i; + if (TREE_CODE (cop0) == VECTOR_CST) + ins = VECTOR_CST_ELT (cop0, idx); + else if (TREE_CODE (cop0) == CONSTRUCTOR) + { + if (CONSTRUCTOR_NELTS (cop0) <= idx) + ins = build_zero_cst (TREE_TYPE (type)); + else + ins = CONSTRUCTOR_ELT (cop0, idx)->value; + } + op0 = op1; + } + } + + /* Generate a canonical form of the selector. */ + if (!ins && sel.encoding () != builder) + { + /* Some targets are deficient and fail to expand a single + argument permutation while still allowing an equivalent + 2-argument version. */ + if (sel.ninputs () == 2 + || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); + else + { + vec_perm_indices sel2 (builder, 2, nelts); + if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel2); + else + /* Not directly supported with either encoding, + so use the preferred form. */ + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); + } + changed = true; + } + } + (if (ins) + (bit_insert { op0; } { ins; } + { bitsize_int (at * tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))); }) + (if (changed) + (vec_perm { op0; } { op1; } { op2; }))))))))))