Hi, here is an enhancement to gcc, which allows load/store groups with size being non-power-of-2 to be vectorized. Current implementation is using interleaving permutations to transform load/store groups. That is where power-of-2 requirements comes from. For N-element vectors simplest approch would be to use N single element insertions for any required vector permutation. And for 2-element vectors it is a reasonable amount of insertions. Using this approach allows vectorization for cases, which were not supported before.
bootstrapped and tested on x86_64-pc-linux-gnu and aarch64-linux-gnu. Thanks, Dmitrij
>From acf12c34f4bebbb5c6000a87bf9aaa58e48418bb Mon Sep 17 00:00:00 2001 From: Dmitrij Pochepko <dmitrij.poche...@bell-sw.com> Date: Wed, 15 Jul 2020 18:07:26 +0300 Subject: [PATCH] non-power-of-2 group size can be vectorized for 2-element vectors case (PR96208) Support for non-power-of-2 group size in vectorizer for 2-element vectors. gcc/ChangeLog: 2020-07-15 Dmitrij Pochepko <dmitrij.poche...@bell-sw.com> PR gcc/96208 * gcc/tree-vect-data-refs.c: (vect_all_2element_permutations_supported): New function (vect_permute_load_chain): added new branch with new algo (vect_permute_store_chain): Likewise (vect_grouped_load_supported): modified logic for new algo (vect_grouped_store_supported): Likewise (vect_transform_grouped_load): Likewise gcc/testsuite/ChangeLog: 2020-07-15 Dmitrij Pochepko <dmitrij.poche...@bell-sw.com> PR gcc/96208 * gcc.dg/vect/vect-non-pow2-group.c: New test --- gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c | 25 +++ gcc/tree-vect-data-refs.c | 212 +++++++++++++++++++++--- 2 files changed, 218 insertions(+), 19 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c b/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c new file mode 100644 index 0000000..7a22739 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-non-pow2-group.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ +/* { dg-require-effective-target vect_perm } */ +/* { dg-additional-options "-fdump-tree-vect-details -fno-vect-cost-model -Ofast" } */ + +typedef struct { + double m1, m2, m3, m4, m5; +} the_struct_t; + +double bar1 (the_struct_t*); + +double foo (double* k, unsigned int n, the_struct_t* the_struct) +{ + unsigned int u; + the_struct_t result; + for (u=0; u < n; u++, k--) { + result.m1 += (*k)*the_struct[u].m1; + result.m2 += (*k)*the_struct[u].m2; + result.m3 += (*k)*the_struct[u].m3; + result.m4 += (*k)*the_struct[u].m4; + } + return bar1 (&result); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index e35a215..caf4555 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -5027,6 +5027,37 @@ vect_create_destination_var (tree scalar_dest, tree vectype) return vec_dest; } +/* Function vect_all_2element_permutations_supported + + Returns TRUE if all possible permutations for 2-element + vectors are supported for requested mode. */ + +bool +vect_all_2element_permutations_supported (machine_mode mode) +{ + // check all possible permutations for 2-element vectors + // for 2 vectors it'll be all low and high combinations: + // ll={0, 2}, lh={0, 3}, hl={1,2}, hh={1,3} + poly_uint64 nelt = GET_MODE_NUNITS (mode); + if (!known_eq (nelt, 2ULL)) + return false; + vec_perm_builder sel (nelt, 2, 2); + sel.quick_grow (2); + sel[0] = 0; + sel[1] = 2; + vec_perm_indices ll (sel, 2, 2); + sel[1] = 3; + vec_perm_indices lh (sel, 2, 2); + sel[0] = 1; + vec_perm_indices hh (sel, 2, 2); + sel[1] = 2; + vec_perm_indices hl (sel, 2, 2); + return can_vec_perm_const_p (mode, ll) + && can_vec_perm_const_p (mode, lh) + && can_vec_perm_const_p (mode, hl) + && can_vec_perm_const_p (mode, hh); +} + /* Function vect_grouped_store_supported. Returns TRUE if interleave high and interleave low permutations @@ -5038,13 +5069,15 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) machine_mode mode = TYPE_MODE (vectype); /* vect_permute_store_chain requires the group size to be equal to 3 or - be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) + be a power of two or 2-element vectors to be used. */ + if (count != 3 && exact_log2 (count) == -1 + && !known_eq (GET_MODE_NUNITS (mode), 2ULL)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "the size of the group of accesses" - " is not a power of 2 or not eqaul to 3\n"); + " is not a power of 2 or not eqaul to 3" + " and vector element number is not 2\n"); return false; } @@ -5113,9 +5146,14 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) } return true; } + else if (known_eq (GET_MODE_NUNITS (mode), 2ULL)) + { + return vect_all_2element_permutations_supported (mode); + } else { - /* If length is not equal to 3 then only power of 2 is supported. */ + /* If length is not equal to 3 and vectors are not 2-elements + then only power of 2 is supported. */ gcc_assert (pow2p_hwi (count)); poly_uint64 nelt = GET_MODE_NUNITS (mode); @@ -5239,6 +5277,8 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain, tree data_ref; tree perm3_mask_low, perm3_mask_high; unsigned int i, j, n, log_length = exact_log2 (length); + poly_uint64 nelt_poly = TYPE_VECTOR_SUBPARTS (vectype); + unsigned int unelt; result_chain->quick_grow (length); memcpy (result_chain->address (), dr_chain.address (), @@ -5247,7 +5287,7 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain, if (length == 3) { /* vect_grouped_store_supported ensures that this is constant. */ - unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + unsigned int nelt = nelt_poly.to_constant (); unsigned int j0 = 0, j1 = 0, j2 = 0; vec_perm_builder sel (nelt, nelt, 1); @@ -5308,13 +5348,10 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain, (*result_chain)[j] = data_ref; } } - else + else if (pow2p_hwi (length)) { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - /* The encoding has 2 interleaved stepped patterns. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); + poly_uint64 nelt = nelt_poly; vec_perm_builder sel (nelt, 2, 3); sel.quick_grow (6); for (i = 0; i < 3; i++) @@ -5360,6 +5397,70 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain, length * sizeof (tree)); } } + else + { + gcc_assert (known_eq (nelt_poly, 2ULL)); + unelt = nelt_poly.to_constant (); + vec_perm_builder sel (unelt, unelt, 1); + sel.quick_grow (unelt); + vec_perm_indices indices; + + // perm_x_y == x-th element of 1st vector and y-th element of 2nd vector + tree perm_0_0, perm_0_1, perm_1_1; + sel[0] = 0; + sel[1] = 1 + unelt; + indices.new_vector (sel, 2, unelt); + perm_0_1 = vect_gen_perm_mask_checked (vectype, indices); + + sel[0] = 0; + sel[1] = 0 + unelt; + indices.new_vector (sel, 2, unelt); + perm_0_0 = vect_gen_perm_mask_checked (vectype, indices); + + sel[0] = 1; + sel[1] = 1 + unelt; + indices.new_vector (sel, 2, unelt); + perm_1_1 = vect_gen_perm_mask_checked (vectype, indices); + + if (length & 1) + { + int idx1, idx2; + tree currentPerm; + for (j = 0; j < length; j++) + { + idx1 = (2 * j) % length; + idx2 = (2 * j + 1) % length; + vect1 = dr_chain[idx1]; + vect2 = dr_chain[idx2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm"); + if (j < length / 2) currentPerm = perm_0_0; + else if (j == length / 2) currentPerm = perm_0_1; + else currentPerm = perm_1_1; + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, + vect2, currentPerm); + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); + (*result_chain)[j] = data_ref; + } + } + else + { + for (i = 0; i < unelt; i++) + { + for (j = 0; j < length / 2; j++) + { + vect1 = dr_chain[2 * j]; + vect2 = dr_chain[2 * j + 1]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm"); + tree selectedPerm = (i == 0) ? perm_0_0 : perm_1_1; + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + vect1, vect2, selectedPerm); + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, + gsi); + (*result_chain)[i * length / 2 + j] = data_ref; + } + } + } + } } /* Function vect_setup_realignment @@ -5664,12 +5765,15 @@ vect_grouped_load_supported (tree vectype, bool single_element_p, /* vect_permute_load_chain requires the group size to be equal to 3 or be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) + poly_uint64 poly_unelt = GET_MODE_NUNITS (mode); + if (count != 3 && exact_log2 (count) == -1 + && !known_eq (poly_unelt, 2ULL)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "the size of the group of accesses" - " is not a power of 2 or not equal to 3\n"); + " is not a power of 2 or not equal to 3" + " and vector elements number is not 2\n"); return false; } @@ -5726,9 +5830,14 @@ vect_grouped_load_supported (tree vectype, bool single_element_p, } return true; } + else if (known_eq (poly_unelt, 2ULL)) + { + return vect_all_2element_permutations_supported (mode); + } else { - /* If length is not equal to 3 then only power of 2 is supported. */ + /* If length is not equal to 3 and vector size is not 2 + then only power of 2 is supported. */ gcc_assert (pow2p_hwi (count)); poly_uint64 nelt = GET_MODE_NUNITS (mode); @@ -5862,6 +5971,7 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, gimple *perm_stmt; tree vectype = STMT_VINFO_VECTYPE (stmt_info); unsigned int i, j, log_length = exact_log2 (length); + poly_uint64 nelt_poly = TYPE_VECTOR_SUBPARTS (vectype); result_chain->quick_grow (length); memcpy (result_chain->address (), dr_chain.address (), @@ -5870,7 +5980,7 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, if (length == 3) { /* vect_grouped_load_supported ensures that this is constant. */ - unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + unsigned nelt = nelt_poly.to_constant (); unsigned int k; vec_perm_builder sel (nelt, nelt, 1); @@ -5917,13 +6027,10 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, (*result_chain)[k] = data_ref; } } - else + else if (pow2p_hwi (length)) { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - /* The encoding has a single stepped pattern. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); + poly_uint64 nelt = nelt_poly; vec_perm_builder sel (nelt, 1, 3); sel.quick_grow (3); for (i = 0; i < 3; ++i) @@ -5963,6 +6070,72 @@ vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, length * sizeof (tree)); } } + else + { + gcc_assert (known_eq (nelt_poly, 2ULL)); + unsigned int unelt = nelt_poly.to_constant (); + vec_perm_builder sel (unelt, unelt, 1); + sel.quick_grow (unelt); + vec_perm_indices indices; + + // for 2-element vectors there are 2 cases to consider: + // 1) result vector Vi is {Vn[j], Vm[k]} and j == k + // which means length is even. + // 2) result vector Vi is {Vn[j], Vm[k]} and j != k + // which means length is odd. + + // perm_x_y == x-th element of 1st vector and y-th element of 2nd vector + tree perm_0_0, perm_0_1, perm_1_0, perm_1_1; + sel[0] = 0; + sel[1] = 1 + unelt; + indices.new_vector (sel, 2, unelt); + perm_0_1 = vect_gen_perm_mask_checked (vectype, indices); + + sel[0] = 1; + sel[1] = 0 + unelt; + indices.new_vector (sel, 2, unelt); + perm_1_0 = vect_gen_perm_mask_checked (vectype, indices); + + sel[0] = 0; + sel[1] = 0 + unelt; + indices.new_vector (sel, 2, unelt); + perm_0_0 = vect_gen_perm_mask_checked (vectype, indices); + + sel[0] = 1; + sel[1] = 1 + unelt; + indices.new_vector (sel, 2, unelt); + perm_1_1 = vect_gen_perm_mask_checked (vectype, indices); + + for (i = 0; i < length; i++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm"); + + first_vect = dr_chain[i / unelt]; + second_vect = dr_chain[(i + length) / unelt]; + + tree selectedPerm; + if (length & 1) + { // odd length. different indicies + // odd i, first vector index is 0 and second vector index is 1 + // even i, first vector index is 1 and second vector index is 0 + selectedPerm = (i & 1) ? perm_1_0 : perm_0_1; + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + first_vect, second_vect, + selectedPerm); + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); + (*result_chain)[i] = data_ref; + } + else + { // even length. same indicies (both 0 or both 1) + selectedPerm = (i & 1) ? perm_1_1 : perm_0_0; + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + first_vect, second_vect, + selectedPerm); + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); + (*result_chain)[i] = data_ref; + } + } + } } /* Function vect_shift_permute_load_chain. @@ -6340,6 +6513,7 @@ vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 || pow2p_hwi (size) + || known_eq (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), 2ULL) || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info, gsi, &result_chain)) vect_permute_load_chain (vinfo, dr_chain, -- 2.7.4