Re: combine permutations in gimple
On Sat, Aug 18, 2012 at 11:59 AM, Marc Glisse marc.gli...@inria.fr wrote: Hello, here is a new patch (passes bootstrap+regtest), which only combines permutations if the result is the identity. I'll file a PR about more general combinations to link to this conversation and the cost hook proposition. Ok? Ok. Thanks, Richard. 2012-08-18 Marc Glisse marc.gli...@inria.fr gcc/ * fold-const.c (fold_ternary_loc): Detect identity permutations. Canonicalize permutations more. * tree-ssa-forwprop.c (is_combined_permutation_identity): New function. (simplify_permutation): Likewise. (ssa_forward_propagate_and_combine): Call it. gcc/testsuite/ * gcc.dg/tree-ssa/forwprop-19.c: New testcase. * gcc.dg/fold-perm.c: Likewise. -- Marc Glisse Index: gcc/fold-const.c === --- gcc/fold-const.c(revision 190500) +++ gcc/fold-const.c(working copy) @@ -14148,58 +14148,96 @@ fold_ternary_loc (location_t loc, enum t return fold_build2_loc (loc, PLUS_EXPR, type, const_binop (MULT_EXPR, arg0, arg1), arg2); if (integer_zerop (arg2)) return fold_build2_loc (loc, MULT_EXPR, type, arg0, arg1); return fold_fma (loc, type, arg0, arg1, arg2); case VEC_PERM_EXPR: if (TREE_CODE (arg2) == VECTOR_CST) { - unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i; + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i, mask; unsigned char *sel = XALLOCAVEC (unsigned char, nelts); tree t; bool need_mask_canon = false; + bool all_in_vec0 = true; + bool all_in_vec1 = true; + bool maybe_identity = true; + bool single_arg = (op0 == op1); + bool changed = false; + mask = single_arg ? 
(nelts - 1) : (2 * nelts - 1); gcc_assert (nelts == VECTOR_CST_NELTS (arg2)); for (i = 0; i nelts; i++) { tree val = VECTOR_CST_ELT (arg2, i); if (TREE_CODE (val) != INTEGER_CST) return NULL_TREE; - sel[i] = TREE_INT_CST_LOW (val) (2 * nelts - 1); + sel[i] = TREE_INT_CST_LOW (val) mask; if (TREE_INT_CST_HIGH (val) || ((unsigned HOST_WIDE_INT) TREE_INT_CST_LOW (val) != sel[i])) need_mask_canon = true; + + if (sel[i] nelts) + all_in_vec1 = false; + else + all_in_vec0 = false; + + if ((sel[i] (nelts-1)) != i) + maybe_identity = false; + } + + if (maybe_identity) + { + if (all_in_vec0) + return op0; + if (all_in_vec1) + return op1; } if ((TREE_CODE (arg0) == VECTOR_CST || TREE_CODE (arg0) == CONSTRUCTOR) (TREE_CODE (arg1) == VECTOR_CST || TREE_CODE (arg1) == CONSTRUCTOR)) { t = fold_vec_perm (type, arg0, arg1, sel); if (t != NULL_TREE) return t; } + if (all_in_vec0) + op1 = op0; + else if (all_in_vec1) + { + op0 = op1; + for (i = 0; i nelts; i++) + sel[i] -= nelts; + need_mask_canon = true; + } + + if (op0 == op1 !single_arg) + changed = true; + if (need_mask_canon arg2 == op2) { tree *tsel = XALLOCAVEC (tree, nelts); tree eltype = TREE_TYPE (TREE_TYPE (arg2)); for (i = 0; i nelts; i++) tsel[i] = build_int_cst (eltype, sel[i]); - t = build_vector (TREE_TYPE (arg2), tsel); - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, t); + op2 = build_vector (TREE_TYPE (arg2), tsel); + changed = true; } + + if (changed) + return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); } return NULL_TREE; default: return NULL_TREE; } /* switch (code) */ } /* Perform constant folding and related simplification of EXPR. The related simplifications include x*1 = x, x*0 = 0, etc., Index: gcc/tree-ssa-forwprop.c === --- gcc/tree-ssa-forwprop.c (revision 190500) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -2570,20 +2570,109 @@ combine_conversions (gimple_stmt_iterato gimple_assign_set_rhs_code (stmt, CONVERT_EXPR); update_stmt (stmt); return remove_prop_source_from_use (op0) ? 
2 : 1; } } } return 0; } +/* Determine whether applying the 2
Re: combine permutations in gimple
On Wed, 15 Aug 2012, Richard Guenther wrote: On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: Of-course, the problem here is this change of semantics with the hook TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes with constants in the *absence* of being able to deal with them with the specialized permutes. fwprop will now leave us at a point where each target has to now grow more knowledge with respect to how best to expand a generic constant permute with a sequence of permutes rather than just using the generic permute. Generating a sequence of permutes from a single constant permute will be a harder problem than (say) dealing with immediate expansions so you are pushing more complexity into the target but in the short term target maintainers should definitely have a heads up that folks using vector permute intrinsics could actually have performance regressions on their targets. It's of course the same with the user input containing such a non-optimal handled constant permute. So I'm less convinced that it's too much burden on the target side. OTOH if there is a generic kind of shuffles that targets do not implement directly but can be simulated by two that are directly implemented pushing the logic to the expander (and adjusting the target hook semantic) would be ok. There's a wonderful knapsack-class problem in there, something for next year's GSoC? (Given a constant permutation, synthesize it with a set of simpler operations with total cost constant_shuffle, where the simpler operation and costs are target-specific (query for presence by TARGET_VEC_PERM_CONST_OK and cost hooks; decompose to simpler operation by generic heuristics). A similar but simpler problem is to synthesize a constant vector instead of loading it from memory (though a load may be cheap enough for most SIMD targets that this may be uninteresting to generalize). brgds, H-P
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Marc Glisse wrote: I'll give it a few more days for the conversation to settle, so I know what I should do between: - the barely modified patch you accepted, - the check asked by Jakub, - the restriction to identity that prevents any regression (well...), - something else? I didn't realize the conversation would go quiet immediately... Is someone willing to take the responsibility of telling me which approach is right? I can add Jakub's checks (though I am not sure how I would test them), but I would rather not do it if the conclusion is that they are either unnecessary (original patch is ok) or insufficient (don't avoid Ramana's regressions). I can do the very safe option 3 (only combine permutations when the result is the identity permutation), but I first want to make sure the more general thing is bad. (sorry, it's my first patch that gets conflicting answers...) -- Marc Glisse
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 1:22 PM, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 13 Aug 2012, Marc Glisse wrote: I'll give it a few more days for the conversation to settle, so I know what I should do between: - the barely modified patch you accepted, - the check asked by Jakub, - the restriction to identity that prevents any regression (well...), - something else? I didn't realize the conversation would go quiet immediately... Is someone willing to take the responsibility of telling me which approach is right? I can add Jakub's checks (though I am not sure how I would test them), but I would rather not do it if the conclusion is that they are either unnecessary (original patch is ok) or insufficient (don't avoid Ramana's regressions). I can do the very safe option 3 (only combine permutations when the result is the identity permutation), but I first want to make sure the more general thing is bad. (sorry, it's my first patch that gets conflicting answers...) Well, we're waiting for someone to break the tie ... I'd go with the original patch, improving the backends where necessary. Richard. -- Marc Glisse
Re: combine permutations in gimple
[It looks like I missed hitting the send button on this response] Seems to be one instruction shorter at least ;-) Yes, there can be much worse regressions than that because of the patch (like 40 instructions instead of 4, in the x86 backend). If this is replacing 4 instructions with 40 in x86 backend maybe someone will notice :) Not a win in this particular testcase because the compiler replaces 2 constant permutes ( that's about 4 cycles) with a load from the constant pool , a generic permute and in addition are polluting the icache with guff in the constant pool . If you go to 3 -4 permutes into a single one then it might be a win but not till that point. with a-b without first asking the backend whether it might be more efficient. One permutation is better than 2. It just happens that the range of possible permutations is too large (and the basic instructions are too strange) for backends to do a good job on them, and thus keeping toplevel input as a hint is important. Of-course, the problem here is this change of semantics with the hook TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes with constants in the *absence* of being able to deal with them with the specialized permutes. fwprop will now leave us at a point where each target has to now grow more knowledge with respect to how best to expand a generic constant permute with a sequence of permutes rather than just using the generic permute. Generating a sequence of permutes from a single constant permute will be a harder problem than (say) dealing with immediate expansions so you are pushing more complexity into the target but in the short term target maintainers should definitely have a heads up that folks using vector permute intrinsics could actually have performance regressions on their targets. Thanks, Ramana
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: [It looks like I missed hitting the send button on this response] Seems to be one instruction shorter at least ;-) Yes, there can be much worse regressions than that because of the patch (like 40 instructions instead of 4, in the x86 backend). If this is replacing 4 instructions with 40 in x86 backend maybe someone will notice :) Not a win in this particular testcase because the compiler replaces 2 constant permutes ( that's about 4 cycles) with a load from the constant pool , a generic permute and in addition are polluting the icache with guff in the constant pool . If you go to 3 -4 permutes into a single one then it might be a win but not till that point. with a-b without first asking the backend whether it might be more efficient. One permutation is better than 2. It just happens that the range of possible permutations is too large (and the basic instructions are too strange) for backends to do a good job on them, and thus keeping toplevel input as a hint is important. Of-course, the problem here is this change of semantics with the hook TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes with constants in the *absence* of being able to deal with them with the specialized permutes. fwprop will now leave us at a point where each target has to now grow more knowledge with respect to how best to expand a generic constant permute with a sequence of permutes rather than just using the generic permute. Generating a sequence of permutes from a single constant permute will be a harder problem than (say) dealing with immediate expansions so you are pushing more complexity into the target but in the short term target maintainers should definitely have a heads up that folks using vector permute intrinsics could actually have performance regressions on their targets. It's of course the same with the user input containing such a non-optimal handled constant permute. 
So I'm less convinced that it's too much burden on the target side. OTOH if there is a generic kind of shuffles that targets do not implement directly but can be simulated by two that are directly implemented pushing the logic to the expander (and adjusting the target hook semantic) would be ok. Richard. Thanks, Ramana
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: Well, we're waiting for someone to break the tie ... I'd go with the original patch, improving the backends where necessary. E.g. i?86/x86_64 with just plain -msse2 has only very small subset of constant shuffles (and no variable shuffle), so by doing the transformation you could end up with a scalarized shuffle instead of two constant vector shuffles. Expecting the backend to figure it out and doing the two constant vector shuffles in every case is not realistic, i386/x86_64 has way too many different shuffling instructions (and worse different ISA levels have different subsets of them) and while a lot of effort has been spent on it already by Richard, me, Marc and others, we are nowhere close to having optimal sequence in many cases for various modes and ISA levels. Doing a brute force might be too expensive. Jakub
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 2:07 PM, Jakub Jelinek ja...@redhat.com wrote: On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: Well, we're waiting for someone to break the tie ... I'd go with the original patch, improving the backends where necessary. E.g. i?86/x86_64 with just plain -msse2 has only very small subset of constant shuffles (and no variable shuffle), so by doing the transformation you could end up with a scalarized shuffle instead of two constant vector shuffles. Expecting the backend to figure it out and doing the two constant vector shuffles in every case is not realistic, i386/x86_64 has way too many different shuffling instructions (and worse different ISA levels have different subsets of them) and while a lot of effort has been spent on it already by Richard, me, Marc and others, we are nowhere close to having optimal sequence in many cases for various modes and ISA levels. Doing a brute force might be too expensive. Ok. That would still leave us with the issue Ramana brought up - the target hook returning true unconditionally if a generic permute is implemented. We just avoid generic expansion by tree-vect-generic.c that way. (I wonder if there exists some automated machinery that finds a path using existing instructions to shuffle from {0, 1, ..., n } to the mask and if that would be a viable way to handle shuffle expansions, or pre-compute them all) Richard. Jakub
Re: combine permutations in gimple
On 15 August 2012 13:07, Jakub Jelinek ja...@redhat.com wrote: On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote: Well, we're waiting for someone to break the tie ... I'd go with the original patch, improving the backends where necessary. E.g. i?86/x86_64 with just plain -msse2 has only very small subset of constant shuffles (and no variable shuffle), so by doing the transformation you could end up with a scalarized shuffle instead of two constant vector shuffles. Expecting the backend to figure it out and doing the two constant vector shuffles in every case is not realistic, i386/x86_64 has way too many different shuffling instructions (and worse different ISA levels have different subsets of them) and while a lot of effort has been spent on it already by Richard, me, Marc and others, we are nowhere close to having optimal sequence in many cases for various modes and ISA levels. Doing a brute force might be too expensive. Neon has atleast 7-8 such forms. Brute force computing this would be expensive. I don't know what happens on other vector targets - surely other SIMD targets in GCC will also have the same problem. Ramana Jakub
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: Ok. That would still leave us with the issue Ramana brought up - the target hook returning true unconditionally if a generic permute is implemented. We just avoid generic expansion by tree-vect-generic.c that way. Yeah, if there is a generic permute, can_vec_perm_p will return true always for the modes for which it is available. Which is why I've been suggesting also the target hook which should return false if only generic permute is going to be used. Jakub
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek ja...@redhat.com wrote: On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: Ok. That would still leave us with the issue Ramana brought up - the target hook returning true unconditionally if a generic permute is implemented. We just avoid generic expansion by tree-vect-generic.c that way. Yeah, if there is a generic permute, can_vec_perm_p will return true always for the modes for which it is available. Which is why I've been suggesting also the target hook which should return false if only generic permute is going to be used. Well. What about returning a cost instead? We don't want to transform two insns to four either. Richard. Jakub
Re: combine permutations in gimple
On Wed, Aug 15, 2012 at 2:53 PM, Jakub Jelinek ja...@redhat.com wrote: On Wed, Aug 15, 2012 at 02:36:54PM +0200, Richard Guenther wrote: On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek ja...@redhat.com wrote: On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote: Ok. That would still leave us with the issue Ramana brought up - the target hook returning true unconditionally if a generic permute is implemented. We just avoid generic expansion by tree-vect-generic.c that way. Yeah, if there is a generic permute, can_vec_perm_p will return true always for the modes for which it is available. Which is why I've been suggesting also the target hook which should return false if only generic permute is going to be used. Well. What about returning a cost instead? We don't want to transform two insns to four either. That could work, it would just be a lot of work to change it (and more costly). E.g. on i386 for 16-byte vectors with certain ISAs we return true right away, when returning cost we'd need to go the slow path always even when the caller is just interested in a boolean flag whether it is possible or not. CCing rth as the author of the hooks. We'd want more accurate costs for the vectorizer cost model anyways. Richard. Jakub
Re: combine permutations in gimple
On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote: On Fri, 10 Aug 2012, Marc Glisse wrote: this patch detects permutations of permutations and merges them. It also canonicalizes permutations a bit more. There are several issues with this patch: 1) I am not sure we always want to combine permutations. Indeed, someone (user? vectorizer?) may have written 2 permutations to help the backend generate optimal code, whereas it will do a bad job on the complicated combined permutation. At tree level, I am not sure what can be done except possibly restrict the optimization to the case where the combined permutation is the identity. At the RTL level, we could compare what insns are generated, but that's going to be painful (doing anything with permutations at the RTL level is painful). 2) I don't understand how the cleanup works in tree-ssa-forwprop.c. I copied some fold/update/remove lines from other simplifications, but I don't know if they are right. 3) In a first version of the patch, where I had SSA_NAME_DEF_STMT instead of get_prop_source_stmt, I got an ICE in one of the torture vectorization testcases. It happened in expand_expr_real_2, because it asserts that expand_vec_perm will never fail. However, expand_vec_perm does return NULL_RTX sometimes. Here it was in V16QImode, with the permutation {0,0,2,2,4,...,14,14}. maybe_expand_insn can't handle it directly (that's ok I guess), but then expand_vec_perm sees VOIDmode and exits instead of trying other ways. I don't know if there is a latent bug or if (more likely) my patch may produce trees with wrong modes. 4) Is this the right place? This isn't the transformation I am most interested in, but it is a first step to see if the direction is right. Hello, there was yet another issue with the version I posted: the testcase was trivial so I didn't notice that it didn't perform the optimization at all... Here is a new version. 
It seems a bit strange to me that there are plenty of functions that check for single-use variables, but there isn't any that checks for variables used in a single statement (but possibly twice). So I may be doing something strange, but it seems to be the natural test here. Happily passed bootstrap+regtest. 2012-08-11 Marc Glisse marc.gli...@inria.fr gcc/ * fold-const.c (fold_ternary_loc): Detect identity permutations. Canonicalize permutations more. * tree-ssa-forwprop.c (is_only_used_in): New function. (simplify_permutation): Likewise. (ssa_forward_propagate_and_combine): Call it. gcc/testsuite/ * gcc.dg/tree-ssa/forwprop-19.c: New testcase. -- Marc Glisse Index: gcc/tree-ssa-forwprop.c === --- gcc/tree-ssa-forwprop.c (revision 190311) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato gimple_assign_set_rhs_code (stmt, CONVERT_EXPR); update_stmt (stmt); return remove_prop_source_from_use (op0) ? 2 : 1; } } } return 0; } +/* Return true if VAR has no nondebug use but in stmt. */ +static bool +is_only_used_in (const_tree var, const_gimple stmt) +{ + const ssa_use_operand_t *const ptr0 = (SSA_NAME_IMM_USE_NODE (var)); + const ssa_use_operand_t *ptr = ptr0-next; + + for (; ptr != ptr0; ptr = ptr-next) +{ + const_gimple use_stmt = USE_STMT (ptr); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt != stmt) + return false; +} + return true; +} if (!single_imm_use (var, use, use_stmt) || use_stmt != stmt) instead of the above. +/* Combine two shuffles in a row. Returns 1 if there were any changes + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ + +static int +simplify_permutation (gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + gimple def_stmt; + tree op0, op1, op2, op3, mask; + enum tree_code code = gimple_assign_rhs_code (stmt); + enum tree_code code2; + location_t loc = gimple_location (stmt); + + gcc_checking_assert (code == VEC_PERM_EXPR); + + op0 = gimple_assign_rhs1 (stmt); + op1 = gimple_assign_rhs2 (stmt); + op2 = gimple_assign_rhs3 (stmt); + + if (TREE_CODE (op0) != SSA_NAME) +return 0; + + if (TREE_CODE (op2) != VECTOR_CST) +return 0; + + def_stmt = SSA_NAME_DEF_STMT (op0); + if (!def_stmt || !is_gimple_assign (def_stmt) + || !can_propagate_from (def_stmt) + || !is_only_used_in (op0, stmt)) Or rather than the above, simply check || !has_single_use (op0) here. +return 0; + + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ Ah ... so then || num_imm_uses (op0) != 2 + code2 = gimple_assign_rhs_code (def_stmt); + + /*
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Richard Guenther wrote: On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote: On Fri, 10 Aug 2012, Marc Glisse wrote: 1) I am not sure we always want to combine permutations. Indeed, someone (user? vectorizer?) may have written 2 permutations to help the backend generate optimal code, whereas it will do a bad job on the complicated combined permutation. At tree level, I am not sure what can be done except possibly restrict the optimization to the case where the combined permutation is the identity. At the RTL level, we could compare what insns are generated, but that's going to be painful (doing anything with permutations at the RTL level is painful). I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. +/* Return true if VAR has no nondebug use but in stmt. */ +static bool +is_only_used_in (const_tree var, const_gimple stmt) +{ + const ssa_use_operand_t *const ptr0 = (SSA_NAME_IMM_USE_NODE (var)); + const ssa_use_operand_t *ptr = ptr0-next; + + for (; ptr != ptr0; ptr = ptr-next) +{ + const_gimple use_stmt = USE_STMT (ptr); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt != stmt) + return false; +} + return true; +} if (!single_imm_use (var, use, use_stmt) || use_stmt != stmt) instead of the above. I don't think that works with statements that use a variable twice. +/* Combine two shuffles in a row. Returns 1 if there were any changes + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ + +static int +simplify_permutation (gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + gimple def_stmt; + tree op0, op1, op2, op3, mask; + enum tree_code code = gimple_assign_rhs_code (stmt); + enum tree_code code2; + location_t loc = gimple_location (stmt); + + gcc_checking_assert (code == VEC_PERM_EXPR); + + op0 = gimple_assign_rhs1 (stmt); + op1 = gimple_assign_rhs2 (stmt); + op2 = gimple_assign_rhs3 (stmt); + + if (TREE_CODE (op0) != SSA_NAME) +return 0; + + if (TREE_CODE (op2) != VECTOR_CST) +return 0; + + def_stmt = SSA_NAME_DEF_STMT (op0); + if (!def_stmt || !is_gimple_assign (def_stmt) + || !can_propagate_from (def_stmt) + || !is_only_used_in (op0, stmt)) Or rather than the above, simply check || !has_single_use (op0) here. Then there was my previous (non-working) patch that used get_prop_source_stmt. +return 0; + + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ Ah ... so then || num_imm_uses (op0) != 2 Ah, ok, that's simpler indeed, but there were such dire warnings to never use that evil function unless absolutely necessary that I didn't dare use it... Thanks for the permission. + code2 = gimple_assign_rhs_code (def_stmt); + + /* Two consecutive shuffles. */ + if (code2 == VEC_PERM_EXPR) +{ + op3 = gimple_assign_rhs3 (def_stmt); + if (TREE_CODE (op3) != VECTOR_CST) + return 0; + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), + op3, op3, op2); + gcc_assert (TREE_CODE (mask) == VECTOR_CST); which means you can use mask = fold_ternary (loc, ...); Ok with that changes. Thanks a lot, I'll do that. Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or VEC_PERM_EXPR(CONSTRUCTOR). Is there a good way to determine whether this kind of combination should be done forwards or backwards (i.e. start from VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start from BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used here? 
-- Marc Glisse
Re: combine permutations in gimple
On Mon, Aug 13, 2012 at 3:03 PM, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 13 Aug 2012, Richard Guenther wrote: On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote: On Fri, 10 Aug 2012, Marc Glisse wrote: 1) I am not sure we always want to combine permutations. Indeed, someone (user? vectorizer?) may have written 2 permutations to help the backend generate optimal code, whereas it will do a bad job on the complicated combined permutation. At tree level, I am not sure what can be done except possibly restrict the optimization to the case where the combined permutation is the identity. At the RTL level, we could compare what insns are generated, but that's going to be painful (doing anything with permutations at the RTL level is painful). I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. +/* Return true if VAR has no nondebug use but in stmt. */ +static bool +is_only_used_in (const_tree var, const_gimple stmt) +{ + const ssa_use_operand_t *const ptr0 = (SSA_NAME_IMM_USE_NODE (var)); + const ssa_use_operand_t *ptr = ptr0-next; + + for (; ptr != ptr0; ptr = ptr-next) +{ + const_gimple use_stmt = USE_STMT (ptr); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt != stmt) + return false; +} + return true; +} if (!single_imm_use (var, use, use_stmt) || use_stmt != stmt) instead of the above. I don't think that works with statements that use a variable twice. +/* Combine two shuffles in a row. Returns 1 if there were any changes + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ + +static int +simplify_permutation (gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + gimple def_stmt; + tree op0, op1, op2, op3, mask; + enum tree_code code = gimple_assign_rhs_code (stmt); + enum tree_code code2; + location_t loc = gimple_location (stmt); + + gcc_checking_assert (code == VEC_PERM_EXPR); + + op0 = gimple_assign_rhs1 (stmt); + op1 = gimple_assign_rhs2 (stmt); + op2 = gimple_assign_rhs3 (stmt); + + if (TREE_CODE (op0) != SSA_NAME) +return 0; + + if (TREE_CODE (op2) != VECTOR_CST) +return 0; + + def_stmt = SSA_NAME_DEF_STMT (op0); + if (!def_stmt || !is_gimple_assign (def_stmt) + || !can_propagate_from (def_stmt) + || !is_only_used_in (op0, stmt)) Or rather than the above, simply check || !has_single_use (op0) here. Then there was my previous (non-working) patch that used get_prop_source_stmt. +return 0; + + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ Ah ... so then || num_imm_uses (op0) != 2 Ah, ok, that's simpler indeed, but there were such dire warnings to never use that evil function unless absolutely necessary that I didn't dare use it... Thanks for the permission. If your new predicate would match more places (can you do a quick search?) then we want it in tree-flow-inline.h instead of in tree-ssa-forwprop.c. But yes, num_imm_uses can be expensive. For now just stick with the above. + code2 = gimple_assign_rhs_code (def_stmt); + + /* Two consecutive shuffles. */ + if (code2 == VEC_PERM_EXPR) +{ + op3 = gimple_assign_rhs3 (def_stmt); + if (TREE_CODE (op3) != VECTOR_CST) + return 0; + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), + op3, op3, op2); + gcc_assert (TREE_CODE (mask) == VECTOR_CST); which means you can use mask = fold_ternary (loc, ...); Ok with that changes. Thanks a lot, I'll do that. Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or VEC_PERM_EXPR(CONSTRUCTOR). 
Is there a good way to determine whether this kind of combination should be done forwards or backwards (i.e. start from VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start from BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used here? The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see if the def stmt is a VEC_PERM_EXPR. Richard. -- Marc Glisse
Re: combine permutations in gimple
I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail, surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space? I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testing than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. regards, Ramana
Re: combine permutations in gimple
On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail,. surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space . I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testting than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Richard. regards, Ramana
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Richard Guenther wrote: On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail,. surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space . I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testting than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. What do you call constant / non-constant? The combined permutation is still constant, although the expansion (in the back-end) might fail to expand it efficiently and fall back to the generic permutation expansion... The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Which, to be honest, is false on all platforms I know, although I did contribute some minor enhancements for x86. -- Marc Glisse
Re: combine permutations in gimple
On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail,. surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space . I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testting than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Still, the patch should do some tests whether it is beneficial. At least a can_vec_perm_p (mode, false, sel) test of the resulting permutation if both the original permutations pass that test, and perhaps additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and passes for both the original permutations then it should also pass for the resulting permutation. Jakub
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Richard Guenther wrote: + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ Ah ... so then || num_imm_uses (op0) != 2 Ah, ok, that's simpler indeed, but there were such dire warnings to never use that evil function unless absolutely necessary that I didn't dare use it... Thanks for the permission. If your new predicate would match more places (can you do a quick search?) You mean: if there are more optimizations that either already check for double use in the same statement, or could benefit from doing so? I'll take a look. then we want it in tree-flow-inline.h instead of in tree-ssa-forwprop.c. But yes, num_imm_uses can be expensive. For now just stick with the above. I assume the above means num_imm_uses (op0) != 2, since both versions are above ;-) The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see if the def stmt is a VEC_PERM_EXPR. Thanks again, -- Marc Glisse
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Jakub Jelinek wrote: On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Still, the patch should do some tests whether it is beneficial. At least a can_vec_perm_p (mode, false, sel) test of the resulting permutation if both the original permutations pass that test, Sounds good. The last argument should be the constant permutation vector, presented as an array of indices stored in unsigned char? I hadn't realized we already had access to backend information that early in the compilation. It doesn't give the cost though. and perhaps additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and passes for both the original permutations then it should also pass for the resulting permutation. Isn't that implied by the previous test (I see calls to vec_perm_const_ok in there)? Or maybe not quite? -- Marc Glisse
Re: combine permutations in gimple
On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote: On Mon, 13 Aug 2012, Jakub Jelinek wrote: On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Still, the patch should do some tests whether it is beneficial. At least a can_vec_perm_p (mode, false, sel) test of the resulting permutation if both the original permutations pass that test, Sounds good. The last argument should be the constant permutation vector, presented as an array of indices stored in unsigned char? I hadn't realized we already had access to backend information that early in the compilation. It doesn't give the cost though. Yeah, it doesn't give the cost, just whether it is supported at all. and perhaps additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and passes for both the original permutations then it should also pass for the resulting permutation. Isn't that implied by the previous test (I see calls to vec_perm_const_ok in there)? Or maybe not quite? can_vec_perm_p can also return true if e.g. the target supports generic variable permutation for the mode in question. So the targetm.vectorize.vec_perm_const_ok check is stricter than that, it tells you whether it can be supported by some constant permutation (or a sequence of them, you know how the i386 code for that is complicated). Jakub
Re: combine permutations in gimple
On 13 August 2012 14:21, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 13 Aug 2012, Richard Guenther wrote: On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail,. surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space . I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testting than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. What do you call constant / non-constant? The combined permutation is still constant, although the expansion (in the back-end) might fail to expand it efficiently and fall back to the generic permutation expansion... That is exactly what I was worried about. By constant I would expect something that is expanded as a constant permute by the backend - an interleave operation or a transpose operation or indeed some of the funky operations as below in ARM / Neon which is the SIMD architecture I'm most familiar with . 
If you had something that did the following : uint8x8_t tst_vrev642_u8 (uint8x8_t __a) { uint8x8_t __rv; uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0}; uint8x8_t __mask2 = { 0, 8, 2, 10, 4, 12, 6, 14 }; return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ; } I would expect these instructions vrev64.8d0, d0 vmovd16, d0 @ v8qi vtrn.8 d0, d16 bx lr With your patch I see tst_vrev642_u8: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. flddd16, .L2 vtbl.8 d0, {d0}, d16 bx lr .L3: .align 3 .L2: .byte 7 .byte 7 .byte 5 .byte 5 .byte 3 .byte 3 .byte 1 .byte 1 It might be that the backend predicates need tightening in this case but why not try to get cost info about such combinations rather than just doing this gratuitously ? While in this case the ARM port might be wrong , but in general when the vector permute rewrites were done we chose to go ahead and keep the generic constant permutes in the backend as the last resort to fall back to. However if fwprop starts making such transformations really one ought to get relative costs for each of these operations rather than allowing gratuitous replacements. Thanks, Ramana
Re: combine permutations in gimple
On 13 August 2012 14:54, Jakub Jelinek ja...@redhat.com wrote: On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote: On Mon, 13 Aug 2012, Jakub Jelinek wrote: On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote: The patch does not do that. It merely assumes that the target knows how to perform an optimal constant permute and that two constant permutes never generate better code than a single one. Still, the patch should do some tests whether it is beneficial. At least a can_vec_perm_p (mode, false, sel) test of the resulting permutation if both the original permutations pass that test, Sounds good. The last argument should be the constant permutation vector, presented as an array of indices stored in unsigned char? I hadn't realized we already had access to backend information that early in the compilation. It doesn't give the cost though. Yeah, it doesn't give the cost, just whether it is supported at all. Maybe we need an interface of that form. A generic permute operation used for constant permute operations is going to be more expensive than more specialized constant permute operations. It might be that this cost gets amortized over a large number of operations at which point it makes sense but the compiler should make this transformation based on cost rather than just whether something is supported or not. Ramana
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Ramana Radhakrishnan wrote: On 13 August 2012 14:21, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 13 Aug 2012, Richard Guenther wrote: On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan ramana.radhakrish...@linaro.org wrote: I guess people will complain soon enough if this causes horrible performance regressions in vectorized code. Not having looked at your patch in great detail,. surely what we don't want is a situation where 2 constant permutations are converted into one generic permute. Based on a quick read of your patch I couldn't work that out. It might be that 2 constant permutes are cheaper than a generic permute. Have you looked at any examples in that space . I surely wouldn't like to see a sequence of interleave / transpose change into a generic permute operation on Neon as that would be far more expensive than this. It surely needs more testting than just this bit before going in. The reason being that this would likely take more registers and indeed produce loads of a constant pool for the new mask. What do you call constant / non-constant? The combined permutation is still constant, although the expansion (in the back-end) might fail to expand it efficiently and fall back to the generic permutation expansion... That is exactly what I was worried about. By constant I would expect something that is expanded as a constant permute by the backend - an interleave operation or a transpose operation or indeed some of the funky operations as below in ARM / Neon which is the SIMD architecture I'm most familiar with . 
If you had something that did the following : uint8x8_t tst_vrev642_u8 (uint8x8_t __a) { uint8x8_t __rv; uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0}; uint8x8_t __mask2 = { 0, 8, 2, 10, 4, 12, 6, 14 }; return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ; } I would expect these instructions vrev64.8d0, d0 vmovd16, d0 @ v8qi vtrn.8 d0, d16 bx lr With your patch I see tst_vrev642_u8: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. flddd16, .L2 vtbl.8 d0, {d0}, d16 bx lr .L3: .align 3 .L2: .byte 7 .byte 7 .byte 5 .byte 5 .byte 3 .byte 3 .byte 1 .byte 1 Seems to be one instruction shorter at least ;-) Yes, there can be much worse regressions than that because of the patch (like 40 instructions instead of 4, in the x86 backend). It might be that the backend predicates need tightening in this case but why not try to get cost info about such combinations rather than just doing this gratuitously ? I don't think the word gratuitous is appropriate. middle-end replaces a+-b with a-b without first asking the backend whether it might be more efficient. One permutation is better than 2. It just happens that the range of possible permutations is too large (and the basic instructions are too strange) for backends to do a good job on them, and thus keeping toplevel input as a hint is important. While in this case the ARM port might be wrong , but in general when the vector permute rewrites were done we chose to go ahead and keep the generic constant permutes in the backend as the last resort to fall back to. However if fwprop starts making such transformations really one ought to get relative costs for each of these operations rather than allowing gratuitous replacements. Indeed, having costs would help, but that's a lot of work. As you can see from the original email, I am willing to limit the optimization to the case where the combined permutation is the identity (yes, that's quite drastic...). 
I should probably implement that special case anyway, because it doesn't require its argument to have a single use for the optimization to make sense. -- Marc Glisse
Re: combine permutations in gimple
On Mon, 13 Aug 2012, Marc Glisse wrote: On Mon, 13 Aug 2012, Richard Guenther wrote: If your new predicate would match more places (can you do a quick search?) You mean: if there are more optimizations that either already check for double use in the same statement, or could benefit from doing so? I'll take a look. I couldn't find anything obvious (not that I understood a significant percentage of what I saw...). Note that the comments are efficient since the only current use of num_imm_uses is in a debug printf call. I'll give it a few more days for the conversation to settle, so I know what I should do between: - the barely modified patch you accepted, - the check asked by Jakub, - the restriction to identity that prevents any regression (well...), - something else? -- Marc Glisse
Re: combine permutations in gimple
On Fri, 10 Aug 2012, Marc Glisse wrote: this patch detects permutations of permutations and merges them. It also canonicalizes permutations a bit more. There are several issues with this patch: 1) I am not sure we always want to combine permutations. Indeed, someone (user? vectorizer?) may have written 2 permutations to help the backend generate optimal code, whereas it will do a bad job on the complicated combined permutation. At tree level, I am not sure what can be done except possibly restrict the optimization to the case where the combined permutation is the identity. At the RTL level, we could compare what insns are generated, but that's going to be painful (doing anything with permutations at the RTL level is painful). 2) I don't understand how the cleanup works in tree-ssa-forwprop.c. I copied some fold/update/remove lines from other simplifications, but I don't know if they are right. 3) In a first version of the patch, where I had SSA_NAME_DEF_STMT instead of get_prop_source_stmt, I got an ICE in one of the torture vectorization testcases. It happened in expand_expr_real_2, because it asserts that expand_vec_perm will never fail. However, expand_vec_perm does return NULL_RTX sometimes. Here it was in V16QImode, with the permutation {0,0,2,2,4,...,14,14}. maybe_expand_insn can't handle it directly (that's ok I guess), but then expand_vec_perm sees VOIDmode and exits instead of trying other ways. I don't know if there is a latent bug or if (more likely) my patch may produce trees with wrong modes. 4) Is this the right place? This isn't the transformation I am most interested in, but it is a first step to see if the direction is right. Hello, there was yet another issue with the version I posted: the testcase was trivial so I didn't notice that it didn't perform the optimization at all... Here is a new version. 
It seems a bit strange to me that there are plenty of functions that check for single-use variables, but there isn't any that checks for variables used in a single statement (but possibly twice). So I may be doing something strange, but it seems to be the natural test here. Happily passed bootstrap+regtest. 2012-08-11 Marc Glisse marc.gli...@inria.fr gcc/ * fold-const.c (fold_ternary_loc): Detect identity permutations. Canonicalize permutations more. * tree-ssa-forwprop.c (is_only_used_in): New function. (simplify_permutation): Likewise. (ssa_forward_propagate_and_combine): Call it. gcc/testsuite/ * gcc.dg/tree-ssa/forwprop-19.c: New testcase. -- Marc GlisseIndex: gcc/tree-ssa-forwprop.c === --- gcc/tree-ssa-forwprop.c (revision 190311) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato gimple_assign_set_rhs_code (stmt, CONVERT_EXPR); update_stmt (stmt); return remove_prop_source_from_use (op0) ? 2 : 1; } } } return 0; } +/* Return true if VAR has no nondebug use but in stmt. */ +static bool +is_only_used_in (const_tree var, const_gimple stmt) +{ + const ssa_use_operand_t *const ptr0 = (SSA_NAME_IMM_USE_NODE (var)); + const ssa_use_operand_t *ptr = ptr0-next; + + for (; ptr != ptr0; ptr = ptr-next) +{ + const_gimple use_stmt = USE_STMT (ptr); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt != stmt) + return false; +} + return true; +} + +/* Combine two shuffles in a row. Returns 1 if there were any changes + made, 2 if cfg-cleanup needs to run. Else it returns 0. 
*/ + +static int +simplify_permutation (gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + gimple def_stmt; + tree op0, op1, op2, op3, mask; + enum tree_code code = gimple_assign_rhs_code (stmt); + enum tree_code code2; + location_t loc = gimple_location (stmt); + + gcc_checking_assert (code == VEC_PERM_EXPR); + + op0 = gimple_assign_rhs1 (stmt); + op1 = gimple_assign_rhs2 (stmt); + op2 = gimple_assign_rhs3 (stmt); + + if (TREE_CODE (op0) != SSA_NAME) +return 0; + + if (TREE_CODE (op2) != VECTOR_CST) +return 0; + + if (op0 != op1) +return 0; + + def_stmt = SSA_NAME_DEF_STMT (op0); + if (!def_stmt || !is_gimple_assign (def_stmt) + || !can_propagate_from (def_stmt) + || !is_only_used_in (op0, stmt)) +return 0; + + /* Check that it is only used here. We cannot use has_single_use + since the expression is using it twice itself... */ + + code2 = gimple_assign_rhs_code (def_stmt); + + /* Two consecutive shuffles. */ + if (code2 == VEC_PERM_EXPR) +{ + op3 = gimple_assign_rhs3 (def_stmt); + if (TREE_CODE (op3) != VECTOR_CST) + return 0; + mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3), + op3, op3, op2); + gcc_assert (TREE_CODE (mask) == VECTOR_CST); +
Re: combine permutations in gimple
On Fri, 10 Aug 2012, Marc Glisse wrote: There are several issues with this patch: Oups: 5) and the testcase needs fixing, just like Jakub just fixed Richard's testcase, to avoid ABI warnings. But that's easy. -- Marc Glisse