Re: combine permutations in gimple

2012-08-20 Thread Richard Guenther
On Sat, Aug 18, 2012 at 11:59 AM, Marc Glisse marc.gli...@inria.fr wrote:
 Hello,

 here is a new patch (passes bootstrap+regtest), which only combines
 permutations if the result is the identity. I'll file a PR about more
 general
 combinations to link to this conversation and the cost hook proposition.

 Ok?

Ok.

Thanks,
Richard.


 2012-08-18  Marc Glisse  marc.gli...@inria.fr


 gcc/
 * fold-const.c (fold_ternary_loc): Detect identity permutations.
 Canonicalize permutations more.
 * tree-ssa-forwprop.c (is_combined_permutation_identity): New
 function.

 (simplify_permutation): Likewise.
 (ssa_forward_propagate_and_combine): Call it.

 gcc/testsuite/
 * gcc.dg/tree-ssa/forwprop-19.c: New testcase.
 * gcc.dg/fold-perm.c: Likewise.

 --
 Marc Glisse
 Index: gcc/fold-const.c
 ===
 --- gcc/fold-const.c(revision 190500)
 +++ gcc/fold-const.c(working copy)
 @@ -14148,58 +14148,96 @@ fold_ternary_loc (location_t loc, enum t
 return fold_build2_loc (loc, PLUS_EXPR, type,
 const_binop (MULT_EXPR, arg0, arg1), arg2);
if (integer_zerop (arg2))
 return fold_build2_loc (loc, MULT_EXPR, type, arg0, arg1);

return fold_fma (loc, type, arg0, arg1, arg2);

  case VEC_PERM_EXPR:
if (TREE_CODE (arg2) == VECTOR_CST)
 {
 - unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
 + unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i, mask;
   unsigned char *sel = XALLOCAVEC (unsigned char, nelts);
   tree t;
   bool need_mask_canon = false;
 + bool all_in_vec0 = true;
 + bool all_in_vec1 = true;
 + bool maybe_identity = true;
 + bool single_arg = (op0 == op1);
 + bool changed = false;

 + mask = single_arg ? (nelts - 1) : (2 * nelts - 1);
   gcc_assert (nelts == VECTOR_CST_NELTS (arg2));
   for (i = 0; i < nelts; i++)
 {
   tree val = VECTOR_CST_ELT (arg2, i);
   if (TREE_CODE (val) != INTEGER_CST)
 return NULL_TREE;

 - sel[i] = TREE_INT_CST_LOW (val) & (2 * nelts - 1);
 + sel[i] = TREE_INT_CST_LOW (val) & mask;
   if (TREE_INT_CST_HIGH (val)
   || ((unsigned HOST_WIDE_INT)
   TREE_INT_CST_LOW (val) != sel[i]))
 need_mask_canon = true;
 +
 + if (sel[i] < nelts)
 +   all_in_vec1 = false;
 + else
 +   all_in_vec0 = false;
 +
 + if ((sel[i] & (nelts - 1)) != i)
 +   maybe_identity = false;
 +   }
 +
 + if (maybe_identity)
 +   {
 + if (all_in_vec0)
 +   return op0;
 + if (all_in_vec1)
 +   return op1;
 }

   if ((TREE_CODE (arg0) == VECTOR_CST
|| TREE_CODE (arg0) == CONSTRUCTOR)
 && (TREE_CODE (arg1) == VECTOR_CST
   || TREE_CODE (arg1) == CONSTRUCTOR))
 {
   t = fold_vec_perm (type, arg0, arg1, sel);
   if (t != NULL_TREE)
 return t;
 }

 + if (all_in_vec0)
 +   op1 = op0;
 + else if (all_in_vec1)
 +   {
 + op0 = op1;
 + for (i = 0; i < nelts; i++)
 +   sel[i] -= nelts;
 + need_mask_canon = true;
 +   }
 +
 + if (op0 == op1 && !single_arg)
 +   changed = true;
 +
   if (need_mask_canon && arg2 == op2)
 {
   tree *tsel = XALLOCAVEC (tree, nelts);
   tree eltype = TREE_TYPE (TREE_TYPE (arg2));
   for (i = 0; i < nelts; i++)
 tsel[i] = build_int_cst (eltype, sel[i]);
 - t = build_vector (TREE_TYPE (arg2), tsel);
 - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, t);
 + op2 = build_vector (TREE_TYPE (arg2), tsel);
 + changed = true;
 }
 +
 + if (changed)
 +   return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2);
 }
return NULL_TREE;

  default:
return NULL_TREE;
  } /* switch (code) */
  }

  /* Perform constant folding and related simplification of EXPR.
 The related simplifications include x*1 = x, x*0 = 0, etc.,
 Index: gcc/tree-ssa-forwprop.c
 ===
 --- gcc/tree-ssa-forwprop.c (revision 190500)
 +++ gcc/tree-ssa-forwprop.c (working copy)
 @@ -2570,20 +2570,109 @@ combine_conversions (gimple_stmt_iterato
   gimple_assign_set_rhs_code (stmt, CONVERT_EXPR);
   update_stmt (stmt);
   return remove_prop_source_from_use (op0) ? 2 : 1;
 }
 }
  }

return 0;
  }

 +/* Determine whether applying the 2 

Re: combine permutations in gimple

2012-08-16 Thread Hans-Peter Nilsson
On Wed, 15 Aug 2012, Richard Guenther wrote:
 On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
  Of-course, the problem here is this change of semantics with the hook
  TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes
  with constants in the *absence* of being able to deal with them with
  the specialized permutes.  fwprop will now leave us at a point where
  each target has to now grow more knowledge with respect to how best to
  expand a generic constant permute with a sequence of permutes rather
  than just using the generic permute.
 
  Generating a sequence of permutes from a single constant permute will
  be a  harder problem than (say) dealing with immediate expansions so
  you are pushing more complexity into the target but in the short term
  target maintainers should definitely have a heads up that folks using
  vector permute intrinsics could actually have performance regressions
  on their targets.

 It's of course the same with the user input containing such a non-optimal
 handled constant permute.  So I'm less convinced that it's too much burden
 on the target side.  OTOH if there is a generic kind of shuffles that targets
 do not implement directly but can be simulated by two that are directly
 implemented pushing the logic to the expander (and adjusting the target
 hook semantic) would be ok.

There's a wonderful knapsack-class problem in there, something
for next year's GSoC?

(Given a constant permutation, synthesize it with a set of
simpler operations with total cost < constant_shuffle, where the
simpler operation and costs are target-specific (query for
presence by TARGET_VEC_PERM_CONST_OK and cost hooks; decompose
to simpler operation by generic heuristics).  A similar but
simpler problem is to synthesize a constant vector instead of
loading it from memory (though a load may be cheap enough for
most SIMD targets that this may be uninteresting to generalize).
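Very roughly, and purely as an illustration with made-up stand-ins for the
target query (the real query and cost hooks do not take this form), the
search could look like this:

/* Toy depth-limited search over a made-up set of "simple" permutations
   (4 lanes); simple_perms stands in for whatever the target would report
   as directly supported.  */
#include <stdio.h>
#include <string.h>

#define N 4
#define MAX_DEPTH 3

static const unsigned char simple_perms[][N] = {
  { 1, 0, 3, 2 },	/* swap within pairs (vrev32-like)  */
  { 2, 3, 0, 1 },	/* swap the two halves              */
};
#define NSIMPLE (sizeof simple_perms / sizeof simple_perms[0])

/* Can TARGET be reached from CUR by composing at most DEPTH simple
   permutations?  A real implementation would also track the cheapest
   sequence found and compare its cost against the generic permute.  */
static int
search (const unsigned char *cur, const unsigned char *target, int depth)
{
  if (memcmp (cur, target, N) == 0)
    return 1;
  if (depth == 0)
    return 0;
  for (unsigned int p = 0; p < NSIMPLE; p++)
    {
      unsigned char next[N];
      for (int i = 0; i < N; i++)
	next[i] = cur[simple_perms[p][i]];
      if (search (next, target, depth - 1))
	return 1;
    }
  return 0;
}

int
main (void)
{
  const unsigned char identity[N] = { 0, 1, 2, 3 };
  const unsigned char target[N] = { 3, 2, 1, 0 };	/* full reverse */
  printf ("reachable in <= %d steps: %s\n", MAX_DEPTH,
	  search (identity, target, MAX_DEPTH) ? "yes" : "no");
  return 0;
}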

brgds, H-P


Re: combine permutations in gimple

2012-08-15 Thread Marc Glisse

On Mon, 13 Aug 2012, Marc Glisse wrote:

I'll give it a few more days for the conversation to settle, so I know what I 
should do between:

- the barely modified patch you accepted,
- the check asked by Jakub,
- the restriction to identity that prevents any regression (well...),
- something else?


I didn't realize the conversation would go quiet immediately...

Is someone willing to take the responsibility of telling me which approach 
is right? I can add Jakub's checks (though I am not sure how I would test 
them), but I would rather not do it if the conclusion is that they are 
either unnecessary (original patch is ok) or insufficient (don't avoid 
Ramana's regressions). I can do the very safe option 3 (only combine 
permutations when the result is the identity permutation), but I first 
want to make sure the more general thing is bad.


(sorry, it's my first patch that gets conflicting answers...)

--
Marc Glisse


Re: combine permutations in gimple

2012-08-15 Thread Richard Guenther
On Wed, Aug 15, 2012 at 1:22 PM, Marc Glisse marc.gli...@inria.fr wrote:
 On Mon, 13 Aug 2012, Marc Glisse wrote:

 I'll give it a few more days for the conversation to settle, so I know
 what I should do between:
 - the barely modified patch you accepted,
 - the check asked by Jakub,
 - the restriction to identity that prevents any regression (well...),
 - something else?


 I didn't realize the conversation would go quiet immediately...

 Is someone willing to take the responsibility of telling me which approach
 is right? I can add Jakub's checks (though I am not sure how I would test
 them), but I would rather not do it if the conclusion is that they are
 either unnecessary (original patch is ok) or insufficient (don't avoid
 Ramana's regressions). I can do the very safe option 3 (only combine
 permutations when the result is the identity permutation), but I first want
 to make sure the more general thing is bad.

 (sorry, it's my first patch that gets conflicting answers...)

Well, we're waiting for someone to break the tie ... I'd go with the original
patch, improving the backends where necessary.

Richard.

 --
 Marc Glisse


Re: combine permutations in gimple

2012-08-15 Thread Ramana Radhakrishnan
[It looks like I missed hitting the send button on this response]


 Seems to be one instruction shorter at least ;-) Yes, there can be much
 worse regressions than that because of the patch (like 40 instructions
 instead of 4, in the x86 backend).

If this is replacing 4 instructions with 40 in x86 backend maybe
someone will notice :)

Not a win in this particular testcase, because the compiler replaces 2
constant permutes (that's about 4 cycles) with a load from the
constant pool and a generic permute, and in addition pollutes the
icache with guff in the constant pool.  If you get to combining 3-4 permutes
into a single one then it might be a win, but not till that point.


 with a-b without first asking the backend whether it might be more
 efficient. One permutation is better than 2.
  It just happens that the range
 of possible permutations is too large (and the basic instructions are too
 strange) for backends to do a good job on them, and thus keeping toplevel
 input as a hint is important.

Of-course, the problem here is this change of semantics with the hook
TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes
with constants in the *absence* of being able to deal with them with
the specialized permutes.  fwprop will now leave us at a point where
each target has to now grow more knowledge with respect to how best to
expand a generic constant permute with a sequence of permutes rather
than just using the generic permute.

Generating a sequence of permutes from a single constant permute will
be a  harder problem than (say) dealing with immediate expansions so
you are pushing more complexity into the target but in the short term
target maintainers should definitely have a heads up that folks using
vector permute intrinsics could actually have performance regressions
on their targets.

Thanks,
Ramana


Re: combine permutations in gimple

2012-08-15 Thread Richard Guenther
On Wed, Aug 15, 2012 at 1:56 PM, Ramana Radhakrishnan
ramana.radhakrish...@linaro.org wrote:
 [It looks like I missed hitting the send button on this response]


 Seems to be one instruction shorter at least ;-) Yes, there can be much
 worse regressions than that because of the patch (like 40 instructions
 instead of 4, in the x86 backend).

 If this is replacing 4 instructions with 40 in x86 backend maybe
 someone will notice :)

 Not a win in this particular testcase  because the compiler replaces 2
 constant permutes ( that's about 4 cycles) with a load from the
 constant pool , a generic permute and in addition are polluting the
 icache with guff in the constant pool .  If you go to 3 -4 permutes
 into a single one then it might be a win but not till that point.


 with a-b without first asking the backend whether it might be more
 efficient. One permutation is better than 2.
  It just happens that the range
 of possible permutations is too large (and the basic instructions are too
 strange) for backends to do a good job on them, and thus keeping toplevel
 input as a hint is important.

 Of-course, the problem here is this change of semantics with the hook
 TARGET_VEC_PERM_CONST_OK. Targets were expanding to generic permutes
 with constants in the *absence* of being able to deal with them with
 the specialized permutes.  fwprop will now leave us at a point where
 each target has to now grow more knowledge with respect to how best to
 expand a generic constant permute with a sequence of permutes rather
 than just using the generic permute.

 Generating a sequence of permutes from a single constant permute will
 be a  harder problem than (say) dealing with immediate expansions so
 you are pushing more complexity into the target but in the short term
 target maintainers should definitely have a heads up that folks using
 vector permute intrinsics could actually have performance regressions
 on their targets.

It's of course the same with user input containing such a non-optimally
handled constant permute.  So I'm less convinced that it's too much burden
on the target side.  OTOH, if there is a generic kind of shuffle that targets
do not implement directly but can simulate with two that are directly
implemented, pushing the logic to the expander (and adjusting the target
hook semantics) would be ok.

Richard.

 Thanks,
 Ramana


Re: combine permutations in gimple

2012-08-15 Thread Jakub Jelinek
On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote:
 Well, we're waiting for someone to break the tie ... I'd go with the original
 patch, improving the backends where necessary.

E.g. i?86/x86_64 with just plain -msse2 has only very small subset of
constant shuffles (and no variable shuffle), so by doing the transformation
you could end up with a scalarized shuffle instead of two constant vector
shuffles.  Expecting the backend to figure it out and doing the two constant
vector shuffles in every case is not realistic, i386/x86_64 has way too many
different shuffling instructions (and worse different ISA levels have
different subsets of them) and while a lot of effort has been spent on
it already by Richard, me, Marc and others, we are nowhere close to having
optimal sequence in many cases for various modes and ISA levels.  Doing a
brute force might be too expensive.

Jakub


Re: combine permutations in gimple

2012-08-15 Thread Richard Guenther
On Wed, Aug 15, 2012 at 2:07 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote:
 Well, we're waiting for someone to break the tie ... I'd go with the original
 patch, improving the backends where necessary.

 E.g. i?86/x86_64 with just plain -msse2 has only very small subset of
 constant shuffles (and no variable shuffle), so by doing the transformation
 you could end up with a scalarized shuffle instead of two constant vector
 shuffles.  Expecting the backend to figure it out and doing the two constant
 vector shuffles in every case is not realistic, i386/x86_64 has way too many
 different shuffling instructions (and worse different ISA levels have
 different subsets of them) and while a lot of effort has been spent on
 it already by Richard, me, Marc and others, we are nowhere close to having
 optimal sequence in many cases for various modes and ISA levels.  Doing a
 brute force might be too expensive.

Ok.  That would still leave us with the issue Ramana brought up - the
target hook returning true unconditionally if a generic permute is implemented.
We just avoid generic expansion by tree-vect-generic.c that way.

(I wonder if there exists some automated machinery that finds a path using
existing instructions to shuffle from {0, 1, ..., n } to the mask and if that
would be a viable way to handle shuffle expansions, or pre-compute them all)

Richard.

 Jakub


Re: combine permutations in gimple

2012-08-15 Thread Ramana Radhakrishnan
On 15 August 2012 13:07, Jakub Jelinek ja...@redhat.com wrote:
 On Wed, Aug 15, 2012 at 01:46:05PM +0200, Richard Guenther wrote:
 Well, we're waiting for someone to break the tie ... I'd go with the original
 patch, improving the backends where necessary.

 E.g. i?86/x86_64 with just plain -msse2 has only very small subset of
 constant shuffles (and no variable shuffle), so by doing the transformation
 you could end up with a scalarized shuffle instead of two constant vector
 shuffles.  Expecting the backend to figure it out and doing the two constant
 vector shuffles in every case is not realistic, i386/x86_64 has way too many
 different shuffling instructions (and worse different ISA levels have
 different subsets of them) and while a lot of effort has been spent on
 it already by Richard, me, Marc and others, we are nowhere close to having
 optimal sequence in many cases for various modes and ISA levels.  Doing a
 brute force might be too expensive.


Neon has at least 7-8 such forms. Brute-force computing this would be expensive.

I don't know what happens on other vector targets - surely other SIMD
targets in GCC will also have the same problem.

Ramana

 Jakub


Re: combine permutations in gimple

2012-08-15 Thread Jakub Jelinek
On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote:
 Ok.  That would still leave us with the issue Ramana brought up - the
 target hook returning true unconditionally if a generic permute is 
 implemented.
 We just avoid generic expansion by tree-vect-generic.c that way.

Yeah, if there is a generic permute, can_vec_perm_p will return true always
for the modes for which it is available.  Which is why I've been suggesting
also the target hook which should return false if only generic permute is
going to be used.

Jakub


Re: combine permutations in gimple

2012-08-15 Thread Richard Guenther
On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote:
 Ok.  That would still leave us with the issue Ramana brought up - the
 target hook returning true unconditionally if a generic permute is 
 implemented.
 We just avoid generic expansion by tree-vect-generic.c that way.

 Yeah, if there is a generic permute, can_vec_perm_p will return true always
 for the modes for which it is available.  Which is why I've been suggesting
 also the target hook which should return false if only generic permute is
 going to be used.

Well.  What about returning a cost instead?  We don't want to transform
two insns to four either.

Richard.

 Jakub


Re: combine permutations in gimple

2012-08-15 Thread Richard Guenther
On Wed, Aug 15, 2012 at 2:53 PM, Jakub Jelinek ja...@redhat.com wrote:
 On Wed, Aug 15, 2012 at 02:36:54PM +0200, Richard Guenther wrote:
 On Wed, Aug 15, 2012 at 2:29 PM, Jakub Jelinek ja...@redhat.com wrote:
  On Wed, Aug 15, 2012 at 02:15:03PM +0200, Richard Guenther wrote:
  Ok.  That would still leave us with the issue Ramana brought up - the
  target hook returning true unconditionally if a generic permute is 
  implemented.
  We just avoid generic expansion by tree-vect-generic.c that way.
 
  Yeah, if there is a generic permute, can_vec_perm_p will return true always
  for the modes for which it is available.  Which is why I've been suggesting
  also the target hook which should return false if only generic permute is
  going to be used.

 Well.  What about returning a cost instead?  We don't want to transform
 two insns to four either.

 That could work, it would just be a lot of work to change it (and more
 costly).  E.g. on i386 for 16-byte vectors with certain ISAs we return true
 right away, when returning cost we'd need to go the slow path always
 even when the caller is just interested in a boolean flag whether it is
 possible or not.

 CCing rth as the author of the hooks.

We'd want more accurate costs for the vectorizer cost model anyways.

Richard.

 Jakub


Re: combine permutations in gimple

2012-08-13 Thread Richard Guenther
On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote:
 On Fri, 10 Aug 2012, Marc Glisse wrote:

 this patch detects permutations of permutations and merges them. It also
 canonicalizes permutations a bit more.

 There are several issues with this patch:

 1) I am not sure we always want to combine permutations. Indeed, someone
 (user? vectorizer?) may have written 2 permutations to help the backend
 generate optimal code, whereas it will do a bad job on the complicated
 combined permutation. At tree level, I am not sure what can be done except
 possibly restrict the optimization to the case where the combined
 permutation is the identity. At the RTL level, we could compare what insns
 are generated, but that's going to be painful (doing anything with
 permutations at the RTL level is painful).

 2) I don't understand how the cleanup works in tree-ssa-forwprop.c. I
 copied some fold/update/remove lines from other simplifications, but I don't
 know if they are right.

 3) In a first version of the patch, where I had SSA_NAME_DEF_STMT instead
 of get_prop_source_stmt, I got an ICE in one of the torture vectorization
 testcases. It happened in expand_expr_real_2, because it asserts that
 expand_vec_perm will never fail. However, expand_vec_perm does return
 NULL_RTX sometimes. Here it was in V16QImode, with the permutation
 {0,0,2,2,4,...,14,14}. maybe_expand_insn can't handle it directly (that's ok
 I guess), but then expand_vec_perm sees VOIDmode and exits instead of trying
 other ways. I don't know if there is a latent bug or if (more likely) my
 patch may produce trees with wrong modes.

 4) Is this the right place?

 This isn't the transformation I am most interested in, but it is a first
 step to see if the direction is right.


 Hello,

 there was yet another issue with the version I posted: the testcase was
 trivial so I didn't notice that it didn't perform the optimization at all...

 Here is a new version. It seems a bit strange to me that there are plenty of
 functions that check for single-use variables, but there isn't any that
 checks for variables used in a single statement (but possibly twice). So I
 may be doing something strange, but it seems to be the natural test here.

 Happily passed bootstrap+regtest.

 2012-08-11  Marc Glisse  marc.gli...@inria.fr


 gcc/
 * fold-const.c (fold_ternary_loc): Detect identity permutations.
 Canonicalize permutations more.
 * tree-ssa-forwprop.c (is_only_used_in): New function.
 (simplify_permutation): Likewise.

 (ssa_forward_propagate_and_combine): Call it.

 gcc/testsuite/
 * gcc.dg/tree-ssa/forwprop-19.c: New testcase.

 --
 Marc Glisse

 Index: gcc/tree-ssa-forwprop.c
 ===
 --- gcc/tree-ssa-forwprop.c (revision 190311)
 +++ gcc/tree-ssa-forwprop.c (working copy)
 @@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato
   gimple_assign_set_rhs_code (stmt, CONVERT_EXPR);
   update_stmt (stmt);
   return remove_prop_source_from_use (op0) ? 2 : 1;
 }
 }
  }

return 0;
  }

 +/* Return true if VAR has no nondebug use but in stmt.  */
 +static bool
 +is_only_used_in (const_tree var, const_gimple stmt)
 +{
 +  const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var));
 +  const ssa_use_operand_t *ptr = ptr0->next;
 +
 +  for (; ptr != ptr0; ptr = ptr->next)
 +{
 +  const_gimple use_stmt = USE_STMT (ptr);
 +  if (is_gimple_debug (use_stmt))
 +   continue;
 +  if (use_stmt != stmt)
 +   return false;
 +}
 +  return true;
 +}

if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt)

instead of the above.

 +/* Combine two shuffles in a row.  Returns 1 if there were any changes
 +   made, 2 if cfg-cleanup needs to run.  Else it returns 0.  */
 +
 +static int
 +simplify_permutation (gimple_stmt_iterator *gsi)
 +{
 +  gimple stmt = gsi_stmt (*gsi);
 +  gimple def_stmt;
 +  tree op0, op1, op2, op3, mask;
 +  enum tree_code code = gimple_assign_rhs_code (stmt);
 +  enum tree_code code2;
 +  location_t loc = gimple_location (stmt);
 +
 +  gcc_checking_assert (code == VEC_PERM_EXPR);
 +
 +  op0 = gimple_assign_rhs1 (stmt);
 +  op1 = gimple_assign_rhs2 (stmt);
 +  op2 = gimple_assign_rhs3 (stmt);
 +
 +  if (TREE_CODE (op0) != SSA_NAME)
 +return 0;
 +
 +  if (TREE_CODE (op2) != VECTOR_CST)
 +return 0;
 +
 +  def_stmt = SSA_NAME_DEF_STMT (op0);
 +  if (!def_stmt || !is_gimple_assign (def_stmt)
 +  || !can_propagate_from (def_stmt)
 +  || !is_only_used_in (op0, stmt))

Or rather than the above, simply check

   || !has_single_use (op0)

here.

 +return 0;
 +
 +  /* Check that it is only used here. We cannot use has_single_use
 + since the expression is using it twice itself...  */

Ah ... so then

   || num_imm_uses (op0) != 2

 +  code2 = gimple_assign_rhs_code (def_stmt);
 +
 +  /* 

Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Richard Guenther wrote:


On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote:

On Fri, 10 Aug 2012, Marc Glisse wrote:


1) I am not sure we always want to combine permutations. Indeed, someone
(user? vectorizer?) may have written 2 permutations to help the backend
generate optimal code, whereas it will do a bad job on the complicated
combined permutation. At tree level, I am not sure what can be done except
possibly restrict the optimization to the case where the combined
permutation is the identity. At the RTL level, we could compare what insns
are generated, but that's going to be painful (doing anything with
permutations at the RTL level is painful).


I guess people will complain soon enough if this causes horrible 
performance regressions in vectorized code.



+/* Return true if VAR has no nondebug use but in stmt.  */
+static bool
+is_only_used_in (const_tree var, const_gimple stmt)
+{
+  const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var));
+  const ssa_use_operand_t *ptr = ptr0->next;
+
+  for (; ptr != ptr0; ptr = ptr->next)
+{
+  const_gimple use_stmt = USE_STMT (ptr);
+  if (is_gimple_debug (use_stmt))
+   continue;
+  if (use_stmt != stmt)
+   return false;
+}
+  return true;
+}


if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt)

instead of the above.


I don't think that works with statements that use a variable twice.
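For example (plain C, just to show the shape of the problem): a
single-operand __builtin_shuffle becomes VEC_PERM_EXPR <t_1, t_1, mask> in
GIMPLE, so the intermediate SSA name has two immediate uses even though only
one statement uses it (SSA name numbers below are only illustrative):

typedef int v4si __attribute__ ((vector_size (16)));

v4si
f (v4si a)
{
  v4si m1 = { 1, 0, 3, 2 };
  v4si m2 = { 2, 3, 0, 1 };
  v4si t = __builtin_shuffle (a, m1);	/* roughly t_1 = VEC_PERM_EXPR <a_2, a_2, m1>  */
  return __builtin_shuffle (t, m2);	/* roughly _3 = VEC_PERM_EXPR <t_1, t_1, m2>   */
}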


+/* Combine two shuffles in a row.  Returns 1 if there were any changes
+   made, 2 if cfg-cleanup needs to run.  Else it returns 0.  */
+
+static int
+simplify_permutation (gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  gimple def_stmt;
+  tree op0, op1, op2, op3, mask;
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+  enum tree_code code2;
+  location_t loc = gimple_location (stmt);
+
+  gcc_checking_assert (code == VEC_PERM_EXPR);
+
+  op0 = gimple_assign_rhs1 (stmt);
+  op1 = gimple_assign_rhs2 (stmt);
+  op2 = gimple_assign_rhs3 (stmt);
+
+  if (TREE_CODE (op0) != SSA_NAME)
+return 0;
+
+  if (TREE_CODE (op2) != VECTOR_CST)
+return 0;
+
+  def_stmt = SSA_NAME_DEF_STMT (op0);
+  if (!def_stmt || !is_gimple_assign (def_stmt)
+  || !can_propagate_from (def_stmt)
+  || !is_only_used_in (op0, stmt))


Or rather than the above, simply check

  || !has_single_use (op0)

here.


Then there was my previous (non-working) patch that used 
get_prop_source_stmt.



+return 0;
+
+  /* Check that it is only used here. We cannot use has_single_use
+ since the expression is using it twice itself...  */


Ah ... so then

  || num_imm_uses (op0) != 2


Ah, ok, that's simpler indeed, but there were such dire warnings to never 
use that evil function unless absolutely necessary that I didn't dare use 
it... Thanks for the permission.



+  code2 = gimple_assign_rhs_code (def_stmt);
+
+  /* Two consecutive shuffles.  */
+  if (code2 == VEC_PERM_EXPR)
+{
+  op3 = gimple_assign_rhs3 (def_stmt);
+  if (TREE_CODE (op3) != VECTOR_CST)
+   return 0;
+  mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3),
+ op3, op3, op2);
+  gcc_assert (TREE_CODE (mask) == VECTOR_CST);


which means you can use

 mask = fold_ternary (loc, ...);

Ok with that changes.


Thanks a lot, I'll do that.
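
(As a side note, the reason folding the masks with VEC_PERM_EXPR is valid is
simply that composing two shuffles composes their selectors; a standalone
sanity check, plain C and nothing GCC-specific:)

#include <stdio.h>

/* b = shuffle (a, m1); c = shuffle (b, m2);  is the same as
   c = shuffle (a, m3)  with  m3[i] = m1[m2[i]], i.e. m3 is itself m1
   shuffled by m2 -- which is what the fold on the mask operands computes.  */
int
main (void)
{
  enum { N = 4 };
  int a[N] = { 10, 11, 12, 13 };
  int m1[N] = { 3, 2, 1, 0 };		/* inner shuffle */
  int m2[N] = { 2, 0, 3, 1 };		/* outer shuffle */
  int b[N], c[N], m3[N], d[N];

  for (int i = 0; i < N; i++) b[i] = a[m1[i]];
  for (int i = 0; i < N; i++) c[i] = b[m2[i]];
  for (int i = 0; i < N; i++) m3[i] = m1[m2[i]];	/* composed mask */
  for (int i = 0; i < N; i++) d[i] = a[m3[i]];

  for (int i = 0; i < N; i++)
    printf ("%d %d %s\n", c[i], d[i], c[i] == d[i] ? "ok" : "MISMATCH");
  return 0;
}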

Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or 
VEC_PERM_EXPR(CONSTRUCTOR). Is there a good way to determine whether this 
kind of combination should be done forwards or backwards (i.e. start from 
VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start 
from BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used 
here)?


--
Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Richard Guenther
On Mon, Aug 13, 2012 at 3:03 PM, Marc Glisse marc.gli...@inria.fr wrote:
 On Mon, 13 Aug 2012, Richard Guenther wrote:

 On Sat, Aug 11, 2012 at 2:25 PM, Marc Glisse marc.gli...@inria.fr wrote:

 On Fri, 10 Aug 2012, Marc Glisse wrote:

 1) I am not sure we always want to combine permutations. Indeed, someone
 (user? vectorizer?) may have written 2 permutations to help the backend
 generate optimal code, whereas it will do a bad job on the complicated
 combined permutation. At tree level, I am not sure what can be done
 except
 possibly restrict the optimization to the case where the combined
 permutation is the identity. At the RTL level, we could compare what
 insns
 are generated, but that's going to be painful (doing anything with
 permutations at the RTL level is painful).


 I guess people will complain soon enough if this causes horrible performance
 regressions in vectorized code.


 +/* Return true if VAR has no nondebug use but in stmt.  */
 +static bool
 +is_only_used_in (const_tree var, const_gimple stmt)
 +{
 +  const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var));
 +  const ssa_use_operand_t *ptr = ptr0->next;
 +
 +  for (; ptr != ptr0; ptr = ptr->next)
 +{
 +  const_gimple use_stmt = USE_STMT (ptr);
 +  if (is_gimple_debug (use_stmt))
 +   continue;
 +  if (use_stmt != stmt)
 +   return false;
 +}
 +  return true;
 +}


 if (!single_imm_use (var, &use, &use_stmt) || use_stmt != stmt)

 instead of the above.


 I don't think that works with statements that use a variable twice.


 +/* Combine two shuffles in a row.  Returns 1 if there were any changes
 +   made, 2 if cfg-cleanup needs to run.  Else it returns 0.  */
 +
 +static int
 +simplify_permutation (gimple_stmt_iterator *gsi)
 +{
 +  gimple stmt = gsi_stmt (*gsi);
 +  gimple def_stmt;
 +  tree op0, op1, op2, op3, mask;
 +  enum tree_code code = gimple_assign_rhs_code (stmt);
 +  enum tree_code code2;
 +  location_t loc = gimple_location (stmt);
 +
 +  gcc_checking_assert (code == VEC_PERM_EXPR);
 +
 +  op0 = gimple_assign_rhs1 (stmt);
 +  op1 = gimple_assign_rhs2 (stmt);
 +  op2 = gimple_assign_rhs3 (stmt);
 +
 +  if (TREE_CODE (op0) != SSA_NAME)
 +return 0;
 +
 +  if (TREE_CODE (op2) != VECTOR_CST)
 +return 0;
 +
 +  def_stmt = SSA_NAME_DEF_STMT (op0);
 +  if (!def_stmt || !is_gimple_assign (def_stmt)
 +  || !can_propagate_from (def_stmt)
 +  || !is_only_used_in (op0, stmt))


 Or rather than the above, simply check

   || !has_single_use (op0)

 here.


 Then there was my previous (non-working) patch that used
 get_prop_source_stmt.


 +return 0;
 +
 +  /* Check that it is only used here. We cannot use has_single_use
 + since the expression is using it twice itself...  */


 Ah ... so then

   || num_imm_uses (op0) != 2


 Ah, ok, that's simpler indeed, but there were such dire warnings to never
 use that evil function unless absolutely necessary that I didn't dare use
 it... Thanks for the permission.

If your new predicate would match more places (can you do a quick search?)
then we want it in tree-flow-inline.h instead of in
tree-ssa-forwprop.c.  But yes,
num_imm_uses can be expensive.   For now just stick with the above.

 +  code2 = gimple_assign_rhs_code (def_stmt);
 +
 +  /* Two consecutive shuffles.  */
 +  if (code2 == VEC_PERM_EXPR)
 +{
 +  op3 = gimple_assign_rhs3 (def_stmt);
 +  if (TREE_CODE (op3) != VECTOR_CST)
 +   return 0;
 +  mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3),
 + op3, op3, op2);
 +  gcc_assert (TREE_CODE (mask) == VECTOR_CST);


 which means you can use

  mask = fold_ternary (loc, ...);

 Ok with those changes.


 Thanks a lot, I'll do that.

 Next step should be either BIT_FIELD_REF(VEC_PERM_EXPR) or
 VEC_PERM_EXPR(CONSTRUCTOR). Is there a good way to determine whether this
 kind of combination should be done forwards or backwards (i.e. start from
 VEC_PERM_EXPR and look if its single use is in BIT_FIELD_REF, or start from
 BIT_FIELD_REF and look if its argument is a VEC_PERM_EXPR only used here)?

The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see
if the def stmt is a VEC_PERM_EXPR.
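
(A toy example of the kind of source it would apply to -- extracting one
lane of a shuffle ideally folds to extracting the selected lane of the
original; nothing of this is implemented yet, it just shows the pattern:)

typedef int v4si __attribute__ ((vector_size (16)));

int
g (v4si a)
{
  v4si mask = { 3, 2, 1, 0 };
  v4si b = __builtin_shuffle (a, mask);	/* roughly b_1 = VEC_PERM_EXPR <a_2, a_2, mask>  */
  return b[1];				/* BIT_FIELD_REF of b_1; could fold to a[2]      */
}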

Richard.

 --
 Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Ramana Radhakrishnan

 I guess people will complain soon enough if this causes horrible performance
 regressions in vectorized code.

Not having looked at your patch in great detail, surely what we don't
want is a situation where 2 constant permutations are converted into
one generic permute. Based on a quick read of your patch I couldn't
work that out.  It might be that 2 constant permutes are cheaper than
a generic permute. Have you looked at any examples in that space?  I
surely wouldn't like to see a sequence of interleave / transpose
change into a generic permute operation on Neon, as that would be far
more expensive than this.  It surely needs more testing than just
this bit before going in, the reason being that this would likely take
more registers and indeed produce loads from a constant pool for the new
mask.

regards,
Ramana


Re: combine permutations in gimple

2012-08-13 Thread Richard Guenther
On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan
ramana.radhakrish...@linaro.org wrote:

 I guess people will complain soon enough if this causes horrible performance
 regressions in vectorized code.

 Not having looked at your patch in great detail,. surely what we don't
 want is a situation where 2 constant permutations are converted into
 one generic permute. Based on a quick read of your patch I couldn't
 work that out.  It might be that 2 constant  permutes are cheaper than
 a generic permute. Have you looked at any examples in that space . I
 surely wouldn't like to see a sequence of interleave / transpose
 change into a generic permute operation on Neon as that would be far
 more expensive than this.  It surely needs more testting than just
 this bit before going in. The reason being that this would likely take
 more registers and indeed produce loads of a constant pool for the new
 mask.

The patch does not do that.  It merely assumes that the target knows
how to perform an optimal constant permute and that two constant
permutes never generate better code than a single one.

Richard.

 regards,
 Ramana


Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Richard Guenther wrote:


On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan
ramana.radhakrish...@linaro.org wrote:


I guess people will complain soon enough if this causes horrible performance
regressions in vectorized code.


Not having looked at your patch in great detail,. surely what we don't
want is a situation where 2 constant permutations are converted into
one generic permute. Based on a quick read of your patch I couldn't
work that out.  It might be that 2 constant  permutes are cheaper than
a generic permute. Have you looked at any examples in that space . I
surely wouldn't like to see a sequence of interleave / transpose
change into a generic permute operation on Neon as that would be far
more expensive than this.  It surely needs more testting than just
this bit before going in. The reason being that this would likely take
more registers and indeed produce loads of a constant pool for the new
mask.


What do you call constant / non-constant? The combined permutation is 
still constant, although the expansion (in the back-end) might fail to 
expand it efficiently and fall back to the generic permutation 
expansion...



The patch does not do that.  It merely assumes that the target knows
how to perform an optimal constant permute and that two constant
permutes never generate better code than a single one.


Which, to be honest, is false on all platforms I know, although I did 
contribute some minor enhancements for x86.


--
Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Jakub Jelinek
On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote:
 On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:
 
  I guess people will complain soon enough if this causes horrible 
  performance
  regressions in vectorized code.
 
  Not having looked at your patch in great detail,. surely what we don't
  want is a situation where 2 constant permutations are converted into
  one generic permute. Based on a quick read of your patch I couldn't
  work that out.  It might be that 2 constant  permutes are cheaper than
  a generic permute. Have you looked at any examples in that space . I
  surely wouldn't like to see a sequence of interleave / transpose
  change into a generic permute operation on Neon as that would be far
  more expensive than this.  It surely needs more testting than just
  this bit before going in. The reason being that this would likely take
  more registers and indeed produce loads of a constant pool for the new
  mask.
 
 The patch does not do that.  It merely assumes that the target knows
 how to perform an optimal constant permute and that two constant
 permutes never generate better code than a single one.

Still, the patch should do some tests whether it is beneficial.
At least a can_vec_perm_p (mode, false, sel) test of the resulting
permutation if both the original permutations pass that test, and perhaps
additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and
passes for both the original permutations then it should also pass
for the resulting permutation.
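
Sketched as standalone C (target_supports_perm is just a stand-in for the
can_vec_perm_p / vec_perm_const_ok queries, not real GCC code), the check
would be something like:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define N 4

/* Stand-in for asking the target; hypothetical -- in the compiler this
   would be can_vec_perm_p / targetm.vectorize.vec_perm_const_ok.  */
static bool
target_supports_perm (const unsigned char sel[N])
{
  static const unsigned char ok[][N] = { { 3, 2, 1, 0 }, { 2, 3, 0, 1 } };
  for (unsigned int i = 0; i < sizeof ok / sizeof ok[0]; i++)
    if (memcmp (sel, ok[i], N) == 0)
      return true;
  return false;
}

/* Combine SEL1 (inner) and SEL2 (outer) into SEL3, but only report the
   combination as worthwhile when the combined mask is supported at least
   as directly as both originals.  */
static bool
combine_masks_ok (const unsigned char sel1[N], const unsigned char sel2[N],
		  unsigned char sel3[N])
{
  for (int i = 0; i < N; i++)
    sel3[i] = sel1[sel2[i]];
  if (target_supports_perm (sel1) && target_supports_perm (sel2)
      && !target_supports_perm (sel3))
    return false;
  return true;
}

int
main (void)
{
  unsigned char rev[N] = { 3, 2, 1, 0 }, swap[N] = { 2, 3, 0, 1 }, out[N];
  printf ("combine ok: %d\n", combine_masks_ok (rev, swap, out));
  return 0;
}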

Jakub


Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Richard Guenther wrote:


+  /* Check that it is only used here. We cannot use has_single_use
+ since the expression is using it twice itself...  */


Ah ... so then

  || num_imm_uses (op0) != 2


Ah, ok, that's simpler indeed, but there were such dire warnings to never
use that evil function unless absolutely necessary that I didn't dare use
it... Thanks for the permission.


If your new predicate would match more places (can you do a quick search?)


You mean: if there are more optimizations that either already check for 
double use in the same statement, or could benefit from doing so? I'll 
take a look.



then we want it in tree-flow-inline.h instead of in
tree-ssa-forwprop.c.  But yes,
num_imm_uses can be expensive.   For now just stick with the above.


I assume the above means num_imm_uses (op0) != 2, since both versions 
are above ;-)



The natural SSA (and forwprop) way is to look for BIT_FIELD_REF and see
if the def stmt is a VEC_PERM_EXPR.


Thanks again,

--
Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Jakub Jelinek wrote:


On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote:

The patch does not do that.  It merely assumes that the target knows
how to perform an optimal constant permute and that two constant
permutes never generate better code than a single one.


Still, the patch should do some tests whether it is beneficial.
At least a can_vec_perm_p (mode, false, sel) test of the resulting
permutation if both the original permutations pass that test,


Sounds good. The last argument should be the constant permutation vector, 
presented as an array of indices stored in unsigned char? I hadn't 
realized we already had access to backend information that early in the 
compilation. It doesn't give the cost though.



and perhaps
additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and
passes for both the original permutations then it should also pass
for the resulting permutation.


Isn't that implied by the previous test (I see calls to vec_perm_const_ok 
in there)? Or maybe not quite?


--
Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Jakub Jelinek
On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote:
 On Mon, 13 Aug 2012, Jakub Jelinek wrote:
 
 On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote:
 The patch does not do that.  It merely assumes that the target knows
 how to perform an optimal constant permute and that two constant
 permutes never generate better code than a single one.
 
 Still, the patch should do some tests whether it is beneficial.
 At least a can_vec_perm_p (mode, false, sel) test of the resulting
 permutation if both the original permutations pass that test,
 
 Sounds good. The last argument should be the constant permutation
 vector, presented as an array of indices stored in unsigned char? I
 hadn't realized we already had access to backend information that
 early in the compilation. It doesn't give the cost though.

Yeah, it doesn't give the cost, just whether it is supported at all.

 and perhaps
 additionally if targetm.vectorize.vec_perm_const_ok is non-NULL and
 passes for both the original permutations then it should also pass
 for the resulting permutation.
 
 Isn't that implied by the previous test (I see calls to
 vec_perm_const_ok in there)? Or maybe not quite?

can_vec_perm_p can also return true if e.g. the target supports generic
variable permutation for the mode in question.
So the targetm.vectorize.vec_perm_const_ok check is stricter than that,
it tells you whether it can be supported by some constant permutation (or a
sequence of them, you know how the i386 code for that is complicated).

Jakub


Re: combine permutations in gimple

2012-08-13 Thread Ramana Radhakrishnan
On 13 August 2012 14:21, Marc Glisse marc.gli...@inria.fr wrote:
 On Mon, 13 Aug 2012, Richard Guenther wrote:

 On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan
 ramana.radhakrish...@linaro.org wrote:


 I guess people will complain soon enough if this causes horrible
 performance
 regressions in vectorized code.


 Not having looked at your patch in great detail,. surely what we don't
 want is a situation where 2 constant permutations are converted into
 one generic permute. Based on a quick read of your patch I couldn't
 work that out.  It might be that 2 constant  permutes are cheaper than
 a generic permute. Have you looked at any examples in that space . I
 surely wouldn't like to see a sequence of interleave / transpose
 change into a generic permute operation on Neon as that would be far
 more expensive than this.  It surely needs more testting than just
 this bit before going in. The reason being that this would likely take
 more registers and indeed produce loads of a constant pool for the new
 mask.


 What do you call constant / non-constant? The combined permutation is still
 constant, although the expansion (in the back-end) might fail to expand it
 efficiently and fall back to the generic permutation expansion...


That is exactly what I was worried about. By constant I would expect
something that is expanded as a constant permute by the backend - an
interleave operation or a transpose operation, or indeed some of the
funky operations as below in ARM / Neon, which is the SIMD architecture
I'm most familiar with.


If you had something that did the following :


uint8x8_t
tst_vrev642_u8 (uint8x8_t __a)
{
  uint8x8_t __rv;
  uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0};
  uint8x8_t __mask2 =  { 0, 8, 2, 10, 4, 12, 6, 14 };
  return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ;
}



I would expect these instructions

vrev64.8  d0, d0
vmov      d16, d0  @ v8qi
vtrn.8  d0, d16
bx  lr

With your patch I see

tst_vrev642_u8:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
fldd    d16, .L2
vtbl.8  d0, {d0}, d16
bx  lr
.L3:
.align  3
.L2:
.byte   7
.byte   7
.byte   5
.byte   5
.byte   3
.byte   3
.byte   1
.byte   1

It might be that the backend predicates need tightening in this case,
but why not try to get cost info about such combinations rather than
just doing this gratuitously?  While in this case the ARM port might
be wrong, in general when the vector permute rewrites were done
we chose to go ahead and keep the generic constant permutes in the
backend as the last resort to fall back to. However, if fwprop starts
making such transformations, one really ought to get relative costs for
each of these operations rather than allowing gratuitous replacements.



Thanks,
Ramana


Re: combine permutations in gimple

2012-08-13 Thread Ramana Radhakrishnan
On 13 August 2012 14:54, Jakub Jelinek ja...@redhat.com wrote:
 On Mon, Aug 13, 2012 at 03:45:00PM +0200, Marc Glisse wrote:
 On Mon, 13 Aug 2012, Jakub Jelinek wrote:

 On Mon, Aug 13, 2012 at 03:13:26PM +0200, Richard Guenther wrote:
 The patch does not do that.  It merely assumes that the target knows
 how to perform an optimal constant permute and that two constant
 permutes never generate better code than a single one.
 
 Still, the patch should do some tests whether it is beneficial.
 At least a can_vec_perm_p (mode, false, sel) test of the resulting
 permutation if both the original permutations pass that test,

 Sounds good. The last argument should be the constant permutation
 vector, presented as an array of indices stored in unsigned char? I
 hadn't realized we already had access to backend information that
 early in the compilation. It doesn't give the cost though.

 Yeah, it doesn't give the cost, just whether it is supported at all.


Maybe we need an interface of that form. A generic permute operation
used for constant permute operations is going to be more expensive
than more specialized constant permute operations. It might be that
this cost gets amortized over a large number of operations at which
point it makes sense but the compiler should make this transformation
based on cost rather than just whether something is supported or not.

Ramana


Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Ramana Radhakrishnan wrote:


On 13 August 2012 14:21, Marc Glisse marc.gli...@inria.fr wrote:

On Mon, 13 Aug 2012, Richard Guenther wrote:


On Mon, Aug 13, 2012 at 3:12 PM, Ramana Radhakrishnan
ramana.radhakrish...@linaro.org wrote:



I guess people will complain soon enough if this causes horrible
performance
regressions in vectorized code.



Not having looked at your patch in great detail,. surely what we don't
want is a situation where 2 constant permutations are converted into
one generic permute. Based on a quick read of your patch I couldn't
work that out.  It might be that 2 constant  permutes are cheaper than
a generic permute. Have you looked at any examples in that space . I
surely wouldn't like to see a sequence of interleave / transpose
change into a generic permute operation on Neon as that would be far
more expensive than this.  It surely needs more testting than just
this bit before going in. The reason being that this would likely take
more registers and indeed produce loads of a constant pool for the new
mask.



What do you call constant / non-constant? The combined permutation is still
constant, although the expansion (in the back-end) might fail to expand it
efficiently and fall back to the generic permutation expansion...



That is exactly what I was worried about. By constant I would expect
something that is expanded as  a constant permute by the backend - an
interleave operation or a transpose operation or indeed some of the
funky operations as below in ARM / Neon which is the SIMD architecture
I'm most familiar with .


If you had something that did the following :


uint8x8_t
tst_vrev642_u8 (uint8x8_t __a)
{
 uint8x8_t __rv;
 uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0};
 uint8x8_t __mask2 =  { 0, 8, 2, 10, 4, 12, 6, 14 };
 return __builtin_shuffle ( __builtin_shuffle ( __a, __mask1), __mask2) ;
}



I would expect these instructions

vrev64.8  d0, d0
vmov      d16, d0  @ v8qi
vtrn.8  d0, d16
bx  lr

With your patch I see

tst_vrev642_u8:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
fldd    d16, .L2
vtbl.8  d0, {d0}, d16
bx  lr
.L3:
.align  3
.L2:
.byte   7
.byte   7
.byte   5
.byte   5
.byte   3
.byte   3
.byte   1
.byte   1


Seems to be one instruction shorter at least ;-) Yes, there can be much 
worse regressions than that because of the patch (like 40 instructions 
instead of 4, in the x86 backend).



It might be that the backend predicates need tightening in this case
but  why not try to get cost info about such combinations rather than
just doing this gratuitously ?


I don't think the word gratuitous is appropriate. The middle-end replaces a + -b
with a - b without first asking the backend whether it might be more
efficient. One permutation is better than 2. It just happens that the
range of possible permutations is too large (and the basic instructions
are too strange) for backends to do a good job on them, and thus keeping
the toplevel input as a hint is important.



While in this case the ARM port might
be wrong , but in general when the vector permute rewrites were done
we chose to go ahead and keep the generic constant permutes in the
backend as the last resort to fall back to. However if fwprop starts
making such transformations really one ought to get relative costs for
each of these operations rather than allowing gratuitous replacements.


Indeed, having costs would help, but that's a lot of work.

As you can see from the original email, I am willing to limit the 
optimization to the case where the combined permutation is the identity 
(yes, that's quite drastic...). I should probably implement that special 
case anyway, because it doesn't require its argument to have a single use 
for the optimization to make sense.
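
For concreteness, the trivial shape that the restricted version would still
simplify (plain C; two shuffles whose composition is the identity, so the
whole expression can fold back to the input):

typedef int v4si __attribute__ ((vector_size (16)));

v4si
rev_rev (v4si a)
{
  v4si rev = { 3, 2, 1, 0 };
  /* Reversing twice composes to the identity, so this can fold to just `a',
     independently of how well the target handles either shuffle.  */
  return __builtin_shuffle (__builtin_shuffle (a, rev), rev);
}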


--
Marc Glisse


Re: combine permutations in gimple

2012-08-13 Thread Marc Glisse

On Mon, 13 Aug 2012, Marc Glisse wrote:


On Mon, 13 Aug 2012, Richard Guenther wrote:


If your new predicate would match more places (can you do a quick search?)


You mean: if there are more optimizations that either already check for 
double use in the same statement, or could benefit from doing so? I'll take a 
look.


I couldn't find anything obvious (not that I understood a significant 
percentage of what I saw...).


Note that the warnings in those comments are somewhat theoretical for now, since the only 
current use of num_imm_uses is in a debug printf call.



I'll give it a few more days for the conversation to settle, so I know 
what I should do between:

- the barely modified patch you accepted,
- the check asked by Jakub,
- the restriction to identity that prevents any regression (well...),
- something else?

--
Marc Glisse


Re: combine permutations in gimple

2012-08-11 Thread Marc Glisse

On Fri, 10 Aug 2012, Marc Glisse wrote:

this patch detects permutations of permutations and merges them. It also 
canonicalizes permutations a bit more.


There are several issues with this patch:

1) I am not sure we always want to combine permutations. Indeed, someone 
(user? vectorizer?) may have written 2 permutations to help the backend 
generate optimal code, whereas it will do a bad job on the complicated 
combined permutation. At tree level, I am not sure what can be done except 
possibly restrict the optimization to the case where the combined permutation 
is the identity. At the RTL level, we could compare what insns are generated, 
but that's going to be painful (doing anything with permutations at the RTL 
level is painful).


2) I don't understand how the cleanup works in tree-ssa-forwprop.c. I copied 
some fold/update/remove lines from other simplifications, but I don't know if 
they are right.


3) In a first version of the patch, where I had SSA_NAME_DEF_STMT instead of 
get_prop_source_stmt, I got an ICE in one of the torture vectorization 
testcases. It happened in expand_expr_real_2, because it asserts that 
expand_vec_perm will never fail. However, expand_vec_perm does return 
NULL_RTX sometimes. Here it was in V16QImode, with the permutation 
{0,0,2,2,4,...,14,14}. maybe_expand_insn can't handle it directly (that's ok 
I guess), but then expand_vec_perm sees VOIDmode and exits instead of trying 
other ways. I don't know if there is a latent bug or if (more likely) my 
patch may produce trees with wrong modes.


4) Is this the right place?

This isn't the transformation I am most interested in, but it is a first step 
to see if the direction is right.


Hello,

there was yet another issue with the version I posted: the testcase was 
trivial so I didn't notice that it didn't perform the optimization at 
all...


Here is a new version. It seems a bit strange to me that there are plenty 
of functions that check for single-use variables, but there isn't any that 
checks for variables used in a single statement (but possibly twice). So I 
may be doing something strange, but it seems to be the natural test here.


Happily passed bootstrap+regtest.

2012-08-11  Marc Glisse  marc.gli...@inria.fr

gcc/
* fold-const.c (fold_ternary_loc): Detect identity permutations.
Canonicalize permutations more.
* tree-ssa-forwprop.c (is_only_used_in): New function.
(simplify_permutation): Likewise.
(ssa_forward_propagate_and_combine): Call it.

gcc/testsuite/
* gcc.dg/tree-ssa/forwprop-19.c: New testcase.

--
Marc Glisse

Index: gcc/tree-ssa-forwprop.c
===
--- gcc/tree-ssa-forwprop.c (revision 190311)
+++ gcc/tree-ssa-forwprop.c (working copy)
@@ -2569,20 +2569,97 @@ combine_conversions (gimple_stmt_iterato
  gimple_assign_set_rhs_code (stmt, CONVERT_EXPR);
  update_stmt (stmt);
  return remove_prop_source_from_use (op0) ? 2 : 1;
}
}
 }
 
   return 0;
 }
 
+/* Return true if VAR has no nondebug use but in stmt.  */
+static bool
+is_only_used_in (const_tree var, const_gimple stmt)
+{
+  const ssa_use_operand_t *const ptr0 = &(SSA_NAME_IMM_USE_NODE (var));
+  const ssa_use_operand_t *ptr = ptr0->next;
+
+  for (; ptr != ptr0; ptr = ptr->next)
+{
+  const_gimple use_stmt = USE_STMT (ptr);
+  if (is_gimple_debug (use_stmt))
+   continue;
+  if (use_stmt != stmt)
+   return false;
+}
+  return true;
+}
+
+/* Combine two shuffles in a row.  Returns 1 if there were any changes
+   made, 2 if cfg-cleanup needs to run.  Else it returns 0.  */
+ 
+static int
+simplify_permutation (gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  gimple def_stmt;
+  tree op0, op1, op2, op3, mask;
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+  enum tree_code code2;
+  location_t loc = gimple_location (stmt);
+
+  gcc_checking_assert (code == VEC_PERM_EXPR);
+
+  op0 = gimple_assign_rhs1 (stmt);
+  op1 = gimple_assign_rhs2 (stmt);
+  op2 = gimple_assign_rhs3 (stmt);
+
+  if (TREE_CODE (op0) != SSA_NAME)
+return 0;
+
+  if (TREE_CODE (op2) != VECTOR_CST)
+return 0;
+
+  if (op0 != op1)
+return 0;
+
+  def_stmt = SSA_NAME_DEF_STMT (op0);
+  if (!def_stmt || !is_gimple_assign (def_stmt)
+  || !can_propagate_from (def_stmt)
+  || !is_only_used_in (op0, stmt))
+return 0;
+
+  /* Check that it is only used here. We cannot use has_single_use
+ since the expression is using it twice itself...  */
+
+  code2 = gimple_assign_rhs_code (def_stmt);
+
+  /* Two consecutive shuffles.  */
+  if (code2 == VEC_PERM_EXPR)
+{
+  op3 = gimple_assign_rhs3 (def_stmt);
+  if (TREE_CODE (op3) != VECTOR_CST)
+   return 0;
+  mask = fold_build3_loc (loc, VEC_PERM_EXPR, TREE_TYPE (op3),
+ op3, op3, op2);
+  gcc_assert (TREE_CODE (mask) == VECTOR_CST);
+

Re: combine permutations in gimple

2012-08-10 Thread Marc Glisse

On Fri, 10 Aug 2012, Marc Glisse wrote:


There are several issues with this patch:


Oups:

5) and the testcase needs fixing, just like Jakub just fixed Richard's 
testcase, to avoid ABI warnings. But that's easy.


--
Marc Glisse