Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Richard Guenther
On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:
 Following is a revision of yesterday's PR50031 patch submission,
 modified per Richard's comments.  Bootstrapped and tested with no
 regressions on powerpc64-linux.  I've confirmed the same performance
 improvements in SPEC.  OK for trunk?

Some more questions - maybe Jakub can clarify as well given he worked
on patterns recently ...

 Thanks,
 Bill


 2012-02-09  Bill Schmidt  wschm...@linux.vnet.ibm.com
            Ira Rosen  i...@il.ibm.com

        PR tree-optimization/50031
        * targhooks.c (default_builtin_vectorization_cost): Handle
        vec_promote_demote.
        * target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
        * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
        all types of reduction and pattern statements.
        (vect_estimate_min_profitable_iters): Likewise.
        * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function.
        (vect_get_load_cost): Use vec_perm for permutations; add dump logic
        for explicit realigns.
        (vectorizable_conversion): Call vect_model_promotion_demotion_cost.
        * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
        vec_promote_demote.
        * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
        * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
        vec_perm for VSX and handle vec_promote_demote.


 Index: gcc/targhooks.c
 ===
 --- gcc/targhooks.c     (revision 183944)
 +++ gcc/targhooks.c     (working copy)
 @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost
       case scalar_to_vec:
       case cond_branch_not_taken:
       case vec_perm:
 +      case vec_promote_demote:
         return 1;

       case unaligned_load:
 Index: gcc/target.h
 ===
 --- gcc/target.h        (revision 183944)
 +++ gcc/target.h        (working copy)
 @@ -145,7 +145,8 @@ enum vect_cost_for_stmt
   scalar_to_vec,
   cond_branch_not_taken,
   cond_branch_taken,
 -  vec_perm
 +  vec_perm,
 +  vec_promote_demote
  };

  /* The target structure.  This holds all the backend hooks.  */
 Index: gcc/tree-vect-loop.c
 ===
 --- gcc/tree-vect-loop.c        (revision 183944)
 +++ gcc/tree-vect-loop.c        (working copy)
 @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
           if (stmt_info
                !STMT_VINFO_RELEVANT_P (stmt_info)
                (!STMT_VINFO_LIVE_P (stmt_info)
 -                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
 +                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE 
 (stmt_info)))
 +              !STMT_VINFO_IN_PATTERN_P (stmt_info))
             continue;

Why would we exclude !relevant stmts when they are part of a pattern?
We are looking at a scalar iteration cost, so all stmts that are not dead
count, no?


           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
 @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 +
 +         /* Translate the last statement in a pattern into the
 +            related replacement statement.  */
 +         if (STMT_VINFO_IN_PATTERN_P (stmt_info))
 +           {
 +             stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 +             stmt_info = vinfo_for_stmt (stmt);
 +           }

So here we are tanslating stmt to the main scalar pattern stmt - and thus
count it as many times as we have stmts in that pattern?  That looks wrong.
More like

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
   STMT_VINFO_RELATED_STMT (stmt_info) != stmt)
continue;

?  Does the main stmt has the flag set and points to itself?

          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
               (!STMT_VINFO_LIVE_P (stmt_info)
 -                 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
 +                 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE 
 (stmt_info
            continue;
 +

... and then, what does STMT_VINFO_INSIDE_OF_LOOP_COST of
that main pattern stmt represent?  Shouldn't it represent the cost
of the whole sequence, and thus ...

          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * 
 factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the outside costs are generated inside the outer-loop.  
 */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
 +          if (is_pattern_stmt_p (stmt_info)
 +              STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))
 +            {
 +             

Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread William J. Schmidt
Richard, thanks.  I can answer most of your questions, but for the last
one I will have to ask Ira to weigh in.

On Fri, 2012-02-10 at 13:06 +0100, Richard Guenther wrote:
 On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  Following is a revision of yesterday's PR50031 patch submission,
  modified per Richard's comments.  Bootstrapped and tested with no
  regressions on powerpc64-linux.  I've confirmed the same performance
  improvements in SPEC.  OK for trunk?
 
 Some more questions - maybe Jakub can clarify as well given he worked
 on patterns recently ...
 
  Thanks,
  Bill
 
 
  2012-02-09  Bill Schmidt  wschm...@linux.vnet.ibm.com
 Ira Rosen  i...@il.ibm.com
 
 PR tree-optimization/50031
 * targhooks.c (default_builtin_vectorization_cost): Handle
 vec_promote_demote.
 * target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
 * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
 all types of reduction and pattern statements.
 (vect_estimate_min_profitable_iters): Likewise.
 * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New 
  function.
 (vect_get_load_cost): Use vec_perm for permutations; add dump logic
 for explicit realigns.
 (vectorizable_conversion): Call vect_model_promotion_demotion_cost.
 * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
 vec_promote_demote.
 * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
 * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
 vec_perm for VSX and handle vec_promote_demote.
 
 
  Index: gcc/targhooks.c
  ===
  --- gcc/targhooks.c (revision 183944)
  +++ gcc/targhooks.c (working copy)
  @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost
case scalar_to_vec:
case cond_branch_not_taken:
case vec_perm:
  +  case vec_promote_demote:
  return 1;
 
case unaligned_load:
  Index: gcc/target.h
  ===
  --- gcc/target.h(revision 183944)
  +++ gcc/target.h(working copy)
  @@ -145,7 +145,8 @@ enum vect_cost_for_stmt
scalar_to_vec,
cond_branch_not_taken,
cond_branch_taken,
  -  vec_perm
  +  vec_perm,
  +  vec_promote_demote
   };
 
   /* The target structure.  This holds all the backend hooks.  */
  Index: gcc/tree-vect-loop.c
  ===
  --- gcc/tree-vect-loop.c(revision 183944)
  +++ gcc/tree-vect-loop.c(working copy)
  @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
if (stmt_info
 !STMT_VINFO_RELEVANT_P (stmt_info)
 (!STMT_VINFO_LIVE_P (stmt_info)
  -  || STMT_VINFO_DEF_TYPE (stmt_info) != 
  vect_reduction_def))
  +  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE 
  (stmt_info)))
  +  !STMT_VINFO_IN_PATTERN_P (stmt_info))
  continue;
 
 Why would we exclude !relevant stmts when they are part of a pattern?
 We are looking at a scalar iteration cost, so all stmts that are not dead
 count, no?

As I understand it, we're at a point where a statement replacing the
pattern exists but has not yet been inserted in the code stream.  All of
the statements in the pattern are marked irrelevant, but the related
statement of the main (last) statement of the pattern is relevant.  Thus
we need to allow the main statement through this check so the
replacement statement can be found and counted.

 
 
if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
  @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info
 {
   gimple stmt = gsi_stmt (si);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  +
  + /* Translate the last statement in a pattern into the
  +related replacement statement.  */
  + if (STMT_VINFO_IN_PATTERN_P (stmt_info))
  +   {
  + stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  + stmt_info = vinfo_for_stmt (stmt);
  +   }
 
 So here we are tanslating stmt to the main scalar pattern stmt - and thus
 count it as many times as we have stmts in that pattern?  That looks wrong.
 More like
 
   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
STMT_VINFO_RELATED_STMT (stmt_info) != stmt)
 continue;
 
 ?  Does the main stmt has the flag set and points to itself?

From the commentary at the end of tree-vect-patterns.c, only the main
statement in the pattern (the last one) is flagged as
STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
statement which has been created but not yet inserted in the code.  It
only gets counted once.

 
   /* 

Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread William J. Schmidt


On Fri, 2012-02-10 at 07:31 -0600, William J. Schmidt wrote:
 Richard, thanks.  I can answer most of your questions, but for the last
 one I will have to ask Ira to weigh in.
 
 On Fri, 2012-02-10 at 13:06 +0100, Richard Guenther wrote:
  On Thu, Feb 9, 2012 at 5:56 PM, William J. Schmidt
  wschm...@linux.vnet.ibm.com wrote:
   Following is a revision of yesterday's PR50031 patch submission,
   modified per Richard's comments.  Bootstrapped and tested with no
   regressions on powerpc64-linux.  I've confirmed the same performance
   improvements in SPEC.  OK for trunk?
  
  Some more questions - maybe Jakub can clarify as well given he worked
  on patterns recently ...
  
   Thanks,
   Bill
  
  
   2012-02-09  Bill Schmidt  wschm...@linux.vnet.ibm.com
  Ira Rosen  i...@il.ibm.com
  
  PR tree-optimization/50031
  * targhooks.c (default_builtin_vectorization_cost): Handle
  vec_promote_demote.
  * target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
  * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
  all types of reduction and pattern statements.
  (vect_estimate_min_profitable_iters): Likewise.
  * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New 
   function.
  (vect_get_load_cost): Use vec_perm for permutations; add dump logic
  for explicit realigns.
  (vectorizable_conversion): Call vect_model_promotion_demotion_cost.
  * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
  vec_promote_demote.
  * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
  * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): 
   Update
  vec_perm for VSX and handle vec_promote_demote.
  
  
   Index: gcc/targhooks.c
   ===
   --- gcc/targhooks.c (revision 183944)
   +++ gcc/targhooks.c (working copy)
   @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost
 case scalar_to_vec:
 case cond_branch_not_taken:
 case vec_perm:
   +  case vec_promote_demote:
   return 1;
  
 case unaligned_load:
   Index: gcc/target.h
   ===
   --- gcc/target.h(revision 183944)
   +++ gcc/target.h(working copy)
   @@ -145,7 +145,8 @@ enum vect_cost_for_stmt
 scalar_to_vec,
 cond_branch_not_taken,
 cond_branch_taken,
   -  vec_perm
   +  vec_perm,
   +  vec_promote_demote
};
  
/* The target structure.  This holds all the backend hooks.  */
   Index: gcc/tree-vect-loop.c
   ===
   --- gcc/tree-vect-loop.c(revision 183944)
   +++ gcc/tree-vect-loop.c(working copy)
   @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
 if (stmt_info
  !STMT_VINFO_RELEVANT_P (stmt_info)
  (!STMT_VINFO_LIVE_P (stmt_info)
   -  || STMT_VINFO_DEF_TYPE (stmt_info) != 
   vect_reduction_def))
   +  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE 
   (stmt_info)))
   +  !STMT_VINFO_IN_PATTERN_P (stmt_info))
   continue;
  
  Why would we exclude !relevant stmts when they are part of a pattern?
  We are looking at a scalar iteration cost, so all stmts that are not dead
  count, no?
 
 As I understand it, we're at a point where a statement replacing the
 pattern exists but has not yet been inserted in the code stream.  All of
 the statements in the pattern are marked irrelevant, but the related
 statement of the main (last) statement of the pattern is relevant.  Thus
 we need to allow the main statement through this check so the
 replacement statement can be found and counted.
 
  
  
 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
   @@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info
  {
gimple stmt = gsi_stmt (si);
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   +
   + /* Translate the last statement in a pattern into the
   +related replacement statement.  */
   + if (STMT_VINFO_IN_PATTERN_P (stmt_info))
   +   {
   + stmt = STMT_VINFO_RELATED_STMT (stmt_info);
   + stmt_info = vinfo_for_stmt (stmt);
   +   }
  
  So here we are tanslating stmt to the main scalar pattern stmt - and thus
  count it as many times as we have stmts in that pattern?  That looks wrong.
  More like
  
if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 STMT_VINFO_RELATED_STMT (stmt_info) != stmt)
  continue;
  
  ?  Does the main stmt has the flag set and points to itself?
 
 From the commentary at the end of tree-vect-patterns.c, only the main
 statement in the pattern (the last one) is 

Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Jakub Jelinek
On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote:
 From the commentary at the end of tree-vect-patterns.c, only the main
 statement in the pattern (the last one) is flagged as
 STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
 statement which has been created but not yet inserted in the code.  It
 only gets counted once.

STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not
just the last, but all original stmts that are being replaced by the
pattern).  Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt
(last one corresponding to that original stmt).  If needed, further
pattern stmts for that original stmts are put into
STMT_VINFO_PATTERN_DEF_SEQ.  The pattern matcher could e.g. match
3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT
on the first original stmt, then say 2 pattern stmts into
STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one
STMT_VINFO_RELATED_STMT and finally one pattern stmt through
STMT_VINFO_RELATED_STMT on the third original stmt.

 Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so
 this section has to be omitted if we backport it (which is desirable
 since the degradation was introduced in 4.6).  Removing it apparently
 does not affect the sphinx3 degradation.

Yeah, when backporting you can basically assume that
STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead
of itt (and simplify), because no pattern recognizers needed more than one
pattern stmt for each original stmt back then.

Jakub


Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread William J. Schmidt
Jakub, thanks!  Based on this, I believe the patch is correct in its
handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double
counting.

I misinterpreted what the commentary for vect_pattern_recog was saying:
I thought that all replacements were hung off of the last pattern
statement, but this was just true for the particular example, where only
one replacement statement was created.  Sorry for any confusion!

Based on the discussion, is the patch OK for trunk?

Thanks,
Bill

On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote:
 On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote:
  From the commentary at the end of tree-vect-patterns.c, only the main
  statement in the pattern (the last one) is flagged as
  STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
  statement which has been created but not yet inserted in the code.  It
  only gets counted once.
 
 STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not
 just the last, but all original stmts that are being replaced by the
 pattern).  Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt
 (last one corresponding to that original stmt).  If needed, further
 pattern stmts for that original stmts are put into
 STMT_VINFO_PATTERN_DEF_SEQ.  The pattern matcher could e.g. match
 3 original stmts, and stick a single pattern stmt into STMT_VINFO_RELATED_STMT
 on the first original stmt, then say 2 pattern stmts into
 STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one
 STMT_VINFO_RELATED_STMT and finally one pattern stmt through
 STMT_VINFO_RELATED_STMT on the third original stmt.
 
  Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so
  this section has to be omitted if we backport it (which is desirable
  since the degradation was introduced in 4.6).  Removing it apparently
  does not affect the sphinx3 degradation.
 
 Yeah, when backporting you can basically assume that
 STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead
 of itt (and simplify), because no pattern recognizers needed more than one
 pattern stmt for each original stmt back then.
 
   Jakub
 



Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Richard Guenther
On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:
 Jakub, thanks!  Based on this, I believe the patch is correct in its
 handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double
 counting.

 I misinterpreted what the commentary for vect_pattern_recog was saying:
 I thought that all replacements were hung off of the last pattern
 statement, but this was just true for the particular example, where only
 one replacement statement was created.  Sorry for any confusion!

 Based on the discussion, is the patch OK for trunk?

But you still count for each of the matched scalar stmts the cost of the
whole pattern sequence.  No?

Richard.

 Thanks,
 Bill

 On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote:
 On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote:
  From the commentary at the end of tree-vect-patterns.c, only the main
  statement in the pattern (the last one) is flagged as
  STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
  statement which has been created but not yet inserted in the code.  It
  only gets counted once.

 STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not
 just the last, but all original stmts that are being replaced by the
 pattern).  Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt
 (last one corresponding to that original stmt).  If needed, further
 pattern stmts for that original stmts are put into
 STMT_VINFO_PATTERN_DEF_SEQ.  The pattern matcher could e.g. match
 3 original stmts, and stick a single pattern stmt into 
 STMT_VINFO_RELATED_STMT
 on the first original stmt, then say 2 pattern stmts into
 STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one
 STMT_VINFO_RELATED_STMT and finally one pattern stmt through
 STMT_VINFO_RELATED_STMT on the third original stmt.

  Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so
  this section has to be omitted if we backport it (which is desirable
  since the degradation was introduced in 4.6).  Removing it apparently
  does not affect the sphinx3 degradation.

 Yeah, when backporting you can basically assume that
 STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL instead
 of itt (and simplify), because no pattern recognizers needed more than one
 pattern stmt for each original stmt back then.

       Jakub




Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread William J. Schmidt


On Fri, 2012-02-10 at 16:22 +0100, Richard Guenther wrote:
 On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  Jakub, thanks!  Based on this, I believe the patch is correct in its
  handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double
  counting.
 
  I misinterpreted what the commentary for vect_pattern_recog was saying:
  I thought that all replacements were hung off of the last pattern
  statement, but this was just true for the particular example, where only
  one replacement statement was created.  Sorry for any confusion!
 
  Based on the discussion, is the patch OK for trunk?
 
 But you still count for each of the matched scalar stmts the cost of the
 whole pattern sequence.  No?

I need to change the commentary to make this more clear now.  My
comment:  Translate the last statement... is incorrect; this is done
for each statement in a pattern.

Per Jakub's explanation, the replacement statements are distributed over
the original pattern statements.  Visiting STMT_VINFO_RELATED_STMT for a
statement marked STMT_VINFO_IN_PATTERN_P will find zero or one
replacement statements that should be examined.  Visiting
STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement
statements that didn't fit in the 1-1 mapping.  The point is that all of
these related statements and pattern-def-seq statements are disjoint,
and Ira's logic is ensuring that they all get examined once.

It's not a very clean way to represent the replacement of a pattern --
not how you'd do it if designing from scratch -- but I guess I can see
how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto
the existing 1-1 mapping.

Bill

 
 Richard.
 
  Thanks,
  Bill
 
  On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote:
  On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote:
   From the commentary at the end of tree-vect-patterns.c, only the main
   statement in the pattern (the last one) is flagged as
   STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
   statement which has been created but not yet inserted in the code.  It
   only gets counted once.
 
  STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not
  just the last, but all original stmts that are being replaced by the
  pattern).  Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt
  (last one corresponding to that original stmt).  If needed, further
  pattern stmts for that original stmts are put into
  STMT_VINFO_PATTERN_DEF_SEQ.  The pattern matcher could e.g. match
  3 original stmts, and stick a single pattern stmt into 
  STMT_VINFO_RELATED_STMT
  on the first original stmt, then say 2 pattern stmts into
  STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one
  STMT_VINFO_RELATED_STMT and finally one pattern stmt through
  STMT_VINFO_RELATED_STMT on the third original stmt.
 
   Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so
   this section has to be omitted if we backport it (which is desirable
   since the degradation was introduced in 4.6).  Removing it apparently
   does not affect the sphinx3 degradation.
 
  Yeah, when backporting you can basically assume that
  STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL 
  instead
  of itt (and simplify), because no pattern recognizers needed more than one
  pattern stmt for each original stmt back then.
 
Jakub
 
 
 



Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Jakub Jelinek
On Fri, Feb 10, 2012 at 04:22:32PM +0100, Richard Guenther wrote:
 On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  Jakub, thanks!  Based on this, I believe the patch is correct in its
  handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double
  counting.
 
  I misinterpreted what the commentary for vect_pattern_recog was saying:
  I thought that all replacements were hung off of the last pattern
  statement, but this was just true for the particular example, where only
  one replacement statement was created.  Sorry for any confusion!
 
  Based on the discussion, is the patch OK for trunk?
 
 But you still count for each of the matched scalar stmts the cost of the
 whole pattern sequence.  No?

For each stmt you should count the original stmt if it is relevant
(I think there are cases where both the original stmt and pattern stmt for
it are both relevant and emitted, didn't understand it exactly though,
but usually only the orig or only the pattern stmts are relevant),
and if STMT_VINFO_IN_PATTERN_P, count the STMT_VINFO_RELATED_STMT (if any)
and all STMT_VINFO_PATTERN_DEF_SEQ sequence stmts (if any), again, not sure
if STMT_VINFO_RELEVANT needs to be tested for each or not.
I guess look at what we actually emit if the cost model says it should be
emitted.
Neither the STMT_VINFO_PATTERN_DEF_SEQ nor STMT_VINFO_RELATED_STMT
are shared in between any orig stmts, each has its own set of stmts.
I believe if some original stmt should map to no pattern stmts, it isn't
marked as STMT_VINFO_IN_PATTERN_P at all, but as nothing uses its lhs
with the pattern stmts in place, it won't be relevant and will be ignored.

Jakub


Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Jakub Jelinek
On Fri, Feb 10, 2012 at 09:36:01AM -0600, William J. Schmidt wrote:
 Per Jakub's explanation, the replacement statements are distributed over
 the original pattern statements.  Visiting STMT_VINFO_RELATED_STMT for a
 statement marked STMT_VINFO_IN_PATTERN_P will find zero or one
 replacement statements that should be examined.  Visiting
 STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement
 statements that didn't fit in the 1-1 mapping.  The point is that all of
 these related statements and pattern-def-seq statements are disjoint,
 and Ira's logic is ensuring that they all get examined once.
 
 It's not a very clean way to represent the replacement of a pattern --
 not how you'd do it if designing from scratch -- but I guess I can see
 how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto
 the existing 1-1 mapping.

Still in 4.6 we have just a 1-1 or 1-0 mapping (as I wrote in the previous
mail, I think those orig stmts that don't need any replacements, e.g.
if you have 2 orig stmts mapping to just one pattern stmt, are just marked
as not relevant).  Then 4.7 first added (Ira's changes)
STMT_VINFO_PATTERN_DEF_STMT, i.e. in addition to 1-1 and 1-0 there was
a possibility for 1-2 mapping.  And then I needed more than 2, but it was
too late in the game to do large changes, so we end up with 1-1+N scheme
by changing the DEF_STMT into DEF_SEQ. For 4.8 we definitely at least should
remove STMT_VINFO_RELATED_STMT as a way to represent pattern stmts,
even for 1-1 mapping the pattern stmt should go into a sequence.

Jakub


Re: [PATCH] Fix PR50031, take 2

2012-02-10 Thread Richard Guenther
On Fri, Feb 10, 2012 at 4:36 PM, William J. Schmidt
wschm...@linux.vnet.ibm.com wrote:


 On Fri, 2012-02-10 at 16:22 +0100, Richard Guenther wrote:
 On Fri, Feb 10, 2012 at 3:39 PM, William J. Schmidt
 wschm...@linux.vnet.ibm.com wrote:
  Jakub, thanks!  Based on this, I believe the patch is correct in its
  handling of the STMT_VINFO_PATTERN_DEF_SEQ logic, without any double
  counting.
 
  I misinterpreted what the commentary for vect_pattern_recog was saying:
  I thought that all replacements were hung off of the last pattern
  statement, but this was just true for the particular example, where only
  one replacement statement was created.  Sorry for any confusion!
 
  Based on the discussion, is the patch OK for trunk?

 But you still count for each of the matched scalar stmts the cost of the
 whole pattern sequence.  No?

 I need to change the commentary to make this more clear now.  My
 comment:  Translate the last statement... is incorrect; this is done
 for each statement in a pattern.

 Per Jakub's explanation, the replacement statements are distributed over
 the original pattern statements.

Ok, looking at the code in tree-vect-patterns.c it seems that
STMT_VINFO_IN_PATTERN_P is only set for the stmt that contains
the first replacement stmt in STMT_VINFO_RELATED_STMT and further
stmts in that stmts STMT_VINFO_PATTERN_DEF_SEQ.  Other
stmts that were matched do not have STMT_VINFO_IN_PATTERN_P set
(but their STMT_VINFO_RELATED_STMT points to the stmt that has
STMT_VINFO_IN_PATTERN_P set).  Possibly the other scalar stmts
participating in the pattern are marked as not relevant (couldn't spot
this quickly).

So it seems that your patch should indeed work as-is.

Thus, ok, if Jakub doesn't spot another error.

Thanks,
Richard.

  Visiting STMT_VINFO_RELATED_STMT for a
 statement marked STMT_VINFO_IN_PATTERN_P will find zero or one
 replacement statements that should be examined.  Visiting
 STMT_VINFO_PATTERN_DEF_SEQ may pick up some leftover replacement
 statements that didn't fit in the 1-1 mapping.  The point is that all of
 these related statements and pattern-def-seq statements are disjoint,
 and Ira's logic is ensuring that they all get examined once.

 It's not a very clean way to represent the replacement of a pattern --
 not how you'd do it if designing from scratch -- but I guess I can see
 how it got this way when the STMT_VINFO_PATTERN_DEF_SEQ was glued onto
 the existing 1-1 mapping.

 Bill


 Richard.

  Thanks,
  Bill
 
  On Fri, 2012-02-10 at 14:44 +0100, Jakub Jelinek wrote:
  On Fri, Feb 10, 2012 at 07:31:01AM -0600, William J. Schmidt wrote:
   From the commentary at the end of tree-vect-patterns.c, only the main
   statement in the pattern (the last one) is flagged as
   STMT_VINFO_IN_PATTERN_P.  So this is finding the new replacement
   statement which has been created but not yet inserted in the code.  It
   only gets counted once.
 
  STMT_VINFO_IN_PATTERN_P is set on the vinfo of the original stmt (and not
  just the last, but all original stmts that are being replaced by the
  pattern).  Each of these has STMT_VINFO_RELATED_STMT set to a pattern stmt
  (last one corresponding to that original stmt).  If needed, further
  pattern stmts for that original stmts are put into
  STMT_VINFO_PATTERN_DEF_SEQ.  The pattern matcher could e.g. match
  3 original stmts, and stick a single pattern stmt into 
  STMT_VINFO_RELATED_STMT
  on the first original stmt, then say 2 pattern stmts into
  STMT_VINFO_PATTERN_DEF_SEQ of the second original stmt plus one
  STMT_VINFO_RELATED_STMT and finally one pattern stmt through
  STMT_VINFO_RELATED_STMT on the third original stmt.
 
   Note that STMT_VINFO_PATTERN_DEF_SEQ doesn't exist in the 4.6 branch, so
   this section has to be omitted if we backport it (which is desirable
   since the degradation was introduced in 4.6).  Removing it apparently
   does not affect the sphinx3 degradation.
 
  Yeah, when backporting you can basically assume that
  STMT_VINFO_PATTERN_DEF_SEQ is always NULL in 4.6 and just write NULL 
  instead
  of itt (and simplify), because no pattern recognizers needed more than one
  pattern stmt for each original stmt back then.
 
        Jakub
 
 




[PATCH] Fix PR50031, take 2

2012-02-09 Thread William J. Schmidt
Following is a revision of yesterday's PR50031 patch submission,
modified per Richard's comments.  Bootstrapped and tested with no
regressions on powerpc64-linux.  I've confirmed the same performance
improvements in SPEC.  OK for trunk?

Thanks,
Bill


2012-02-09  Bill Schmidt  wschm...@linux.vnet.ibm.com
Ira Rosen  i...@il.ibm.com

PR tree-optimization/50031
* targhooks.c (default_builtin_vectorization_cost): Handle
vec_promote_demote.
* target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
* tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
all types of reduction and pattern statements.
(vect_estimate_min_profitable_iters): Likewise.
* tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function.
(vect_get_load_cost): Use vec_perm for permutations; add dump logic
for explicit realigns.
(vectorizable_conversion): Call vect_model_promotion_demotion_cost.
* config/spu/spu.c (spu_builtin_vectorization_cost): Handle
vec_promote_demote.
* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
vec_perm for VSX and handle vec_promote_demote.


Index: gcc/targhooks.c
===
--- gcc/targhooks.c (revision 183944)
+++ gcc/targhooks.c (working copy)
@@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost
   case scalar_to_vec:
   case cond_branch_not_taken:
   case vec_perm:
+  case vec_promote_demote:
 return 1;
 
   case unaligned_load:
Index: gcc/target.h
===
--- gcc/target.h(revision 183944)
+++ gcc/target.h(working copy)
@@ -145,7 +145,8 @@ enum vect_cost_for_stmt
   scalar_to_vec,
   cond_branch_not_taken,
   cond_branch_taken,
-  vec_perm
+  vec_perm,
+  vec_promote_demote
 };
 
 /* The target structure.  This holds all the backend hooks.  */
Index: gcc/tree-vect-loop.c
===
--- gcc/tree-vect-loop.c(revision 183944)
+++ gcc/tree-vect-loop.c(working copy)
@@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
   if (stmt_info
!STMT_VINFO_RELEVANT_P (stmt_info)
(!STMT_VINFO_LIVE_P (stmt_info)
-  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+  !STMT_VINFO_IN_PATTERN_P (stmt_info))
 continue;
 
   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
@@ -2564,15 +2565,48 @@ vect_estimate_min_profitable_iters (loop_vec_info
{
  gimple stmt = gsi_stmt (si);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+ /* Translate the last statement in a pattern into the
+related replacement statement.  */
+ if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+   {
+ stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ stmt_info = vinfo_for_stmt (stmt);
+   }
+
  /* Skip stmts that are not vectorized inside the loop.  */
  if (!STMT_VINFO_RELEVANT_P (stmt_info)
   (!STMT_VINFO_LIVE_P (stmt_info)
- || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+ || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
continue;
+
  vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * 
factor;
  /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
 some of the outside costs are generated inside the outer-loop.  
*/
  vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
+  if (is_pattern_stmt_p (stmt_info)
+  STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))
+{
+ gimple_stmt_iterator gsi;
+ 
+ for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
+  !gsi_end_p (gsi); gsi_next (gsi))
+{
+  gimple pattern_def_stmt = gsi_stmt (gsi);
+  stmt_vec_info pattern_def_stmt_info
+   = vinfo_for_stmt (pattern_def_stmt);
+  if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
+  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
+   {
+  vec_inside_cost
+   += STMT_VINFO_INSIDE_OF_LOOP_COST
+  (pattern_def_stmt_info) * factor;
+  vec_outside_cost
+   += STMT_VINFO_OUTSIDE_OF_LOOP_COST
+  (pattern_def_stmt_info);
+}
+   }
+   }
}
 }
 
Index: