Re: [RFC][PR82479] missing popcount builtin detection

2018-06-01 Thread Richard Biener
On Fri, Jun 1, 2018 at 12:06 PM Bin.Cheng  wrote:
>
> On Fri, Jun 1, 2018 at 9:56 AM, Kugan Vivekanandarajah
>  wrote:
> > Hi Bin,
> >
> > Thanks a lo for the review.
> >
> > On 1 June 2018 at 03:45, Bin.Cheng  wrote:
> >> On Thu, May 31, 2018 at 3:51 AM, Kugan Vivekanandarajah
> >>  wrote:
> >>> Hi Bin,
> >>>
> >>> Thanks for the review. Please find the revised patch based on the
> >>> review comments.
> >>>
> >>> Thanks,
> >>> Kugan
> >>>
> >>> On 17 May 2018 at 19:56, Bin.Cheng  wrote:
>  On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
>   wrote:
> > Hi Richard,
> >
> > On 6 March 2018 at 02:24, Richard Biener  
> > wrote:
> >> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
> >>  wrote:
> >>> Hi Richard,
> >>>
> >>
> >> Hi,
> >> Thanks very much for working.
> >>
> >>> +/* Utility function to check if OP is defined by a stmt
> >>> +   that is a val - 1.  If that is the case, set it to STMT.  */
> >>> +
> >>> +static bool
> >>> +ssa_defined_by_and_minus_one_stmt_p (tree op, tree val, gimple **stmt)
> >> This is checking if op is defined as val - 1, so name it as
> >> ssa_defined_by_minus_one_stmt_p?
> >>
> >>> +{
> >>> +  if (TREE_CODE (op) == SSA_NAME
> >>> +  && (*stmt = SSA_NAME_DEF_STMT (op))
> >>> +  && is_gimple_assign (*stmt)
> >>> +  && (gimple_assign_rhs_code (*stmt) == PLUS_EXPR)
> >>> +  && val == gimple_assign_rhs1 (*stmt)
> >>> +  && integer_minus_onep (gimple_assign_rhs2 (*stmt)))
> >>> +return true;
> >>> +  else
> >>> +return false;
> >> You can simply return the boolean condition.
> > Done.
> >
> >>
> >>> +}
> >>> +
> >>> +/* See if LOOP is a popcout implementation of the form
> >> ...
> >>> +  rhs1 = gimple_assign_rhs1 (and_stmt);
> >>> +  rhs2 = gimple_assign_rhs2 (and_stmt);
> >>> +
> >>> +  if (ssa_defined_by_and_minus_one_stmt_p (rhs1, rhs2, &and_minus_one))
> >>> +rhs1 = rhs2;
> >>> +  else if (ssa_defined_by_and_minus_one_stmt_p (rhs2, rhs1, 
> >>> &and_minus_one))
> >>> +;
> >>> +  else
> >>> +return false;
> >>> +
> >>> +  /* Check the recurrence.  */
> >>> +  phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one));
> >> So gimple_assign_rhs1 (and_minus_one) == rhs1 is always true?  Please
> >> use rhs1 directly.
> >
> > Done.
> >>> +  gimple *src_phi = SSA_NAME_DEF_STMT (rhs2);
> >> I think this is checking wrong thing and is redundant.  Either rhs2
> >> equals to rhs1 or is defined as (rhs1 - 1).  For (rhs2 == rhs1) case,
> >> the check duplicates checking on phi; for the latter, it's never a PHI
> >> stmt and shouldn't be checked.
> >>
> >>> +  if (gimple_code (phi) != GIMPLE_PHI
> >>> +  || gimple_code (src_phi) != GIMPLE_PHI)
> >>> +return false;
> >>> +
> >>> +  dest = gimple_assign_lhs (count_stmt);
> >>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> >>> +  tree src = gimple_phi_arg_def (src_phi, loop_preheader_edge 
> >>> (loop)->dest_idx);
> >>> +  if (adjust)
> >>> +iter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest),
> >>> +build_call_expr (fn, 1, src),
> >>> +build_int_cst (TREE_TYPE (dest), 1));
> >>> +  else
> >>> +iter = build_call_expr (fn, 1, src);
> >> Note tree-ssa-loop-niters.c always use unsigned_type_for (IV-type) as
> >> niters type.  Though unsigned type is unnecessary in this case, but
> >> better to follow existing behavior?
> >
> > Done.
> >>
> >>> +  max = int_cst_value (TYPE_MAX_VALUE (TREE_TYPE (dest)));
> >> As richi suggested, max should be the number of bits in type of IV.
> >>
> >>> +
> >>> +  niter->assumptions = boolean_false_node;
> >> Redundant.
> >
> > Not sure I understand. If I remove this,  I am getting ICE
> > (segmentation fault). What is the expectation here?
> Is it a simple typo?  Because assumptions is set to boolean_true_node
> just 5 lines below?
> The niters part looks good for me with this change.  You may need
> richi's approval for other parts?

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index bdf9ba1..4dc156a 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -1485,6 +1485,10 @@ will_be_nonconstant_expr_predicate (struct
ipa_node_params *info,
   nonconstant_names);
   return p2.or_with (summary->conds, p1);
 }
+  else if (TREE_CODE (expr) == CALL_EXPR)
+{
+  return false;
+}
   else
 {
   debug_tree (expr);

I'd return true here instead - we don't want to track popcount() in
predicates.  Also unnecessary braces.

@@ -3492,6 +3494,11 @@ expression_expensive_p (tree expr)
   if (!integer_pow2p (TREE_OPERAND (expr, 1)))
return true;
 }
+  if (code == CALL_EXPR
+  && (fndecl = get_callee_fndecl (expr))
+  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+  && (BUILT_IN_POPCOUNT == DECL_FUNCTION_CODE (fndecl)))
+return false;

please use

if (code == CALL_EXPR)
 {
 if (!is_inexpensive_builtin (get_callee_fndecl (expr)))
   return t

Re: [RFC][PR82479] missing popcount builtin detection

2018-06-01 Thread Bin.Cheng
On Fri, Jun 1, 2018 at 9:56 AM, Kugan Vivekanandarajah
 wrote:
> Hi Bin,
>
> Thanks a lo for the review.
>
> On 1 June 2018 at 03:45, Bin.Cheng  wrote:
>> On Thu, May 31, 2018 at 3:51 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Bin,
>>>
>>> Thanks for the review. Please find the revised patch based on the
>>> review comments.
>>>
>>> Thanks,
>>> Kugan
>>>
>>> On 17 May 2018 at 19:56, Bin.Cheng  wrote:
 On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> On 6 March 2018 at 02:24, Richard Biener  
> wrote:
>> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>
>> Hi,
>> Thanks very much for working.
>>
>>> +/* Utility function to check if OP is defined by a stmt
>>> +   that is a val - 1.  If that is the case, set it to STMT.  */
>>> +
>>> +static bool
>>> +ssa_defined_by_and_minus_one_stmt_p (tree op, tree val, gimple **stmt)
>> This is checking if op is defined as val - 1, so name it as
>> ssa_defined_by_minus_one_stmt_p?
>>
>>> +{
>>> +  if (TREE_CODE (op) == SSA_NAME
>>> +  && (*stmt = SSA_NAME_DEF_STMT (op))
>>> +  && is_gimple_assign (*stmt)
>>> +  && (gimple_assign_rhs_code (*stmt) == PLUS_EXPR)
>>> +  && val == gimple_assign_rhs1 (*stmt)
>>> +  && integer_minus_onep (gimple_assign_rhs2 (*stmt)))
>>> +return true;
>>> +  else
>>> +return false;
>> You can simply return the boolean condition.
> Done.
>
>>
>>> +}
>>> +
>>> +/* See if LOOP is a popcout implementation of the form
>> ...
>>> +  rhs1 = gimple_assign_rhs1 (and_stmt);
>>> +  rhs2 = gimple_assign_rhs2 (and_stmt);
>>> +
>>> +  if (ssa_defined_by_and_minus_one_stmt_p (rhs1, rhs2, &and_minus_one))
>>> +rhs1 = rhs2;
>>> +  else if (ssa_defined_by_and_minus_one_stmt_p (rhs2, rhs1, 
>>> &and_minus_one))
>>> +;
>>> +  else
>>> +return false;
>>> +
>>> +  /* Check the recurrence.  */
>>> +  phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one));
>> So gimple_assign_rhs1 (and_minus_one) == rhs1 is always true?  Please
>> use rhs1 directly.
>
> Done.
>>> +  gimple *src_phi = SSA_NAME_DEF_STMT (rhs2);
>> I think this is checking wrong thing and is redundant.  Either rhs2
>> equals to rhs1 or is defined as (rhs1 - 1).  For (rhs2 == rhs1) case,
>> the check duplicates checking on phi; for the latter, it's never a PHI
>> stmt and shouldn't be checked.
>>
>>> +  if (gimple_code (phi) != GIMPLE_PHI
>>> +  || gimple_code (src_phi) != GIMPLE_PHI)
>>> +return false;
>>> +
>>> +  dest = gimple_assign_lhs (count_stmt);
>>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>>> +  tree src = gimple_phi_arg_def (src_phi, loop_preheader_edge 
>>> (loop)->dest_idx);
>>> +  if (adjust)
>>> +iter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest),
>>> +build_call_expr (fn, 1, src),
>>> +build_int_cst (TREE_TYPE (dest), 1));
>>> +  else
>>> +iter = build_call_expr (fn, 1, src);
>> Note tree-ssa-loop-niters.c always use unsigned_type_for (IV-type) as
>> niters type.  Though unsigned type is unnecessary in this case, but
>> better to follow existing behavior?
>
> Done.
>>
>>> +  max = int_cst_value (TYPE_MAX_VALUE (TREE_TYPE (dest)));
>> As richi suggested, max should be the number of bits in type of IV.
>>
>>> +
>>> +  niter->assumptions = boolean_false_node;
>> Redundant.
>
> Not sure I understand. If I remove this,  I am getting ICE
> (segmentation fault). What is the expectation here?
Is it a simple typo?  Because assumptions is set to boolean_true_node
just 5 lines below?
The niters part looks good for me with this change.  You may need
richi's approval for other parts?

Thanks,
bin
>
>>> +  niter->control.base = NULL_TREE;
>>> +  niter->control.step = NULL_TREE;
>>> +  niter->control.no_overflow = false;
>>> +  niter->niter = iter;
>>> +  niter->assumptions = boolean_true_node;
>>> +  niter->may_be_zero = boolean_false_node;
>>> +  niter->max = max;
>>> +  niter->bound = NULL_TREE;
>>> +  niter->cmp = ERROR_MARK;
>>> +  return true;
>>> +}
>>> +
>>> +
>> Appology if these are nitpickings.
> Thanks for the review. I am happy to make the changes needed to get it
> to how it should be :)
>
> Thanks,
> Kugan
>
>>
>> Thanks,
>> bin


Re: [RFC][PR82479] missing popcount builtin detection

2018-06-01 Thread Kugan Vivekanandarajah
Hi Bin,

Thanks a lo for the review.

On 1 June 2018 at 03:45, Bin.Cheng  wrote:
> On Thu, May 31, 2018 at 3:51 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Bin,
>>
>> Thanks for the review. Please find the revised patch based on the
>> review comments.
>>
>> Thanks,
>> Kugan
>>
>> On 17 May 2018 at 19:56, Bin.Cheng  wrote:
>>> On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 On 6 March 2018 at 02:24, Richard Biener  
 wrote:
> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>
> Hi,
> Thanks very much for working.
>
>> +/* Utility function to check if OP is defined by a stmt
>> +   that is a val - 1.  If that is the case, set it to STMT.  */
>> +
>> +static bool
>> +ssa_defined_by_and_minus_one_stmt_p (tree op, tree val, gimple **stmt)
> This is checking if op is defined as val - 1, so name it as
> ssa_defined_by_minus_one_stmt_p?
>
>> +{
>> +  if (TREE_CODE (op) == SSA_NAME
>> +  && (*stmt = SSA_NAME_DEF_STMT (op))
>> +  && is_gimple_assign (*stmt)
>> +  && (gimple_assign_rhs_code (*stmt) == PLUS_EXPR)
>> +  && val == gimple_assign_rhs1 (*stmt)
>> +  && integer_minus_onep (gimple_assign_rhs2 (*stmt)))
>> +return true;
>> +  else
>> +return false;
> You can simply return the boolean condition.
Done.

>
>> +}
>> +
>> +/* See if LOOP is a popcout implementation of the form
> ...
>> +  rhs1 = gimple_assign_rhs1 (and_stmt);
>> +  rhs2 = gimple_assign_rhs2 (and_stmt);
>> +
>> +  if (ssa_defined_by_and_minus_one_stmt_p (rhs1, rhs2, &and_minus_one))
>> +rhs1 = rhs2;
>> +  else if (ssa_defined_by_and_minus_one_stmt_p (rhs2, rhs1, &and_minus_one))
>> +;
>> +  else
>> +return false;
>> +
>> +  /* Check the recurrence.  */
>> +  phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one));
> So gimple_assign_rhs1 (and_minus_one) == rhs1 is always true?  Please
> use rhs1 directly.

Done.
>> +  gimple *src_phi = SSA_NAME_DEF_STMT (rhs2);
> I think this is checking wrong thing and is redundant.  Either rhs2
> equals to rhs1 or is defined as (rhs1 - 1).  For (rhs2 == rhs1) case,
> the check duplicates checking on phi; for the latter, it's never a PHI
> stmt and shouldn't be checked.
>
>> +  if (gimple_code (phi) != GIMPLE_PHI
>> +  || gimple_code (src_phi) != GIMPLE_PHI)
>> +return false;
>> +
>> +  dest = gimple_assign_lhs (count_stmt);
>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>> +  tree src = gimple_phi_arg_def (src_phi, loop_preheader_edge 
>> (loop)->dest_idx);
>> +  if (adjust)
>> +iter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest),
>> +build_call_expr (fn, 1, src),
>> +build_int_cst (TREE_TYPE (dest), 1));
>> +  else
>> +iter = build_call_expr (fn, 1, src);
> Note tree-ssa-loop-niters.c always use unsigned_type_for (IV-type) as
> niters type.  Though unsigned type is unnecessary in this case, but
> better to follow existing behavior?

Done.
>
>> +  max = int_cst_value (TYPE_MAX_VALUE (TREE_TYPE (dest)));
> As richi suggested, max should be the number of bits in type of IV.
>
>> +
>> +  niter->assumptions = boolean_false_node;
> Redundant.

Not sure I understand. If I remove this,  I am getting ICE
(segmentation fault). What is the expectation here?

>> +  niter->control.base = NULL_TREE;
>> +  niter->control.step = NULL_TREE;
>> +  niter->control.no_overflow = false;
>> +  niter->niter = iter;
>> +  niter->assumptions = boolean_true_node;
>> +  niter->may_be_zero = boolean_false_node;
>> +  niter->max = max;
>> +  niter->bound = NULL_TREE;
>> +  niter->cmp = ERROR_MARK;
>> +  return true;
>> +}
>> +
>> +
> Appology if these are nitpickings.
Thanks for the review. I am happy to make the changes needed to get it
to how it should be :)

Thanks,
Kugan

>
> Thanks,
> bin
From f45179c777d846731d2d899a142c45a36ab35fd1 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah 
Date: Thu, 10 May 2018 21:41:53 +1000
Subject: [PATCH] popcount

Change-Id: I10c1f990e5b9c9900cf7c93678df924f0463b72e
---
 gcc/ipa-fnsummary.c   |   4 +
 gcc/testsuite/gcc.dg/tree-ssa/popcount.c  |  41 
 gcc/testsuite/gcc.dg/tree-ssa/popcount2.c |  29 ++
 gcc/testsuite/gcc.dg/tree-ssa/popcount3.c |  28 ++
 gcc/tree-scalar-evolution.c   |   7 ++
 gcc/tree-ssa-loop-ivopts.c|  10 ++
 gcc/tree-ssa-loop-niter.c | 152 +-
 7 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount3.c

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index bdf9ba1..4dc156a 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -1485,6 +1485,10 @@ will_be_nonconstant_expr_predicate (struct ipa_node_params *info,
 	   nonconstant_names);
   return p2.or_with (summary->conds, p1);
 }
+

Re: [RFC][PR82479] missing popcount builtin detection

2018-05-31 Thread Bin.Cheng
On Thu, May 31, 2018 at 3:51 AM, Kugan Vivekanandarajah
 wrote:
> Hi Bin,
>
> Thanks for the review. Please find the revised patch based on the
> review comments.
>
> Thanks,
> Kugan
>
> On 17 May 2018 at 19:56, Bin.Cheng  wrote:
>> On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> On 6 March 2018 at 02:24, Richard Biener  wrote:
 On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>

Hi,
Thanks very much for working.

> +/* Utility function to check if OP is defined by a stmt
> +   that is a val - 1.  If that is the case, set it to STMT.  */
> +
> +static bool
> +ssa_defined_by_and_minus_one_stmt_p (tree op, tree val, gimple **stmt)
This is checking if op is defined as val - 1, so name it as
ssa_defined_by_minus_one_stmt_p?

> +{
> +  if (TREE_CODE (op) == SSA_NAME
> +  && (*stmt = SSA_NAME_DEF_STMT (op))
> +  && is_gimple_assign (*stmt)
> +  && (gimple_assign_rhs_code (*stmt) == PLUS_EXPR)
> +  && val == gimple_assign_rhs1 (*stmt)
> +  && integer_minus_onep (gimple_assign_rhs2 (*stmt)))
> +return true;
> +  else
> +return false;
You can simply return the boolean condition.

> +}
> +
> +/* See if LOOP is a popcout implementation of the form
...
> +  rhs1 = gimple_assign_rhs1 (and_stmt);
> +  rhs2 = gimple_assign_rhs2 (and_stmt);
> +
> +  if (ssa_defined_by_and_minus_one_stmt_p (rhs1, rhs2, &and_minus_one))
> +rhs1 = rhs2;
> +  else if (ssa_defined_by_and_minus_one_stmt_p (rhs2, rhs1, &and_minus_one))
> +;
> +  else
> +return false;
> +
> +  /* Check the recurrence.  */
> +  phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one));
So gimple_assign_rhs1 (and_minus_one) == rhs1 is always true?  Please
use rhs1 directly.

> +  gimple *src_phi = SSA_NAME_DEF_STMT (rhs2);
I think this is checking wrong thing and is redundant.  Either rhs2
equals to rhs1 or is defined as (rhs1 - 1).  For (rhs2 == rhs1) case,
the check duplicates checking on phi; for the latter, it's never a PHI
stmt and shouldn't be checked.

> +  if (gimple_code (phi) != GIMPLE_PHI
> +  || gimple_code (src_phi) != GIMPLE_PHI)
> +return false;
> +
> +  dest = gimple_assign_lhs (count_stmt);
> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> +  tree src = gimple_phi_arg_def (src_phi, loop_preheader_edge 
> (loop)->dest_idx);
> +  if (adjust)
> +iter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest),
> +build_call_expr (fn, 1, src),
> +build_int_cst (TREE_TYPE (dest), 1));
> +  else
> +iter = build_call_expr (fn, 1, src);
Note tree-ssa-loop-niters.c always use unsigned_type_for (IV-type) as
niters type.  Though unsigned type is unnecessary in this case, but
better to follow existing behavior?

> +  max = int_cst_value (TYPE_MAX_VALUE (TREE_TYPE (dest)));
As richi suggested, max should be the number of bits in type of IV.

> +
> +  niter->assumptions = boolean_false_node;
Redundant.

> +  niter->control.base = NULL_TREE;
> +  niter->control.step = NULL_TREE;
> +  niter->control.no_overflow = false;
> +  niter->niter = iter;
> +  niter->assumptions = boolean_true_node;
> +  niter->may_be_zero = boolean_false_node;
> +  niter->max = max;
> +  niter->bound = NULL_TREE;
> +  niter->cmp = ERROR_MARK;
> +  return true;
> +}
> +
> +
Appology if these are nitpickings.

Thanks,
bin


Re: [RFC][PR82479] missing popcount builtin detection

2018-05-30 Thread Kugan Vivekanandarajah
Hi Bin,

Thanks for the review. Please find the revised patch based on the
review comments.

Thanks,
Kugan

On 17 May 2018 at 19:56, Bin.Cheng  wrote:
> On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 6 March 2018 at 02:24, Richard Biener  wrote:
>>> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 On 1 February 2018 at 23:21, Richard Biener  
 wrote:
> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 31 January 2018 at 21:39, Richard Biener  
>> wrote:
>>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 Thanks for the review.
 On 25 January 2018 at 20:04, Richard Biener 
  wrote:
> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>  wrote:
>> Hi All,
>>
>> Here is a patch for popcount builtin detection similar to LLVM. I
>> would like to queue this for review for next stage 1.
>>
>> 1. This is done part of loop-distribution and effective for -O3 and 
>> above.
>> 2. This does not distribute loop to detect popcount (like
>> memcpy/memmove). I dont think that happens in practice. Please 
>> correct
>> me if I am wrong.
>
> But then it has no business inside loop distribution but instead is
> doing final value
> replacement, right?  You are pattern-matching the whole loop after 
> all.  I think
> final value replacement would already do the correct thing if you
> teached number of
> iteration analysis that niter for
>
>[local count: 955630224]:
>   # b_11 = PHI 
>   _1 = b_11 + -1;
>   b_8 = _1 & b_11;
>   if (b_8 != 0)
> goto ; [89.00%]
>   else
> goto ; [11.00%]
>
>[local count: 850510900]:
>   goto ; [100.00%]

 I am looking into this approach. What should be the scalar evolution
 for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
 and can this be represented with the scev?
>>>
>>> No, it's not affine and thus cannot be represented.  You only need the
>>> scalar evolution of the counting IV which is already handled and
>>> the number of iteration analysis needs to handle the above IV - this
>>> is the missing part.
>> Thanks for the clarification. I am now matching this loop pattern in
>> number_of_iterations_exit when number_of_iterations_exit_assumptions
>> fails. If the pattern matches, I am inserting the _builtin_popcount in
>> the loop preheater and setting the loop niter with this. This will be
>> used by the final value replacement. Is this what you wanted?
>
> No, you shouldn't insert a popcount stmt but instead the niter
> GENERIC tree should be a CALL_EXPR to popcount with the
> appropriate argument.

 Thats what I tried earlier but ran into some ICEs. I wasn't sure if
 niter in tree_niter_desc can take such.

 Attached patch now does this. Also had to add support for CALL_EXPR in
 few places to handle niter with CALL_EXPR. Does this look OK?
>>>
>>> Overall this looks ok - the patch includes changes in places that I don't 
>>> think
>>> need changes such as chrec_convert_1 or extract_ops_from_tree.
>>> The expression_expensive_p change should be more specific than making
>>> all calls inexpensive as well.
>>
>> Changed it.
>>
>>>
>>> The verify_ssa change looks bogus, you do
>>>
>>> +  dest = gimple_phi_result (count_phi);
>>> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
>>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>>> +
>>> +  var = build_call_expr (fn, 1, src);
>>> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
>>> +   build_int_cst (TREE_TYPE (dest), 1));
>>>
>>> why do you allocate a new SSA name here?  It seems unused
>>> as you overwrive 'var' with the CALL_EXPR immediately.
>> Changed now.
>>
>>>
>>> I didn't review the pattern matching thoroughly nor the exact place you
>>> call it.  But
>>>
>>> +  if (check_popcount_pattern (loop, &count))
>>> +   {
>>> + niter->assumptions = boolean_false_node;
>>> + niter->control.base = NULL_TREE;
>>> + niter->control.step = NULL_TREE;
>>> + niter->control.no_overflow = false;
>>> + niter->niter = count;
>>> + niter->assumptions = boolean_true_node;
>>> + niter->may_be_zero = boolean_false_node;
>>> + niter->max = -1;
>>> + niter->bound = NULL_TREE;
>>> + niter->cmp = ERROR_MARK;
>>> + return true;
>>> +   }
>>>
>>> simply setting may_be_zero to false looks fishy.
>> Should I set this to (argument to popcount == zero)?
> No, I think that's unnecessa

Re: [RFC][PR82479] missing popcount builtin detection

2018-05-17 Thread Bin.Cheng
On Thu, May 17, 2018 at 2:39 AM, Kugan Vivekanandarajah
 wrote:
> Hi Richard,
>
> On 6 March 2018 at 02:24, Richard Biener  wrote:
>> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> On 1 February 2018 at 23:21, Richard Biener  
>>> wrote:
 On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> On 31 January 2018 at 21:39, Richard Biener  
> wrote:
>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> Thanks for the review.
>>> On 25 January 2018 at 20:04, Richard Biener 
>>>  wrote:
 On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
  wrote:
> Hi All,
>
> Here is a patch for popcount builtin detection similar to LLVM. I
> would like to queue this for review for next stage 1.
>
> 1. This is done part of loop-distribution and effective for -O3 and 
> above.
> 2. This does not distribute loop to detect popcount (like
> memcpy/memmove). I dont think that happens in practice. Please correct
> me if I am wrong.

 But then it has no business inside loop distribution but instead is
 doing final value
 replacement, right?  You are pattern-matching the whole loop after 
 all.  I think
 final value replacement would already do the correct thing if you
 teached number of
 iteration analysis that niter for

[local count: 955630224]:
   # b_11 = PHI 
   _1 = b_11 + -1;
   b_8 = _1 & b_11;
   if (b_8 != 0)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 850510900]:
   goto ; [100.00%]
>>>
>>> I am looking into this approach. What should be the scalar evolution
>>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>>> and can this be represented with the scev?
>>
>> No, it's not affine and thus cannot be represented.  You only need the
>> scalar evolution of the counting IV which is already handled and
>> the number of iteration analysis needs to handle the above IV - this
>> is the missing part.
> Thanks for the clarification. I am now matching this loop pattern in
> number_of_iterations_exit when number_of_iterations_exit_assumptions
> fails. If the pattern matches, I am inserting the _builtin_popcount in
> the loop preheater and setting the loop niter with this. This will be
> used by the final value replacement. Is this what you wanted?

 No, you shouldn't insert a popcount stmt but instead the niter
 GENERIC tree should be a CALL_EXPR to popcount with the
 appropriate argument.
>>>
>>> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
>>> niter in tree_niter_desc can take such.
>>>
>>> Attached patch now does this. Also had to add support for CALL_EXPR in
>>> few places to handle niter with CALL_EXPR. Does this look OK?
>>
>> Overall this looks ok - the patch includes changes in places that I don't 
>> think
>> need changes such as chrec_convert_1 or extract_ops_from_tree.
>> The expression_expensive_p change should be more specific than making
>> all calls inexpensive as well.
>
> Changed it.
>
>>
>> The verify_ssa change looks bogus, you do
>>
>> +  dest = gimple_phi_result (count_phi);
>> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>> +
>> +  var = build_call_expr (fn, 1, src);
>> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
>> +   build_int_cst (TREE_TYPE (dest), 1));
>>
>> why do you allocate a new SSA name here?  It seems unused
>> as you overwrive 'var' with the CALL_EXPR immediately.
> Changed now.
>
>>
>> I didn't review the pattern matching thoroughly nor the exact place you
>> call it.  But
>>
>> +  if (check_popcount_pattern (loop, &count))
>> +   {
>> + niter->assumptions = boolean_false_node;
>> + niter->control.base = NULL_TREE;
>> + niter->control.step = NULL_TREE;
>> + niter->control.no_overflow = false;
>> + niter->niter = count;
>> + niter->assumptions = boolean_true_node;
>> + niter->may_be_zero = boolean_false_node;
>> + niter->max = -1;
>> + niter->bound = NULL_TREE;
>> + niter->cmp = ERROR_MARK;
>> + return true;
>> +   }
>>
>> simply setting may_be_zero to false looks fishy.
> Should I set this to (argument to popcount == zero)?
No, I think that's unnecessary.  The number of iterations is computed
as: may_be_zero ? 0 : niter;
Here niter is ZERO even when may_be_zero is set to false, and niters
is computed correctly.

I think the point is that may_be_zero is false doesn't imply that
niters is non-zero.

>
>> Try with -fno-tree-loop-ch.
> I chan

Re: [RFC][PR82479] missing popcount builtin detection

2018-05-16 Thread Kugan Vivekanandarajah
Hi Richard,

On 6 March 2018 at 02:24, Richard Biener  wrote:
> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 1 February 2018 at 23:21, Richard Biener  
>> wrote:
>>> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 On 31 January 2018 at 21:39, Richard Biener  
 wrote:
> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> Thanks for the review.
>> On 25 January 2018 at 20:04, Richard Biener  
>> wrote:
>>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>>  wrote:
 Hi All,

 Here is a patch for popcount builtin detection similar to LLVM. I
 would like to queue this for review for next stage 1.

 1. This is done part of loop-distribution and effective for -O3 and 
 above.
 2. This does not distribute loop to detect popcount (like
 memcpy/memmove). I dont think that happens in practice. Please correct
 me if I am wrong.
>>>
>>> But then it has no business inside loop distribution but instead is
>>> doing final value
>>> replacement, right?  You are pattern-matching the whole loop after all. 
>>>  I think
>>> final value replacement would already do the correct thing if you
>>> teached number of
>>> iteration analysis that niter for
>>>
>>>[local count: 955630224]:
>>>   # b_11 = PHI 
>>>   _1 = b_11 + -1;
>>>   b_8 = _1 & b_11;
>>>   if (b_8 != 0)
>>> goto ; [89.00%]
>>>   else
>>> goto ; [11.00%]
>>>
>>>[local count: 850510900]:
>>>   goto ; [100.00%]
>>
>> I am looking into this approach. What should be the scalar evolution
>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>> and can this be represented with the scev?
>
> No, it's not affine and thus cannot be represented.  You only need the
> scalar evolution of the counting IV which is already handled and
> the number of iteration analysis needs to handle the above IV - this
> is the missing part.
 Thanks for the clarification. I am now matching this loop pattern in
 number_of_iterations_exit when number_of_iterations_exit_assumptions
 fails. If the pattern matches, I am inserting the _builtin_popcount in
 the loop preheater and setting the loop niter with this. This will be
 used by the final value replacement. Is this what you wanted?
>>>
>>> No, you shouldn't insert a popcount stmt but instead the niter
>>> GENERIC tree should be a CALL_EXPR to popcount with the
>>> appropriate argument.
>>
>> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
>> niter in tree_niter_desc can take such.
>>
>> Attached patch now does this. Also had to add support for CALL_EXPR in
>> few places to handle niter with CALL_EXPR. Does this look OK?
>
> Overall this looks ok - the patch includes changes in places that I don't 
> think
> need changes such as chrec_convert_1 or extract_ops_from_tree.
> The expression_expensive_p change should be more specific than making
> all calls inexpensive as well.

Changed it.

>
> The verify_ssa change looks bogus, you do
>
> +  dest = gimple_phi_result (count_phi);
> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> +
> +  var = build_call_expr (fn, 1, src);
> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
> +   build_int_cst (TREE_TYPE (dest), 1));
>
> why do you allocate a new SSA name here?  It seems unused
> as you overwrive 'var' with the CALL_EXPR immediately.
Changed now.

>
> I didn't review the pattern matching thoroughly nor the exact place you
> call it.  But
>
> +  if (check_popcount_pattern (loop, &count))
> +   {
> + niter->assumptions = boolean_false_node;
> + niter->control.base = NULL_TREE;
> + niter->control.step = NULL_TREE;
> + niter->control.no_overflow = false;
> + niter->niter = count;
> + niter->assumptions = boolean_true_node;
> + niter->may_be_zero = boolean_false_node;
> + niter->max = -1;
> + niter->bound = NULL_TREE;
> + niter->cmp = ERROR_MARK;
> + return true;
> +   }
>
> simply setting may_be_zero to false looks fishy.
Should I set this to (argument to popcount == zero)?

> Try with -fno-tree-loop-ch.
I changed the pattern matching to handle loop without header copying
too. Looks a bit complicated checking all the conditions. Wondering if
this can be done in a simpler and easier to read way.

>  Also max should not be negative,
> it should be the number of bits in the IV type?

Changed this too.
>
> A related testcase could be that we can completely peel
> a loop like the following which iterates at most 8 times:
>
> int a[8];
> void foo (unsigned char ctrl)
> {
>   int c =

Re: [RFC][PR82479] missing popcount builtin detection

2018-03-08 Thread Kugan Vivekanandarajah
Hi Richard and Bin,

Thanks for your comments. I will revise the patch and post it as soon
as stage-1 opens.

Kugan

On 7 March 2018 at 22:25, Bin.Cheng  wrote:
> On Wed, Mar 7, 2018 at 8:26 AM, Richard Biener
>  wrote:
>> On Tue, Mar 6, 2018 at 5:20 PM, Bin.Cheng  wrote:
>>> On Mon, Mar 5, 2018 at 3:24 PM, Richard Biener
>>>  wrote:
 On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> On 1 February 2018 at 23:21, Richard Biener  
> wrote:
>> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> On 31 January 2018 at 21:39, Richard Biener 
>>>  wrote:
 On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> Thanks for the review.
> On 25 January 2018 at 20:04, Richard Biener 
>  wrote:
>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi All,
>>>
>>> Here is a patch for popcount builtin detection similar to LLVM. I
>>> would like to queue this for review for next stage 1.
>>>
>>> 1. This is done part of loop-distribution and effective for -O3 and 
>>> above.
>>> 2. This does not distribute loop to detect popcount (like
>>> memcpy/memmove). I dont think that happens in practice. Please 
>>> correct
>>> me if I am wrong.
>>
>> But then it has no business inside loop distribution but instead is
>> doing final value
>> replacement, right?  You are pattern-matching the whole loop after 
>> all.  I think
>> final value replacement would already do the correct thing if you
>> teached number of
>> iteration analysis that niter for
>>
>>[local count: 955630224]:
>>   # b_11 = PHI 
>>   _1 = b_11 + -1;
>>   b_8 = _1 & b_11;
>>   if (b_8 != 0)
>> goto ; [89.00%]
>>   else
>> goto ; [11.00%]
>>
>>[local count: 850510900]:
>>   goto ; [100.00%]
>
> I am looking into this approach. What should be the scalar evolution
> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
> and can this be represented with the scev?

 No, it's not affine and thus cannot be represented.  You only need the
 scalar evolution of the counting IV which is already handled and
 the number of iteration analysis needs to handle the above IV - this
 is the missing part.
>>> Thanks for the clarification. I am now matching this loop pattern in
>>> number_of_iterations_exit when number_of_iterations_exit_assumptions
>>> fails. If the pattern matches, I am inserting the _builtin_popcount in
>>> the loop preheater and setting the loop niter with this. This will be
>>> used by the final value replacement. Is this what you wanted?
>>
>> No, you shouldn't insert a popcount stmt but instead the niter
>> GENERIC tree should be a CALL_EXPR to popcount with the
>> appropriate argument.
>
> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
> niter in tree_niter_desc can take such.
>
> Attached patch now does this. Also had to add support for CALL_EXPR in
> few places to handle niter with CALL_EXPR. Does this look OK?

 Overall this looks ok - the patch includes changes in places that I don't 
 think
 need changes such as chrec_convert_1 or extract_ops_from_tree.
 The expression_expensive_p change should be more specific than making
 all calls inexpensive as well.

 The verify_ssa change looks bogus, you do

 +  dest = gimple_phi_result (count_phi);
 +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
 +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
 +
 +  var = build_call_expr (fn, 1, src);
 +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
 +   build_int_cst (TREE_TYPE (dest), 1));

 why do you allocate a new SSA name here?  It seems unused
 as you overwrive 'var' with the CALL_EXPR immediately.

 I didn't review the pattern matching thoroughly nor the exact place you
 call it.  But

 +  if (check_popcount_pattern (loop, &count))
 +   {
 + niter->assumptions = boolean_false_node;
 + niter->control.base = NULL_TREE;
 + niter->control.step = NULL_TREE;
 + niter->control.no_overflow = false;
 + niter->niter = count;
 + niter->assumptions = boolean_true_node;
 + niter->may_be_zero = boolean_false_node;
 + niter->max = -1;
 + niter->bound = NULL_TREE;
 + niter->cmp = ERROR_MARK;
 + return true;
 +   }

 simply settin

Re: [RFC][PR82479] missing popcount builtin detection

2018-03-07 Thread Bin.Cheng
On Wed, Mar 7, 2018 at 8:26 AM, Richard Biener
 wrote:
> On Tue, Mar 6, 2018 at 5:20 PM, Bin.Cheng  wrote:
>> On Mon, Mar 5, 2018 at 3:24 PM, Richard Biener
>>  wrote:
>>> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 On 1 February 2018 at 23:21, Richard Biener  
 wrote:
> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 31 January 2018 at 21:39, Richard Biener  
>> wrote:
>>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 Thanks for the review.
 On 25 January 2018 at 20:04, Richard Biener 
  wrote:
> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>  wrote:
>> Hi All,
>>
>> Here is a patch for popcount builtin detection similar to LLVM. I
>> would like to queue this for review for next stage 1.
>>
>> 1. This is done part of loop-distribution and effective for -O3 and 
>> above.
>> 2. This does not distribute loop to detect popcount (like
>> memcpy/memmove). I dont think that happens in practice. Please 
>> correct
>> me if I am wrong.
>
> But then it has no business inside loop distribution but instead is
> doing final value
> replacement, right?  You are pattern-matching the whole loop after 
> all.  I think
> final value replacement would already do the correct thing if you
> teached number of
> iteration analysis that niter for
>
>[local count: 955630224]:
>   # b_11 = PHI 
>   _1 = b_11 + -1;
>   b_8 = _1 & b_11;
>   if (b_8 != 0)
> goto ; [89.00%]
>   else
> goto ; [11.00%]
>
>[local count: 850510900]:
>   goto ; [100.00%]

 I am looking into this approach. What should be the scalar evolution
 for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
 and can this be represented with the scev?
>>>
>>> No, it's not affine and thus cannot be represented.  You only need the
>>> scalar evolution of the counting IV which is already handled and
>>> the number of iteration analysis needs to handle the above IV - this
>>> is the missing part.
>> Thanks for the clarification. I am now matching this loop pattern in
>> number_of_iterations_exit when number_of_iterations_exit_assumptions
>> fails. If the pattern matches, I am inserting the _builtin_popcount in
>> the loop preheater and setting the loop niter with this. This will be
>> used by the final value replacement. Is this what you wanted?
>
> No, you shouldn't insert a popcount stmt but instead the niter
> GENERIC tree should be a CALL_EXPR to popcount with the
> appropriate argument.

 Thats what I tried earlier but ran into some ICEs. I wasn't sure if
 niter in tree_niter_desc can take such.

 Attached patch now does this. Also had to add support for CALL_EXPR in
 few places to handle niter with CALL_EXPR. Does this look OK?
>>>
>>> Overall this looks ok - the patch includes changes in places that I don't 
>>> think
>>> need changes such as chrec_convert_1 or extract_ops_from_tree.
>>> The expression_expensive_p change should be more specific than making
>>> all calls inexpensive as well.
>>>
>>> The verify_ssa change looks bogus, you do
>>>
>>> +  dest = gimple_phi_result (count_phi);
>>> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
>>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>>> +
>>> +  var = build_call_expr (fn, 1, src);
>>> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
>>> +   build_int_cst (TREE_TYPE (dest), 1));
>>>
>>> why do you allocate a new SSA name here?  It seems unused
>>> as you overwrive 'var' with the CALL_EXPR immediately.
>>>
>>> I didn't review the pattern matching thoroughly nor the exact place you
>>> call it.  But
>>>
>>> +  if (check_popcount_pattern (loop, &count))
>>> +   {
>>> + niter->assumptions = boolean_false_node;
>>> + niter->control.base = NULL_TREE;
>>> + niter->control.step = NULL_TREE;
>>> + niter->control.no_overflow = false;
>>> + niter->niter = count;
>>> + niter->assumptions = boolean_true_node;
>>> + niter->may_be_zero = boolean_false_node;
>>> + niter->max = -1;
>>> + niter->bound = NULL_TREE;
>>> + niter->cmp = ERROR_MARK;
>>> + return true;
>>> +   }
>>>
>>> simply setting may_be_zero to false looks fishy.  Try
>>> with -fno-tree-loop-ch.  Also max should not be negative,
>>> it should be the number of bits in the IV type?
>>>
>>> A related testcase could be that we can completely peel
>>> a loop like the following which iterates at most 8 times:
>>

Re: [RFC][PR82479] missing popcount builtin detection

2018-03-07 Thread Richard Biener
On Tue, Mar 6, 2018 at 5:20 PM, Bin.Cheng  wrote:
> On Mon, Mar 5, 2018 at 3:24 PM, Richard Biener
>  wrote:
>> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> On 1 February 2018 at 23:21, Richard Biener  
>>> wrote:
 On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> On 31 January 2018 at 21:39, Richard Biener  
> wrote:
>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> Thanks for the review.
>>> On 25 January 2018 at 20:04, Richard Biener 
>>>  wrote:
 On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
  wrote:
> Hi All,
>
> Here is a patch for popcount builtin detection similar to LLVM. I
> would like to queue this for review for next stage 1.
>
> 1. This is done part of loop-distribution and effective for -O3 and 
> above.
> 2. This does not distribute loop to detect popcount (like
> memcpy/memmove). I dont think that happens in practice. Please correct
> me if I am wrong.

 But then it has no business inside loop distribution but instead is
 doing final value
 replacement, right?  You are pattern-matching the whole loop after 
 all.  I think
 final value replacement would already do the correct thing if you
 teached number of
 iteration analysis that niter for

[local count: 955630224]:
   # b_11 = PHI 
   _1 = b_11 + -1;
   b_8 = _1 & b_11;
   if (b_8 != 0)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 850510900]:
   goto ; [100.00%]
>>>
>>> I am looking into this approach. What should be the scalar evolution
>>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>>> and can this be represented with the scev?
>>
>> No, it's not affine and thus cannot be represented.  You only need the
>> scalar evolution of the counting IV which is already handled and
>> the number of iteration analysis needs to handle the above IV - this
>> is the missing part.
> Thanks for the clarification. I am now matching this loop pattern in
> number_of_iterations_exit when number_of_iterations_exit_assumptions
> fails. If the pattern matches, I am inserting the _builtin_popcount in
> the loop preheater and setting the loop niter with this. This will be
> used by the final value replacement. Is this what you wanted?

 No, you shouldn't insert a popcount stmt but instead the niter
 GENERIC tree should be a CALL_EXPR to popcount with the
 appropriate argument.
>>>
>>> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
>>> niter in tree_niter_desc can take such.
>>>
>>> Attached patch now does this. Also had to add support for CALL_EXPR in
>>> few places to handle niter with CALL_EXPR. Does this look OK?
>>
>> Overall this looks ok - the patch includes changes in places that I don't 
>> think
>> need changes such as chrec_convert_1 or extract_ops_from_tree.
>> The expression_expensive_p change should be more specific than making
>> all calls inexpensive as well.
>>
>> The verify_ssa change looks bogus, you do
>>
>> +  dest = gimple_phi_result (count_phi);
>> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
>> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>> +
>> +  var = build_call_expr (fn, 1, src);
>> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
>> +   build_int_cst (TREE_TYPE (dest), 1));
>>
>> why do you allocate a new SSA name here?  It seems unused
>> as you overwrive 'var' with the CALL_EXPR immediately.
>>
>> I didn't review the pattern matching thoroughly nor the exact place you
>> call it.  But
>>
>> +  if (check_popcount_pattern (loop, &count))
>> +   {
>> + niter->assumptions = boolean_false_node;
>> + niter->control.base = NULL_TREE;
>> + niter->control.step = NULL_TREE;
>> + niter->control.no_overflow = false;
>> + niter->niter = count;
>> + niter->assumptions = boolean_true_node;
>> + niter->may_be_zero = boolean_false_node;
>> + niter->max = -1;
>> + niter->bound = NULL_TREE;
>> + niter->cmp = ERROR_MARK;
>> + return true;
>> +   }
>>
>> simply setting may_be_zero to false looks fishy.  Try
>> with -fno-tree-loop-ch.  Also max should not be negative,
>> it should be the number of bits in the IV type?
>>
>> A related testcase could be that we can completely peel
>> a loop like the following which iterates at most 8 times:
>>
>> int a[8];
>> void foo (unsigned char ctrl)
>> {
>>   int c = 0;
>>   while (ctrl)
>> {
>>ctrl = ctrl & (ctrl - 1);
>>a[c++] = ctrl;
>> }
>> }
>>
>> This is now st

Re: [RFC][PR82479] missing popcount builtin detection

2018-03-06 Thread Bin.Cheng
On Mon, Mar 5, 2018 at 3:24 PM, Richard Biener
 wrote:
> On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 1 February 2018 at 23:21, Richard Biener  
>> wrote:
>>> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 On 31 January 2018 at 21:39, Richard Biener  
 wrote:
> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> Thanks for the review.
>> On 25 January 2018 at 20:04, Richard Biener  
>> wrote:
>>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>>  wrote:
 Hi All,

 Here is a patch for popcount builtin detection similar to LLVM. I
 would like to queue this for review for next stage 1.

 1. This is done part of loop-distribution and effective for -O3 and 
 above.
 2. This does not distribute loop to detect popcount (like
 memcpy/memmove). I dont think that happens in practice. Please correct
 me if I am wrong.
>>>
>>> But then it has no business inside loop distribution but instead is
>>> doing final value
>>> replacement, right?  You are pattern-matching the whole loop after all. 
>>>  I think
>>> final value replacement would already do the correct thing if you
>>> teached number of
>>> iteration analysis that niter for
>>>
>>>[local count: 955630224]:
>>>   # b_11 = PHI 
>>>   _1 = b_11 + -1;
>>>   b_8 = _1 & b_11;
>>>   if (b_8 != 0)
>>> goto ; [89.00%]
>>>   else
>>> goto ; [11.00%]
>>>
>>>[local count: 850510900]:
>>>   goto ; [100.00%]
>>
>> I am looking into this approach. What should be the scalar evolution
>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>> and can this be represented with the scev?
>
> No, it's not affine and thus cannot be represented.  You only need the
> scalar evolution of the counting IV which is already handled and
> the number of iteration analysis needs to handle the above IV - this
> is the missing part.
 Thanks for the clarification. I am now matching this loop pattern in
 number_of_iterations_exit when number_of_iterations_exit_assumptions
 fails. If the pattern matches, I am inserting the _builtin_popcount in
 the loop preheater and setting the loop niter with this. This will be
 used by the final value replacement. Is this what you wanted?
>>>
>>> No, you shouldn't insert a popcount stmt but instead the niter
>>> GENERIC tree should be a CALL_EXPR to popcount with the
>>> appropriate argument.
>>
>> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
>> niter in tree_niter_desc can take such.
>>
>> Attached patch now does this. Also had to add support for CALL_EXPR in
>> few places to handle niter with CALL_EXPR. Does this look OK?
>
> Overall this looks ok - the patch includes changes in places that I don't 
> think
> need changes such as chrec_convert_1 or extract_ops_from_tree.
> The expression_expensive_p change should be more specific than making
> all calls inexpensive as well.
>
> The verify_ssa change looks bogus, you do
>
> +  dest = gimple_phi_result (count_phi);
> +  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
> +  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> +
> +  var = build_call_expr (fn, 1, src);
> +  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
> +   build_int_cst (TREE_TYPE (dest), 1));
>
> why do you allocate a new SSA name here?  It seems unused
> as you overwrive 'var' with the CALL_EXPR immediately.
>
> I didn't review the pattern matching thoroughly nor the exact place you
> call it.  But
>
> +  if (check_popcount_pattern (loop, &count))
> +   {
> + niter->assumptions = boolean_false_node;
> + niter->control.base = NULL_TREE;
> + niter->control.step = NULL_TREE;
> + niter->control.no_overflow = false;
> + niter->niter = count;
> + niter->assumptions = boolean_true_node;
> + niter->may_be_zero = boolean_false_node;
> + niter->max = -1;
> + niter->bound = NULL_TREE;
> + niter->cmp = ERROR_MARK;
> + return true;
> +   }
>
> simply setting may_be_zero to false looks fishy.  Try
> with -fno-tree-loop-ch.  Also max should not be negative,
> it should be the number of bits in the IV type?
>
> A related testcase could be that we can completely peel
> a loop like the following which iterates at most 8 times:
>
> int a[8];
> void foo (unsigned char ctrl)
> {
>   int c = 0;
>   while (ctrl)
> {
>ctrl = ctrl & (ctrl - 1);
>a[c++] = ctrl;
> }
> }
>
> This is now stage1 material so please update and re-post.  Maybe Bin has
> further suggestions as well.
Sorry for being late on this.  Actually I thought about popcount in
distribution before.  More l

Re: [RFC][PR82479] missing popcount builtin detection

2018-03-05 Thread Richard Biener
On Thu, Feb 8, 2018 at 1:41 AM, Kugan Vivekanandarajah
 wrote:
> Hi Richard,
>
> On 1 February 2018 at 23:21, Richard Biener  
> wrote:
>> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> On 31 January 2018 at 21:39, Richard Biener  
>>> wrote:
 On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
  wrote:
> Hi Richard,
>
> Thanks for the review.
> On 25 January 2018 at 20:04, Richard Biener  
> wrote:
>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi All,
>>>
>>> Here is a patch for popcount builtin detection similar to LLVM. I
>>> would like to queue this for review for next stage 1.
>>>
>>> 1. This is done part of loop-distribution and effective for -O3 and 
>>> above.
>>> 2. This does not distribute loop to detect popcount (like
>>> memcpy/memmove). I dont think that happens in practice. Please correct
>>> me if I am wrong.
>>
>> But then it has no business inside loop distribution but instead is
>> doing final value
>> replacement, right?  You are pattern-matching the whole loop after all.  
>> I think
>> final value replacement would already do the correct thing if you
>> teached number of
>> iteration analysis that niter for
>>
>>[local count: 955630224]:
>>   # b_11 = PHI 
>>   _1 = b_11 + -1;
>>   b_8 = _1 & b_11;
>>   if (b_8 != 0)
>> goto ; [89.00%]
>>   else
>> goto ; [11.00%]
>>
>>[local count: 850510900]:
>>   goto ; [100.00%]
>
> I am looking into this approach. What should be the scalar evolution
> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
> and can this be represented with the scev?

 No, it's not affine and thus cannot be represented.  You only need the
 scalar evolution of the counting IV which is already handled and
 the number of iteration analysis needs to handle the above IV - this
 is the missing part.
>>> Thanks for the clarification. I am now matching this loop pattern in
>>> number_of_iterations_exit when number_of_iterations_exit_assumptions
>>> fails. If the pattern matches, I am inserting the _builtin_popcount in
>>> the loop preheater and setting the loop niter with this. This will be
>>> used by the final value replacement. Is this what you wanted?
>>
>> No, you shouldn't insert a popcount stmt but instead the niter
>> GENERIC tree should be a CALL_EXPR to popcount with the
>> appropriate argument.
>
> Thats what I tried earlier but ran into some ICEs. I wasn't sure if
> niter in tree_niter_desc can take such.
>
> Attached patch now does this. Also had to add support for CALL_EXPR in
> few places to handle niter with CALL_EXPR. Does this look OK?

Overall this looks ok - the patch includes changes in places that I don't think
need changes such as chrec_convert_1 or extract_ops_from_tree.
The expression_expensive_p change should be more specific than making
all calls inexpensive as well.

The verify_ssa change looks bogus, you do

+  dest = gimple_phi_result (count_phi);
+  tree var = make_ssa_name (TREE_TYPE (dest), NULL);
+  tree fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
+
+  var = build_call_expr (fn, 1, src);
+  *niter = fold_build2 (MINUS_EXPR, TREE_TYPE (dest), var,
+   build_int_cst (TREE_TYPE (dest), 1));

why do you allocate a new SSA name here?  It seems unused
as you overwrive 'var' with the CALL_EXPR immediately.

I didn't review the pattern matching thoroughly nor the exact place you
call it.  But

+  if (check_popcount_pattern (loop, &count))
+   {
+ niter->assumptions = boolean_false_node;
+ niter->control.base = NULL_TREE;
+ niter->control.step = NULL_TREE;
+ niter->control.no_overflow = false;
+ niter->niter = count;
+ niter->assumptions = boolean_true_node;
+ niter->may_be_zero = boolean_false_node;
+ niter->max = -1;
+ niter->bound = NULL_TREE;
+ niter->cmp = ERROR_MARK;
+ return true;
+   }

simply setting may_be_zero to false looks fishy.  Try
with -fno-tree-loop-ch.  Also max should not be negative,
it should be the number of bits in the IV type?

A related testcase could be that we can completely peel
a loop like the following which iterates at most 8 times:

int a[8];
void foo (unsigned char ctrl)
{
  int c = 0;
  while (ctrl)
{
   ctrl = ctrl & (ctrl - 1);
   a[c++] = ctrl;
}
}

This is now stage1 material so please update and re-post.  Maybe Bin has
further suggestions as well.

Thanks,
Richard.

> Thanks,
> Kugan
>
>
> gcc/ChangeLog:
>
> 2018-02-08  Kugan Vivekanandarajah  
>
> * gimple-expr.c (extract_ops_from_tree): Handle CALL_EXPR.
> * tree-chrec.c (chrec_convert_1): Likewise.
> * tree-scalar-evolution.c (expression_expensive_p): Likewise.
> * tree-ssa-loop-ivopts.c (contains_abnormal_s

Re: [RFC][PR82479] missing popcount builtin detection

2018-02-07 Thread Kugan Vivekanandarajah
Hi Richard,

On 1 February 2018 at 23:21, Richard Biener  wrote:
> On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> On 31 January 2018 at 21:39, Richard Biener  
>> wrote:
>>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>>  wrote:
 Hi Richard,

 Thanks for the review.
 On 25 January 2018 at 20:04, Richard Biener  
 wrote:
> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>  wrote:
>> Hi All,
>>
>> Here is a patch for popcount builtin detection similar to LLVM. I
>> would like to queue this for review for next stage 1.
>>
>> 1. This is done part of loop-distribution and effective for -O3 and 
>> above.
>> 2. This does not distribute loop to detect popcount (like
>> memcpy/memmove). I dont think that happens in practice. Please correct
>> me if I am wrong.
>
> But then it has no business inside loop distribution but instead is
> doing final value
> replacement, right?  You are pattern-matching the whole loop after all.  
> I think
> final value replacement would already do the correct thing if you
> teached number of
> iteration analysis that niter for
>
>[local count: 955630224]:
>   # b_11 = PHI 
>   _1 = b_11 + -1;
>   b_8 = _1 & b_11;
>   if (b_8 != 0)
> goto ; [89.00%]
>   else
> goto ; [11.00%]
>
>[local count: 850510900]:
>   goto ; [100.00%]

 I am looking into this approach. What should be the scalar evolution
 for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
 and can this be represented with the scev?
>>>
>>> No, it's not affine and thus cannot be represented.  You only need the
>>> scalar evolution of the counting IV which is already handled and
>>> the number of iteration analysis needs to handle the above IV - this
>>> is the missing part.
>> Thanks for the clarification. I am now matching this loop pattern in
>> number_of_iterations_exit when number_of_iterations_exit_assumptions
>> fails. If the pattern matches, I am inserting the _builtin_popcount in
>> the loop preheater and setting the loop niter with this. This will be
>> used by the final value replacement. Is this what you wanted?
>
> No, you shouldn't insert a popcount stmt but instead the niter
> GENERIC tree should be a CALL_EXPR to popcount with the
> appropriate argument.

Thats what I tried earlier but ran into some ICEs. I wasn't sure if
niter in tree_niter_desc can take such.

Attached patch now does this. Also had to add support for CALL_EXPR in
few places to handle niter with CALL_EXPR. Does this look OK?

Thanks,
Kugan


gcc/ChangeLog:

2018-02-08  Kugan Vivekanandarajah  

* gimple-expr.c (extract_ops_from_tree): Handle CALL_EXPR.
* tree-chrec.c (chrec_convert_1): Likewise.
* tree-scalar-evolution.c (expression_expensive_p): Likewise.
* tree-ssa-loop-ivopts.c (contains_abnormal_ssa_name_p): Likewise.
* tree-ssa-loop-niter.c (check_popcount_pattern): New.
(number_of_iterations_exit): Record niter for popcount patern.
* tree-ssa.c (verify_ssa): Check stmt to be non NULL.

gcc/testsuite/ChangeLog:

2018-02-08  Kugan Vivekanandarajah  

* gcc.dg/tree-ssa/popcount.c: New test.


>
>> In general, there is also a condition gating the loop like
>>
>> if (b_4 != 0)
>>   goto loop;
>> else
>>   end:
>>
>> This of course will not be removed by final value replacement. Since
>> popcount (0) is defined, this is redundant and could be removed but
>> not removed.
>
> Yeah, that's probably sth for another pass though.  I suppose the
> end: case just uses zero in which case you'll have a PHI and you
> can optimize
>
>   if (b != 0)
> return popcount (b);
>   return 0;
>
> in phiopt.
>
> Richard.
>
>> Thanks,
>> Kugan
>>
>>>
>>> Richard.
>>>
 Thanks,
 Kugan
>
> is related to popcount (b_5).
>
> Richard.
>
>> Bootstrapped and regression tested on aarch64-linux-gnu with no new 
>> regressions.
>>
>> Thanks,
>> Kugan
>>
>> gcc/ChangeLog:
>>
>> 2018-01-25  Kugan Vivekanandarajah  
>>
>> PR middle-end/82479
>> * tree-loop-distribution.c (handle_popcount): New.
>> (pass_loop_distribution::execute): Use handle_popcount.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2018-01-25  Kugan Vivekanandarajah  
>>
>> PR middle-end/82479
>> * gcc.dg/tree-ssa/popcount.c: New test.
From 296efa2c09cdd797e3295f0c29a5a943dc87e9f2 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah 
Date: Thu, 8 Feb 2018 09:28:57 +1100
Subject: [PATCH] popcount v2

---
 gcc/gimple-expr.c|   3 +-
 gcc/testsuite/gcc.dg/tree-ssa/popcount.c |  41 ++
 gcc/tree-chrec.c |   2 +
 gcc/tree-scalar-evolution.c  |   2 +
 gcc/tree-ssa-loop-ivopts.c   |  10 +++
 gcc/tree-ssa-loop-niter.c| 124

Re: [RFC][PR82479] missing popcount builtin detection

2018-02-01 Thread Richard Biener
On Thu, Feb 1, 2018 at 5:07 AM, Kugan Vivekanandarajah
 wrote:
> Hi Richard,
>
> On 31 January 2018 at 21:39, Richard Biener  
> wrote:
>> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi Richard,
>>>
>>> Thanks for the review.
>>> On 25 January 2018 at 20:04, Richard Biener  
>>> wrote:
 On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
  wrote:
> Hi All,
>
> Here is a patch for popcount builtin detection similar to LLVM. I
> would like to queue this for review for next stage 1.
>
> 1. This is done part of loop-distribution and effective for -O3 and above.
> 2. This does not distribute loop to detect popcount (like
> memcpy/memmove). I dont think that happens in practice. Please correct
> me if I am wrong.

 But then it has no business inside loop distribution but instead is
 doing final value
 replacement, right?  You are pattern-matching the whole loop after all.  I 
 think
 final value replacement would already do the correct thing if you
 teached number of
 iteration analysis that niter for

[local count: 955630224]:
   # b_11 = PHI 
   _1 = b_11 + -1;
   b_8 = _1 & b_11;
   if (b_8 != 0)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 850510900]:
   goto ; [100.00%]
>>>
>>> I am looking into this approach. What should be the scalar evolution
>>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>>> and can this be represented with the scev?
>>
>> No, it's not affine and thus cannot be represented.  You only need the
>> scalar evolution of the counting IV which is already handled and
>> the number of iteration analysis needs to handle the above IV - this
>> is the missing part.
> Thanks for the clarification. I am now matching this loop pattern in
> number_of_iterations_exit when number_of_iterations_exit_assumptions
> fails. If the pattern matches, I am inserting the _builtin_popcount in
> the loop preheater and setting the loop niter with this. This will be
> used by the final value replacement. Is this what you wanted?

No, you shouldn't insert a popcount stmt but instead the niter
GENERIC tree should be a CALL_EXPR to popcount with the
appropriate argument.

> In general, there is also a condition gating the loop like
>
> if (b_4 != 0)
>   goto loop;
> else
>   end:
>
> This of course will not be removed by final value replacement. Since
> popcount (0) is defined, this is redundant and could be removed but
> not removed.

Yeah, that's probably sth for another pass though.  I suppose the
end: case just uses zero in which case you'll have a PHI and you
can optimize

  if (b != 0)
return popcount (b);
  return 0;

in phiopt.

Richard.

> Thanks,
> Kugan
>
>>
>> Richard.
>>
>>> Thanks,
>>> Kugan

 is related to popcount (b_5).

 Richard.

> Bootstrapped and regression tested on aarch64-linux-gnu with no new 
> regressions.
>
> Thanks,
> Kugan
>
> gcc/ChangeLog:
>
> 2018-01-25  Kugan Vivekanandarajah  
>
> PR middle-end/82479
> * tree-loop-distribution.c (handle_popcount): New.
> (pass_loop_distribution::execute): Use handle_popcount.
>
> gcc/testsuite/ChangeLog:
>
> 2018-01-25  Kugan Vivekanandarajah  
>
> PR middle-end/82479
> * gcc.dg/tree-ssa/popcount.c: New test.


Re: [RFC][PR82479] missing popcount builtin detection

2018-01-31 Thread Kugan Vivekanandarajah
Hi Richard,

On 31 January 2018 at 21:39, Richard Biener  wrote:
> On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
>  wrote:
>> Hi Richard,
>>
>> Thanks for the review.
>> On 25 January 2018 at 20:04, Richard Biener  
>> wrote:
>>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>>  wrote:
 Hi All,

 Here is a patch for popcount builtin detection similar to LLVM. I
 would like to queue this for review for next stage 1.

 1. This is done part of loop-distribution and effective for -O3 and above.
 2. This does not distribute loop to detect popcount (like
 memcpy/memmove). I dont think that happens in practice. Please correct
 me if I am wrong.
>>>
>>> But then it has no business inside loop distribution but instead is
>>> doing final value
>>> replacement, right?  You are pattern-matching the whole loop after all.  I 
>>> think
>>> final value replacement would already do the correct thing if you
>>> teached number of
>>> iteration analysis that niter for
>>>
>>>[local count: 955630224]:
>>>   # b_11 = PHI 
>>>   _1 = b_11 + -1;
>>>   b_8 = _1 & b_11;
>>>   if (b_8 != 0)
>>> goto ; [89.00%]
>>>   else
>>> goto ; [11.00%]
>>>
>>>[local count: 850510900]:
>>>   goto ; [100.00%]
>>
>> I am looking into this approach. What should be the scalar evolution
>> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
>> and can this be represented with the scev?
>
> No, it's not affine and thus cannot be represented.  You only need the
> scalar evolution of the counting IV which is already handled and
> the number of iteration analysis needs to handle the above IV - this
> is the missing part.
Thanks for the clarification. I am now matching this loop pattern in
number_of_iterations_exit when number_of_iterations_exit_assumptions
fails. If the pattern matches, I am inserting the _builtin_popcount in
the loop preheater and setting the loop niter with this. This will be
used by the final value replacement. Is this what you wanted?

In general, there is also a condition gating the loop like

if (b_4 != 0)
  goto loop;
else
  end:

This of course will not be removed by final value replacement. Since
popcount (0) is defined, this is redundant and could be removed but
not removed.

Thanks,
Kugan

>
> Richard.
>
>> Thanks,
>> Kugan
>>>
>>> is related to popcount (b_5).
>>>
>>> Richard.
>>>
 Bootstrapped and regression tested on aarch64-linux-gnu with no new 
 regressions.

 Thanks,
 Kugan

 gcc/ChangeLog:

 2018-01-25  Kugan Vivekanandarajah  

 PR middle-end/82479
 * tree-loop-distribution.c (handle_popcount): New.
 (pass_loop_distribution::execute): Use handle_popcount.

 gcc/testsuite/ChangeLog:

 2018-01-25  Kugan Vivekanandarajah  

 PR middle-end/82479
 * gcc.dg/tree-ssa/popcount.c: New test.


Re: [RFC][PR82479] missing popcount builtin detection

2018-01-31 Thread Richard Biener
On Wed, Jan 31, 2018 at 11:28 AM, Kugan Vivekanandarajah
 wrote:
> Hi Richard,
>
> Thanks for the review.
> On 25 January 2018 at 20:04, Richard Biener  
> wrote:
>> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>>  wrote:
>>> Hi All,
>>>
>>> Here is a patch for popcount builtin detection similar to LLVM. I
>>> would like to queue this for review for next stage 1.
>>>
>>> 1. This is done part of loop-distribution and effective for -O3 and above.
>>> 2. This does not distribute loop to detect popcount (like
>>> memcpy/memmove). I dont think that happens in practice. Please correct
>>> me if I am wrong.
>>
>> But then it has no business inside loop distribution but instead is
>> doing final value
>> replacement, right?  You are pattern-matching the whole loop after all.  I 
>> think
>> final value replacement would already do the correct thing if you
>> teached number of
>> iteration analysis that niter for
>>
>>[local count: 955630224]:
>>   # b_11 = PHI 
>>   _1 = b_11 + -1;
>>   b_8 = _1 & b_11;
>>   if (b_8 != 0)
>> goto ; [89.00%]
>>   else
>> goto ; [11.00%]
>>
>>[local count: 850510900]:
>>   goto ; [100.00%]
>
> I am looking into this approach. What should be the scalar evolution
> for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
> and can this be represented with the scev?

No, it's not affine and thus cannot be represented.  You only need the
scalar evolution of the counting IV which is already handled and
the number of iteration analysis needs to handle the above IV - this
is the missing part.

Richard.

> Thanks,
> Kugan
>>
>> is related to popcount (b_5).
>>
>> Richard.
>>
>>> Bootstrapped and regression tested on aarch64-linux-gnu with no new 
>>> regressions.
>>>
>>> Thanks,
>>> Kugan
>>>
>>> gcc/ChangeLog:
>>>
>>> 2018-01-25  Kugan Vivekanandarajah  
>>>
>>> PR middle-end/82479
>>> * tree-loop-distribution.c (handle_popcount): New.
>>> (pass_loop_distribution::execute): Use handle_popcount.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> 2018-01-25  Kugan Vivekanandarajah  
>>>
>>> PR middle-end/82479
>>> * gcc.dg/tree-ssa/popcount.c: New test.


Re: [RFC][PR82479] missing popcount builtin detection

2018-01-31 Thread Kugan Vivekanandarajah
Hi Richard,

Thanks for the review.
On 25 January 2018 at 20:04, Richard Biener  wrote:
> On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
>  wrote:
>> Hi All,
>>
>> Here is a patch for popcount builtin detection similar to LLVM. I
>> would like to queue this for review for next stage 1.
>>
>> 1. This is done part of loop-distribution and effective for -O3 and above.
>> 2. This does not distribute loop to detect popcount (like
>> memcpy/memmove). I dont think that happens in practice. Please correct
>> me if I am wrong.
>
> But then it has no business inside loop distribution but instead is
> doing final value
> replacement, right?  You are pattern-matching the whole loop after all.  I 
> think
> final value replacement would already do the correct thing if you
> teached number of
> iteration analysis that niter for
>
>[local count: 955630224]:
>   # b_11 = PHI 
>   _1 = b_11 + -1;
>   b_8 = _1 & b_11;
>   if (b_8 != 0)
> goto ; [89.00%]
>   else
> goto ; [11.00%]
>
>[local count: 850510900]:
>   goto ; [100.00%]

I am looking into this approach. What should be the scalar evolution
for b_8 (i.e. b & (b -1) in a loop) should be? This is not clear to me
and can this be represented with the scev?

Thanks,
Kugan
>
> is related to popcount (b_5).
>
> Richard.
>
>> Bootstrapped and regression tested on aarch64-linux-gnu with no new 
>> regressions.
>>
>> Thanks,
>> Kugan
>>
>> gcc/ChangeLog:
>>
>> 2018-01-25  Kugan Vivekanandarajah  
>>
>> PR middle-end/82479
>> * tree-loop-distribution.c (handle_popcount): New.
>> (pass_loop_distribution::execute): Use handle_popcount.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2018-01-25  Kugan Vivekanandarajah  
>>
>> PR middle-end/82479
>> * gcc.dg/tree-ssa/popcount.c: New test.


Re: [RFC][PR82479] missing popcount builtin detection

2018-01-25 Thread Richard Biener
On Wed, Jan 24, 2018 at 10:56 PM, Kugan Vivekanandarajah
 wrote:
> Hi All,
>
> Here is a patch for popcount builtin detection similar to LLVM. I
> would like to queue this for review for next stage 1.
>
> 1. This is done part of loop-distribution and effective for -O3 and above.
> 2. This does not distribute loop to detect popcount (like
> memcpy/memmove). I dont think that happens in practice. Please correct
> me if I am wrong.

But then it has no business inside loop distribution but instead is
doing final value
replacement, right?  You are pattern-matching the whole loop after all.  I think
final value replacement would already do the correct thing if you
teached number of
iteration analysis that niter for

   [local count: 955630224]:
  # b_11 = PHI 
  _1 = b_11 + -1;
  b_8 = _1 & b_11;
  if (b_8 != 0)
goto ; [89.00%]
  else
goto ; [11.00%]

   [local count: 850510900]:
  goto ; [100.00%]

is related to popcount (b_5).

Richard.

> Bootstrapped and regression tested on aarch64-linux-gnu with no new 
> regressions.
>
> Thanks,
> Kugan
>
> gcc/ChangeLog:
>
> 2018-01-25  Kugan Vivekanandarajah  
>
> PR middle-end/82479
> * tree-loop-distribution.c (handle_popcount): New.
> (pass_loop_distribution::execute): Use handle_popcount.
>
> gcc/testsuite/ChangeLog:
>
> 2018-01-25  Kugan Vivekanandarajah  
>
> PR middle-end/82479
> * gcc.dg/tree-ssa/popcount.c: New test.


[RFC][PR82479] missing popcount builtin detection

2018-01-24 Thread Kugan Vivekanandarajah
Hi All,

Here is a patch for popcount builtin detection similar to LLVM. I
would like to queue this for review for next stage 1.

1. This is done part of loop-distribution and effective for -O3 and above.
2. This does not distribute loop to detect popcount (like
memcpy/memmove). I dont think that happens in practice. Please correct
me if I am wrong.

Bootstrapped and regression tested on aarch64-linux-gnu with no new regressions.

Thanks,
Kugan

gcc/ChangeLog:

2018-01-25  Kugan Vivekanandarajah  

PR middle-end/82479
* tree-loop-distribution.c (handle_popcount): New.
(pass_loop_distribution::execute): Use handle_popcount.

gcc/testsuite/ChangeLog:

2018-01-25  Kugan Vivekanandarajah  

PR middle-end/82479
* gcc.dg/tree-ssa/popcount.c: New test.
From 9fa09af4b7013c6207e59a4920c82f089bfe45c2 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah 
Date: Wed, 24 Jan 2018 08:50:08 +1100
Subject: [PATCH] pocount builtin detection

Change-Id: Ic6e175f9cc9a69bd417936a4845c2c046fd446b4

Change-Id: I680eb107445660c60a5d38f5d7300ab1a3243bf5

Change-Id: Ia9f0df89e05520091dc7797195098118768c7ac2
---
 gcc/testsuite/gcc.dg/tree-ssa/popcount.c |  41 +
 gcc/tree-loop-distribution.c | 145 +++
 2 files changed, 186 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
new file mode 100644
index 000..86a66cb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+extern int foo (int);
+
+int PopCount (long b) {
+int c = 0;
+b++;
+
+while (b) {
+	b &= b - 1;
+	c++;
+}
+return c;
+}
+int PopCount2 (long b) {
+int c = 0;
+
+while (b) {
+	b &= b - 1;
+	c++;
+}
+foo (c);
+return foo (c);
+}
+
+void PopCount3 (long b1) {
+
+for (long i = 0; i < b1; ++i)
+  {
+	long b = i;
+	int c = 0;
+	while (b) {
+	b &= b - 1;
+	c++;
+	}
+	foo (c);
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index a3d76e4..1060700 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -1585,6 +1585,148 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
   return;
 }
 
+/* See if loop is a popcout implementation of the form
+
+int c = 0;
+while (b) {
+	b = b & (b - 1);
+	c++;
+}
+
+If so, convert this into c = __builtin_popcount (b)
+return true if we did, false otherwise.  */
+
+
+static bool
+handle_popcount (loop_p loop)
+{
+  tree lhs, rhs;
+  tree dest, src;
+  gimple *and_minus_one;
+  int count = 0;
+  gphi *count_phi;
+  gimple *fn_call;
+  gimple *use_stmt;
+  use_operand_p use_p;
+  imm_use_iterator iter;
+  gimple_stmt_iterator gsi;
+
+  /* Check loop terminating branch is like
+ if (b != 0).  */
+  gimple *stmt = last_stmt (loop->header);
+  if (!stmt
+  || gimple_code (stmt) != GIMPLE_COND
+  || !zerop (gimple_cond_rhs (stmt)))
+return false;
+
+  /* Cheeck "b = b & (b - 1)" is calculated.  */
+  lhs = gimple_cond_lhs (stmt);
+  gimple *and_stmt = SSA_NAME_DEF_STMT (lhs);
+  if (gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
+return false;
+  lhs = gimple_assign_rhs1 (and_stmt);
+  rhs = gimple_assign_rhs2 (and_stmt);
+  if (TREE_CODE (lhs) == SSA_NAME
+  && (and_minus_one = SSA_NAME_DEF_STMT (lhs))
+  && is_gimple_assign (and_minus_one)
+  && (gimple_assign_rhs_code (and_minus_one) == PLUS_EXPR)
+  && integer_minus_onep (gimple_assign_rhs2 (and_minus_one)))
+  lhs = rhs;
+  else if (TREE_CODE (rhs) == SSA_NAME
+  && (and_minus_one = SSA_NAME_DEF_STMT (rhs))
+  && is_gimple_assign (and_minus_one)
+  && (gimple_assign_rhs_code (and_minus_one) == PLUS_EXPR)
+  && integer_minus_onep (gimple_assign_rhs2 (and_minus_one)))
+  ;
+  else
+return false;
+  if ((gimple_assign_rhs1 (and_stmt) != gimple_assign_rhs1 (and_minus_one))
+  && (gimple_assign_rhs2 (and_stmt) != gimple_assign_rhs1 (and_minus_one)))
+return false;
+
+  /* Check the recurrence.  */
+  gimple *phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one));
+  gimple *src_phi = SSA_NAME_DEF_STMT (lhs);
+  if (gimple_code (phi) != GIMPLE_PHI
+  || gimple_code (src_phi) != GIMPLE_PHI)
+return false;
+
+  /* Check the loop closed SSA definition for just the variable c defined in
+ loop.  */
+  src = gimple_phi_arg_def (src_phi, loop_preheader_edge (loop)->dest_idx);
+  basic_block bb = single_exit (loop)->dest;
+  for (gphi_iterator gpi = gsi_start_phis (bb);
+   !gsi_end_p (gpi); gsi_next (&gpi))
+{
+  count_phi = gpi.phi ();
+  count++;
+}
+  if (count != 1)
+return false;
+
+  /* Make sure we have a count by one and it starts from zero.  */
+  rhs =