Re: Use nonzero bits to refine range in split_constant_offset (PR 81635)

2018-02-08 Thread Richard Biener
On Thu, Feb 8, 2018 at 1:09 PM, Richard Sandiford wrote:
> Richard Biener writes:
>> On Fri, Feb 2, 2018 at 3:12 PM, Richard Sandiford wrote:
>>> Index: gcc/tree-data-ref.c
>>> ===================================================================
>>> --- gcc/tree-data-ref.c 2018-02-02 14:03:53.964530009 +0000
>>> +++ gcc/tree-data-ref.c 2018-02-02 14:03:54.184521826 +0000
>>> @@ -721,7 +721,13 @@ split_constant_offset_1 (tree type, tree
>>> if (TREE_CODE (tmp_var) != SSA_NAME)
>>>   return false;
>>> wide_int var_min, var_max;
>>> -   if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)
>>> +   value_range_type vr_type = get_range_info (tmp_var, &var_min,
>>> +                                              &var_max);
>>> +   wide_int var_nonzero = get_nonzero_bits (tmp_var);
>>> +   signop sgn = TYPE_SIGN (itype);
>>> +   if (intersect_range_with_nonzero_bits (vr_type, &var_min,
>>> +                                          &var_max, var_nonzero,
>>> +                                          sgn) != VR_RANGE)
>>
>> Above it looks like we could go from VR_RANGE to VR_UNDEFINED.
>> I'm not sure whether the original range info might be useful in this
>> case - if it might be, can we simply use only the range info when it
>> was VR_RANGE?
>
> I think we only drop to VR_UNDEFINED if we have contradictory
> information: nonzero bits says some bits must be clear, but the range
> only contains values for which the bits are set.  In that case I think
> we should either be conservative and not use the information, or be
> aggressive and say that we have undefined behaviour, so overflow is OK.
>
> It seems a bit of a fudge to go back to the old range when we know it's
> false, and use it to allow the split sometimes and not others.

Fine.

> Thanks,
> Richard
>
>>
>> Ok otherwise.
>> Thanks,
>> Richard.
>>
>>>   return false;
>>>
>>> /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)
>>> @@ -729,7 +735,6 @@ split_constant_offset_1 (tree type, tree
>>>operations done in ITYPE.  The addition must overflow
>>>at both ends of the range or at neither.  */
>>> bool overflow[2];
>>> -   signop sgn = TYPE_SIGN (itype);
>>> unsigned int prec = TYPE_PRECISION (itype);
>>> wide_int woff = wi::to_wide (tmp_off, prec);
>>> wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);
>>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-3.c
>>> ===================================================================
>>> --- /dev/null   2018-02-02 09:03:36.168354735 +0000
>>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-3.c    2018-02-02 14:03:54.183521863 +0000
>>> @@ -0,0 +1,62 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
>>> +/* { dg-require-effective-target vect_double } */
>>> +/* { dg-require-effective-target lp64 } */
>>> +
>>> +void
>>> +f1 (double *p, double *q, unsigned int n)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < n; i += 4)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f2 (double *p, double *q, unsigned int n)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < n; i += 2)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f3 (double *p, double *q, unsigned int n)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < n; i += 6)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f4 (double *p, double *q, unsigned int start, unsigned int n)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = start & -2; i < n; i += 2)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 4 "slp1" } } */
>>> Index: 

Re: Use nonzero bits to refine range in split_constant_offset (PR 81635)

2018-02-08 Thread Richard Sandiford
Richard Biener writes:
> On Fri, Feb 2, 2018 at 3:12 PM, Richard Sandiford wrote:
>> Index: gcc/tree-data-ref.c
>> ===================================================================
>> --- gcc/tree-data-ref.c 2018-02-02 14:03:53.964530009 +0000
>> +++ gcc/tree-data-ref.c 2018-02-02 14:03:54.184521826 +0000
>> @@ -721,7 +721,13 @@ split_constant_offset_1 (tree type, tree
>> if (TREE_CODE (tmp_var) != SSA_NAME)
>>   return false;
>> wide_int var_min, var_max;
>> -   if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)
>> +   value_range_type vr_type = get_range_info (tmp_var, &var_min,
>> +                                              &var_max);
>> +   wide_int var_nonzero = get_nonzero_bits (tmp_var);
>> +   signop sgn = TYPE_SIGN (itype);
>> +   if (intersect_range_with_nonzero_bits (vr_type, &var_min,
>> +                                          &var_max, var_nonzero,
>> +                                          sgn) != VR_RANGE)
>
> Above it looks like we could go from VR_RANGE to VR_UNDEFINED.
> I'm not sure whether the original range info might be useful in this
> case - if it might be, can we simply use only the range info when it
> was VR_RANGE?

I think we only drop to VR_UNDEFINED if we have contradictory
information: nonzero bits says some bits must be clear, but the range
only contains values for which the bits are set.  In that case I think
we should either be conservative and not use the information, or be
aggressive and say that we have undefined behaviour, so overflow is OK.

It seems a bit of a fudge to go back to the old range when we know it's
false, and use it to allow the split sometimes and not others.

Thanks,
Richard

>
> Ok otherwise.
> Thanks,
> Richard.
>
>>   return false;
>>
>> /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)
>> @@ -729,7 +735,6 @@ split_constant_offset_1 (tree type, tree
>>operations done in ITYPE.  The addition must overflow
>>at both ends of the range or at neither.  */
>> bool overflow[2];
>> -   signop sgn = TYPE_SIGN (itype);
>> unsigned int prec = TYPE_PRECISION (itype);
>> wide_int woff = wi::to_wide (tmp_off, prec);
>> wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);
>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-3.c
>> ===================================================================
>> --- /dev/null   2018-02-02 09:03:36.168354735 +0000
>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-3.c    2018-02-02 14:03:54.183521863 +0000
>> @@ -0,0 +1,62 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
>> +/* { dg-require-effective-target vect_double } */
>> +/* { dg-require-effective-target lp64 } */
>> +
>> +void
>> +f1 (double *p, double *q, unsigned int n)
>> +{
>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>> +  for (unsigned int i = 0; i < n; i += 4)
>> +    {
>> +      double a = q[i] + p[i];
>> +      double b = q[i + 1] + p[i + 1];
>> +      q[i] = a;
>> +      q[i + 1] = b;
>> +    }
>> +}
>> +
>> +void
>> +f2 (double *p, double *q, unsigned int n)
>> +{
>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>> +  for (unsigned int i = 0; i < n; i += 2)
>> +    {
>> +      double a = q[i] + p[i];
>> +      double b = q[i + 1] + p[i + 1];
>> +      q[i] = a;
>> +      q[i + 1] = b;
>> +    }
>> +}
>> +
>> +void
>> +f3 (double *p, double *q, unsigned int n)
>> +{
>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>> +  for (unsigned int i = 0; i < n; i += 6)
>> +    {
>> +      double a = q[i] + p[i];
>> +      double b = q[i + 1] + p[i + 1];
>> +      q[i] = a;
>> +      q[i + 1] = b;
>> +    }
>> +}
>> +
>> +void
>> +f4 (double *p, double *q, unsigned int start, unsigned int n)
>> +{
>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>> +  for (unsigned int i = start & -2; i < n; i += 2)
>> +    {
>> +      double a = q[i] + p[i];
>> +      double b = q[i + 1] + p[i + 1];
>> +      q[i] = a;
>> +      q[i + 1] = b;
>> +    }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 4 "slp1" } } */
>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-4.c
>> ===================================================================
>> --- /dev/null   2018-02-02 09:03:36.168354735 +0000
>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-4.c    2018-02-02

Re: Use nonzero bits to refine range in split_constant_offset (PR 81635)

2018-02-08 Thread Richard Biener
On Fri, Feb 2, 2018 at 3:12 PM, Richard Sandiford wrote:
> This patch is part 2 of the fix for PR 81635.  It means that
> split_constant_offset can handle loops like:
>
>   for (unsigned int i = 0; i < n; i += 4)
>     {
>       a[i] = ...;
>       a[i + 1] = ...;
>     }
>
> CCP records that "i" must have its low 2 bits clear, but we don't
> include this information in the range of "i", which remains [0, +INF].
> I tried making set_nonzero_bits update the range info in the same
> way that set_range_info updates the nonzero bits, but it regressed
> cases like vrp117.c and made some other tests worse.
>
> vrp117.c has a multiplication by 10, so CCP can infer that the low bit
> of the result is clear.  If we included that in the range, the range
> would go from [-INF, +INF] to [-INF, not-quite-+INF].  However,
> the multiplication is also known to overflow in all cases, so VRP
> saturates the result to [INT_MAX, INT_MAX].  This obviously creates a
> contradiction with the nonzero bits (INT_MAX has its low bit set), and
> intersecting the new saturated range with an existing not-quite-+INF
> range would make us drop to
> VR_UNDEFINED.  We're prepared to fold a comparison with an [INT_MAX,
> INT_MAX] value but not with a VR_UNDEFINED value.
>
> The other problems were created when intersecting [-INF, not-quite-+INF]
> with a useful VR_ANTI_RANGE like ~[-1, 1].  The intersection would
> keep the former range rather than the latter.
>
> The patch therefore keeps the adjustment local to split_constant_offset
> for now, but adds a helper routine so that it's easy to move this later.
>
> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
> OK to install?
>
> Richard
>
>
> 2018-02-02  Richard Sandiford  
>
> gcc/
> PR tree-optimization/81635
> * wide-int.h (wi::round_down_for_mask, wi::round_up_for_mask): Declare.
> * wide-int.cc (wi::round_down_for_mask, wi::round_up_for_mask)
> (test_round_for_mask): New functions.
> (wide_int_cc_tests): Call test_round_for_mask.
> * tree-vrp.h (intersect_range_with_nonzero_bits): Declare.
> * tree-vrp.c (intersect_range_with_nonzero_bits): New function.
> * tree-data-ref.c (split_constant_offset_1): Use it to refine the
> range returned by get_range_info.
>
> gcc/testsuite/
> PR tree-optimization/81635
> * gcc.dg/vect/bb-slp-pr81635-3.c: New test.
> * gcc.dg/vect/bb-slp-pr81635-4.c: Likewise.
>
> Index: gcc/wide-int.h
> ===================================================================
> --- gcc/wide-int.h  2018-02-02 14:03:53.964530009 +0000
> +++ gcc/wide-int.h  2018-02-02 14:03:54.185521788 +0000
> @@ -3308,6 +3308,8 @@ gt_pch_nx (trailing_wide_ints <N> *, voi
>wide_int set_bit_in_zero (unsigned int, unsigned int);
>    wide_int insert (const wide_int &x, const wide_int &y, unsigned int,
>                     unsigned int);
> +  wide_int round_down_for_mask (const wide_int &, const wide_int &);
> +  wide_int round_up_for_mask (const wide_int &, const wide_int &);
>
>    template <typename T>
>    T mask (unsigned int, bool);
> Index: gcc/wide-int.cc
> ===================================================================
> --- gcc/wide-int.cc 2018-02-02 14:03:53.964530009 +0000
> +++ gcc/wide-int.cc 2018-02-02 14:03:54.185521788 +0000
> @@ -2132,6 +2132,70 @@ wi::only_sign_bit_p (const wide_int_ref
>return only_sign_bit_p (x, x.precision);
>  }
>
> +/* Return VAL if VAL has no bits set outside MASK.  Otherwise round VAL
> +   down to the previous value that has no bits set outside MASK.
> +   This rounding wraps for signed values if VAL is negative and
> +   the top bit of MASK is clear.
> +
> +   For example, round_down_for_mask (6, 0xf1) would give 1 and
> +   round_down_for_mask (24, 0xf1) would give 17.  */
> +
> +wide_int
> +wi::round_down_for_mask (const wide_int &val, const wide_int &mask)
> +{
> +  /* Get the bits in VAL that are outside the mask.  */
> +  wide_int extra_bits = wi::bit_and_not (val, mask);
> +  if (extra_bits == 0)
> +    return val;
> +
> +  /* Get a mask that includes the top bit in EXTRA_BITS and is all 1s
> +     below that bit.  */
> +  unsigned int precision = val.get_precision ();
> +  wide_int lower_mask = wi::mask (precision - wi::clz (extra_bits),
> +                                  false, precision);
> +
> +  /* Clear the bits that aren't in MASK, but ensure that all bits
> +     in MASK below the top cleared bit are set.  */
> +  return (val & mask) | (mask & lower_mask);
> +}
> +
> +/* Return VAL if VAL has no bits set outside MASK.  Otherwise round VAL
> +   up to the next value that has no bits set outside MASK.  The rounding
> +   wraps if there are no suitable values greater than VAL.
> +
> +   For example, round_up_for_mask (6, 0xf1) would give 16 and
> +   round_up_for_mask (24, 0xf1) would give 32.  */
> +
> +wide_int
> +wi::round_up_for_mask (const wide_int &val, const wide_int &mask)
> +{
> +  /* 

Use nonzero bits to refine range in split_constant_offset (PR 81635)

2018-02-02 Thread Richard Sandiford
This patch is part 2 of the fix for PR 81635.  It means that
split_constant_offset can handle loops like:

  for (unsigned int i = 0; i < n; i += 4)
    {
      a[i] = ...;
      a[i + 1] = ...;
    }

CCP records that "i" must have its low 2 bits clear, but we don't
include this information in the range of "i", which remains [0, +INF].
I tried making set_nonzero_bits update the range info in the same
way that set_range_info updates the nonzero bits, but it regressed
cases like vrp117.c and made some other tests worse.
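
As an illustration (mine, not part of the original message) of what the
refinement buys for the loop above:

  /* "i" has type unsigned int, so its range is [0, +INF], i.e.
     [0, 0xffffffff], while its nonzero-bits mask is 0xfffffffc (low
     two bits clear).  Rounding the range bounds to that mask tightens
     the range to [0, 0xfffffffc], which is enough to prove that the
     "i + 1" feeding the a[i + 1] access never wraps, so its address
     can be split as (a + i * S) + S for an assumed element size S.  */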

vrp117.c has a multiplication by 10, so CCP can infer that the low bit
of the result is clear.  If we included that in the range, the range
would go from [-INF, +INF] to [-INF, not-quite-+INF].  However,
the multiplication is also known to overflow in all cases, so VRP
saturates the result to [INT_MAX, INT_MAX].  This obviously creates a
contradiction with the nonzero bits (INT_MAX has its low bit set), and
intersecting the new saturated range with an existing not-quite-+INF
range would make us drop to
VR_UNDEFINED.  We're prepared to fold a comparison with an [INT_MAX,
INT_MAX] value but not with a VR_UNDEFINED value.

The other problems were created when intersecting [-INF, not-quite-+INF]
with a useful VR_ANTI_RANGE like ~[-1, 1].  The intersection would
keep the former range rather than the latter.

The patch therefore keeps the adjustment local to split_constant_offset
for now, but adds a helper routine so that it's easy to move this later.
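
A sketch of the helper's behaviour (my outline, not the patch's exact
code; the function in the patch also has to deal with the remaining
value_range_types):

  value_range_type
  intersect_sketch (value_range_type vr_type, wide_int *min, wide_int *max,
                    const wide_int &nonzero_bits, signop sgn)
  {
    if (vr_type != VR_RANGE)
      return vr_type;
    /* Round MAX down to a value compatible with the nonzero bits.
       If the range is then empty, the range and the nonzero bits
       contradict each other.  */
    *max = wi::round_down_for_mask (*max, nonzero_bits);
    if (wi::gt_p (*min, *max, sgn))
      return VR_UNDEFINED;
    /* A compatible value exists, so MIN can safely be rounded up.  */
    *min = wi::round_up_for_mask (*min, nonzero_bits);
    return VR_RANGE;
  }

This is the VR_RANGE-to-VR_UNDEFINED case discussed in the review above.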

Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
OK to install?

Richard


2018-02-02  Richard Sandiford  

gcc/
PR tree-optimization/81635
* wide-int.h (wi::round_down_for_mask, wi::round_up_for_mask): Declare.
* wide-int.cc (wi::round_down_for_mask, wi::round_up_for_mask)
(test_round_for_mask): New functions.
(wide_int_cc_tests): Call test_round_for_mask.
* tree-vrp.h (intersect_range_with_nonzero_bits): Declare.
* tree-vrp.c (intersect_range_with_nonzero_bits): New function.
* tree-data-ref.c (split_constant_offset_1): Use it to refine the
range returned by get_range_info.

gcc/testsuite/
PR tree-optimization/81635
* gcc.dg/vect/bb-slp-pr81635-3.c: New test.
* gcc.dg/vect/bb-slp-pr81635-4.c: Likewise.

Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h  2018-02-02 14:03:53.964530009 +0000
+++ gcc/wide-int.h  2018-02-02 14:03:54.185521788 +0000
@@ -3308,6 +3308,8 @@ gt_pch_nx (trailing_wide_ints <N> *, voi
   wide_int set_bit_in_zero (unsigned int, unsigned int);
   wide_int insert (const wide_int &x, const wide_int &y, unsigned int,
                    unsigned int);
+  wide_int round_down_for_mask (const wide_int &, const wide_int &);
+  wide_int round_up_for_mask (const wide_int &, const wide_int &);
 
   template <typename T>
   T mask (unsigned int, bool);
Index: gcc/wide-int.cc
===================================================================
--- gcc/wide-int.cc 2018-02-02 14:03:53.964530009 +0000
+++ gcc/wide-int.cc 2018-02-02 14:03:54.185521788 +0000
@@ -2132,6 +2132,70 @@ wi::only_sign_bit_p (const wide_int_ref
   return only_sign_bit_p (x, x.precision);
 }
 
+/* Return VAL if VAL has no bits set outside MASK.  Otherwise round VAL
+   down to the previous value that has no bits set outside MASK.
+   This rounding wraps for signed values if VAL is negative and
+   the top bit of MASK is clear.
+
+   For example, round_down_for_mask (6, 0xf1) would give 1 and
+   round_down_for_mask (24, 0xf1) would give 17.  */
+
+wide_int
+wi::round_down_for_mask (const wide_int &val, const wide_int &mask)
+{
+  /* Get the bits in VAL that are outside the mask.  */
+  wide_int extra_bits = wi::bit_and_not (val, mask);
+  if (extra_bits == 0)
+    return val;
+
+  /* Get a mask that includes the top bit in EXTRA_BITS and is all 1s
+     below that bit.  */
+  unsigned int precision = val.get_precision ();
+  wide_int lower_mask = wi::mask (precision - wi::clz (extra_bits),
+                                  false, precision);
+
+  /* Clear the bits that aren't in MASK, but ensure that all bits
+     in MASK below the top cleared bit are set.  */
+  return (val & mask) | (mask & lower_mask);
+}
+
+/* Return VAL if VAL has no bits set outside MASK.  Otherwise round VAL
+   up to the next value that has no bits set outside MASK.  The rounding
+   wraps if there are no suitable values greater than VAL.
+
+   For example, round_up_for_mask (6, 0xf1) would give 16 and
+   round_up_for_mask (24, 0xf1) would give 32.  */
+
+wide_int
+wi::round_up_for_mask (const wide_int &val, const wide_int &mask)
+{
+  /* Get the bits in VAL that are outside the mask.  */
+  wide_int extra_bits = wi::bit_and_not (val, mask);
+  if (extra_bits == 0)
+    return val;
+
+  /* Get a mask that is all 1s above the top bit in EXTRA_BITS.  */
+  unsigned int precision = val.get_precision ();
+  wide_int upper_mask = wi::mask
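
The examples in the two function comments can be checked against a small
standalone model of the rounding semantics (my plain-C stand-ins over
uint64_t, not the wide_int code, which should compute the same values by
a different route):

  #include <assert.h>
  #include <stdint.h>

  /* Model of wi::round_down_for_mask: largest value <= VAL with no
     bits set outside MASK (unsigned interpretation).  */
  static uint64_t
  model_round_down (uint64_t val, uint64_t mask)
  {
    uint64_t extra_bits = val & ~mask;
    if (extra_bits == 0)
      return val;
    /* 1s at and below the topmost bit of VAL that lies outside MASK.  */
    uint64_t lower_mask = ~0ULL >> __builtin_clzll (extra_bits);
    /* Keep the in-mask bits above that bit; saturate the mask bits
       below it.  */
    return (val & mask) | (mask & lower_mask);
  }

  /* Model of wi::round_up_for_mask: smallest value >= VAL with no bits
     set outside MASK, wrapping to 0 if there is none.  */
  static uint64_t
  model_round_up (uint64_t val, uint64_t mask)
  {
    if ((val & ~mask) == 0)
      return val;
    /* Step from the rounded-down value to the next value representable
       under MASK: set the non-mask bits so that the + 1 carries through
       them, then clear them again.  */
    return ((model_round_down (val, mask) | ~mask) + 1) & mask;
  }

  int
  main (void)
  {
    /* The examples from the function comments.  */
    assert (model_round_down (6, 0xf1) == 1);
    assert (model_round_down (24, 0xf1) == 17);
    assert (model_round_up (6, 0xf1) == 16);
    assert (model_round_up (24, 0xf1) == 32);
    return 0;
  }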