On Fri, Jan 26, 2018 at 3:57 PM, Christophe Lyon
<christophe.l...@linaro.org> wrote:
> On 26 January 2018 at 11:25, Richard Biener <rguent...@suse.de> wrote:
>> On Thu, 25 Jan 2018, Richard Biener wrote:
>>
>>> On Thu, 25 Jan 2018, Marc Glisse wrote:
>>>
>>> > On Thu, 25 Jan 2018, Richard Biener wrote:
>>> >
>>> > > --- gcc/match.pd  (revision 257047)
>>> > > +++ gcc/match.pd  (working copy)
>>> > > @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>> > >      (minus (convert (view_convert:stype @1))
>>> > >       (convert (view_convert:stype @2)))))))
>>> > >
>>> > > +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1).
>>> > > +    Modeled after fold_plusminus_mult_expr.  */
>>> > > +(if (!TYPE_SATURATING (type)
>>> > > +     && (!FLOAT_TYPE_P (type) || flag_associative_math))
>>> > > + (for plusminus (plus minus)
>>> > > +  (simplify
>>> > > +   (plusminus (mult:s @0 @1) (mult:cs @0 @2))
>>> >
>>> > No :c on the first mult, so we don't actually handle A*C+B*C?
>>>
>>> Hmm, I somehow convinced myself that it's only necessary on one
>>> of the mults...  but you are of course right.  Will re-test with
>>> that fixed.
>>
>> This is what I have applied.  Note I had to XFAIL a minor part of
>> gcc.dg/tree-ssa/loop-15.c, namely simplifying the final value
>> replacement down to n * n.  While the patch should enable this
>> the place where the transform could trigger with enough information
>> about n is VRP but that doesn't end up folding all stmts - and
>> that's something I'm not willing to change right now.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied on trunk.
>>
>
> Hi,
>
>
> I think this patch caused regressions on aarch64:
> FAIL: gcc.dg/wmul-1.c scan-tree-dump-not widening_mul "WIDEN_MULT_PLUS_EXPR"

Late forwprop does

@@ -75,7 +21,7 @@
   _1 = (long unsigned int) Idx_6(D);
   _2 = _1 * 40;
   _12 = _1 * 4;
-  _17 = _2 + _12;
+  _17 = _1 * 44;
   _13 = Arr_7(D) + _17;
   MEM[(int[10] *)_13] = 1;
   _4 = _2 + 400;
  _18 = _4 + _12;
  _16 = Arr_7(D) + _18;
  MEM[(int[10] *)_16] = 2;


which I'm not sure ends up profitable at the end.  It makes _12
dead so the total number of multiplications stays the same.
For this we then apply the WIDEN_MULT_PLUS_EXPR two
times given _2 is no longer used multiple times.

The reason the above transform happens is the match.pd :s
behavior which "ignores" :s in case the resulting expression
is "simple".

Can you open a bug report?

Richard.

> Thanks,
>
> Christophe
>
>> Richard.
>>
>> 2018-01-26  Richard Biener  <rguent...@suse.de>
>>
>>         PR tree-optimization/81082
>>         * fold-const.c (fold_plusminus_mult_expr): Do not perform the
>>         association if it requires casting to unsigned.
>>         * match.pd ((A * C) +- (B * C) -> (A+-B) * C): New patterns derived
>>         from fold_plusminus_mult_expr to catch important cases late when
>>         range info is available.
>>
>>         * gcc.dg/vect/pr81082.c: New testcase.
>>         * gcc.dg/tree-ssa/loop-15.c: XFAIL the (int)((unsigned)n + -1U) * n 
>> + n
>>         simplification to n * n.
>>
>> Index: gcc/fold-const.c
>> ===================================================================
>> --- gcc/fold-const.c    (revision 257047)
>> +++ gcc/fold-const.c    (working copy)
>> @@ -7097,7 +7097,7 @@ fold_plusminus_mult_expr (location_t loc
>>
>>    /* Same may be zero and thus the operation 'code' may overflow.  Likewise
>>       same may be minus one and thus the multiplication may overflow.  
>> Perform
>> -     the operations in an unsigned type.  */
>> +     the sum operation in an unsigned type.  */
>>    tree utype = unsigned_type_for (type);
>>    tree tem = fold_build2_loc (loc, code, utype,
>>                               fold_convert_loc (loc, utype, alt0),
>> @@ -7110,9 +7110,9 @@ fold_plusminus_mult_expr (location_t loc
>>      return fold_build2_loc (loc, MULT_EXPR, type,
>>                             fold_convert (type, tem), same);
>>
>> -  return fold_convert_loc (loc, type,
>> -                          fold_build2_loc (loc, MULT_EXPR, utype, tem,
>> -                                           fold_convert_loc (loc, utype, 
>> same)));
>> +  /* Do not resort to unsigned multiplication because
>> +     we lose the no-overflow property of the expression.  */
>> +  return NULL_TREE;
>>  }
>>
>>  /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
>> Index: gcc/match.pd
>> ===================================================================
>> --- gcc/match.pd        (revision 257047)
>> +++ gcc/match.pd        (working copy)
>> @@ -1939,6 +1939,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>       (minus (convert (view_convert:stype @1))
>>             (convert (view_convert:stype @2)))))))
>>
>> +/* (A * C) +- (B * C) -> (A+-B) * C and (A * C) +- A -> A * (C+-1).
>> +    Modeled after fold_plusminus_mult_expr.  */
>> +(if (!TYPE_SATURATING (type)
>> +     && (!FLOAT_TYPE_P (type) || flag_associative_math))
>> + (for plusminus (plus minus)
>> +  (simplify
>> +   (plusminus (mult:cs @0 @1) (mult:cs @0 @2))
>> +   (if (!ANY_INTEGRAL_TYPE_P (type)
>> +        || TYPE_OVERFLOW_WRAPS (type)
>> +        || (INTEGRAL_TYPE_P (type)
>> +           && tree_expr_nonzero_p (@0)
>> +           && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION 
>> (type)))))
>> +    (mult (plusminus @1 @2) @0)))
>> +  /* We cannot generate constant 1 for fract.  */
>> +  (if (!ALL_FRACT_MODE_P (TYPE_MODE (type)))
>> +   (simplify
>> +    (plusminus @0 (mult:cs @0 @2))
>> +    (if (!ANY_INTEGRAL_TYPE_P (type)
>> +        || TYPE_OVERFLOW_WRAPS (type)
>> +        || (INTEGRAL_TYPE_P (type)
>> +            && tree_expr_nonzero_p (@0)
>> +            && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION 
>> (type)))))
>> +     (mult (plusminus { build_one_cst (type); } @2) @0)))
>> +   (simplify
>> +    (plusminus (mult:cs @0 @2) @0)
>> +    (if (!ANY_INTEGRAL_TYPE_P (type)
>> +        || TYPE_OVERFLOW_WRAPS (type)
>> +        || (INTEGRAL_TYPE_P (type)
>> +            && tree_expr_nonzero_p (@0)
>> +            && expr_not_equal_to (@0, wi::minus_one (TYPE_PRECISION 
>> (type)))))
>> +     (mult (plusminus @2 { build_one_cst (type); }) @0))))))
>>
>>  /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax().  */
>>
>> Index: gcc/testsuite/gcc.dg/vect/pr81082.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/vect/pr81082.c (revision 0)
>> +++ gcc/testsuite/gcc.dg/vect/pr81082.c (working copy)
>> @@ -0,0 +1,15 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +
>> +int
>> +f (int *x, int b1, int b2, int b3)
>> +{
>> +  int foo = 0;
>> +  for (int i1 = 0; i1 < b1; ++i1)
>> +    for (int i2 = 0; i2 < b2; ++i2)
>> +      for (int i3 = 0; i3 < b3; ++i3)
>> +       foo += x[i1 * b2 * b3 + i2 * b3 + (i3 - 1)];
>> +  return foo;
>> +}
>> +
>> +/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } 
>> */
>> Index: gcc/testsuite/gcc.dg/tree-ssa/loop-15.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/tree-ssa/loop-15.c     (revision 257048)
>> +++ gcc/testsuite/gcc.dg/tree-ssa/loop-15.c     (working copy)
>> @@ -19,7 +19,7 @@ int bla(void)
>>  }
>>
>>  /* Since the loop is removed, there should be no addition.  */
>> -/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } 
>> } } */
>>  /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */
>>
>>  /* The if from the loop header copying remains in the code.  */

Reply via email to