Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-16 Thread luoxhu via Gcc-patches



On 2020/9/15 14:51, Richard Biener wrote:


>> I only see VAR_DECL and PARM_DECL, is there any function to check the tree
>> variable is global? I added DECL_REGISTER, but the RTL still expands to 
>> stack:
> 
> is_global_var () or alternatively !auto_var_in_fn_p (), I think doing
> IFN_SET only
> makes sense if there's the chance we can promote the variable to a
> register.  But it
> would be an incorrect transform (it stores the whole vector) if the
> vector storage
> could "escape" to another thread - which means you probably have to check
> !TREE_ADDRESSABLE as well.
> 

The tree of param "u" is marked ADDRESSABLE when generating
"VIEW_CONVERT_EXPR(D.3190)[_1] = i;", so if !TREE_ADDRESSABLE is checked,
no IFN_SET will be produced in gimple-isel.


#1  0x1066c700 in convert_vector_to_array_for_subscript (loc=5307072, vecp=0x7fffc5d0, index=) at ../../gcc/gcc/c-family/c-common.c:8169
#2  0x10553b54 in build_array_ref (loc=5307072, array=, index=<trunc_mod_expr 0x759c73a0>) at ../../gcc/gcc/c/c-typeck.c:2668
#3  0x105c8824 in c_parser_postfix_expression_after_primary (parser=0x77f703f0, expr_loc=5307040, expr=...) at ../../gcc/gcc/c/c-parser.c:10494
#4  0x105c7570 in c_parser_postfix_expression (parser=0x77f703f0) at ../../gcc/gcc/c/c-parser.c:10216

>>
>> My current implementation does:
>>
>> 1)  v = vec_insert (i, u, n);
>>
>> =>gimple:
>> {
>>register __vector signed int D.3190;
>>D.3190 = u;// *new decl and copy u first.*
>>_1 = n & 3;
>>VIEW_CONVERT_EXPR(D.3190)[_1] = i;   // *update op0 of 
>> VIEW_CONVERT_EXPR*
>>_2 = D.3190;
>>...
>> }
>>
>> =>isel:
>> {
>>register __vector signed int D.3190;
>>D.3190 = u_4(D);
>>_1 = n_6(D) & 3;
>>.VEC_SET (&D.3190, i_7(D), _1);
> 
> why are you passing the address of D.3190 to .VEC_SET?  That will not
> make D.3190
> be expanded to a pseudo.   You really need to have GIMPLE registers
> here (SSA name)
> and thus a return value, leaving the argument unmodified
> 
>D.3190_3 = .VEC_SET (D.3190_4, i_7(D), _1);
> 
> note this is why I asked about the actual CPU instruction - as I read
> Seghers mail
> the instruction modifies a vector register, not memory.
> 

Updated the code and got the expected gimple-isel output and ASM for both cases:

pr79251.c.236t.isel:

__attribute__((noinline))
test (__vector signed int u, int i, size_t n)
{
  long unsigned int _1;
  __vector signed int _6;
  vector(4) int _7;
  vector(4) int vec_set_dst_8;

   [local count: 1073741824]:
  _1 = n_2(D) & 3;
  _7 = u;
  vec_set_dst_8 = .VEC_SET (_7, i_4(D), _1);
  u = vec_set_dst_8;
  _6 = u;
  return _6;

}

But tree variable "u" need to be set to "TREE_ADDRESSABLE (view_op0) = 0;"
(Maybe check IFN VEC_SET and set to 0 in discover_nonconstant_array_refs
is better later.) after generating the IFN VEC_SET, otherwise, "u" will still
be expanded to stack since expander:
 "Replacing Expressions:  _7 replace with --> _7 = u;".

Setting "u" to non-addressable also seems really unreasonable as for below
case, as u[n+1] need be ADDRESSABLE:

__attribute__ ((noinline)) vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
 u[n % 4] = i;
 u[n+1] = i+1;
 return u;
}

=> gimple-isel with both VEC_SET and VIEW_CONVERT_EXPR:

test (__vector signed int u, int i, size_t n)
{
  long unsigned int _1;
  long unsigned int _2;
  int _3;
  __vector signed int _9;
  vector(4) int _10;
  vector(4) int vec_set_dst_11;

   [local count: 1073741824]:
  _1 = n_4(D) & 3;
  _10 = u;
  vec_set_dst_11 = .VEC_SET (_10, i_6(D), _1);
  u = vec_set_dst_11;
  _2 = n_4(D) + 1;
  _3 = i_6(D) + 1;
  VIEW_CONVERT_EXPR(u)[_2] = _3;
  _9 = u;
  return _9;

}


The code below generates the IFN call; create_tmp_reg or
create_tmp_reg_ssa_name
doesn't seem to create a variable that will be allocated to a register?


diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..a699022cd09 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
...
+  if (!is_global_var (view_op0)
+ && TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE
+ && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))
+ && tree_to_uhwi (TYPE_SIZE (TREE_TYPE (view_op0))) == 128
+ && determine_value_range (pos, &minv, &maxv) == VR_RANGE
+ && wi::geu_p (minv, 0)
+ && wi::leu_p (maxv, (128 / GET_MODE_BITSIZE (innermode
+   {
+ location_t loc = gimple_location (stmt);
+ tree var_src = create_tmp_reg (TREE_TYPE (view_op0));
+ tree var_dst
+   = make_temp_ssa_name (TREE_TYPE (view_op0), NULL, "vec_set_dst");
+ TREE_ADDRESSABLE (view_op0) = 0;
+
+ ass_stmt = gimple_build_assign (var_src, view_op0);
+ gimple_set_location (ass_stmt, loc);
+ gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+ new_stmt
+   = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+
+ gimple_call_set_lhs (new_stmt, var_dst);
+
+ 

Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-14 Thread luoxhu via Gcc-patches



On 2020/9/14 17:47, Richard Biener wrote:

On Mon, Sep 14, 2020 at 10:05 AM luoxhu  wrote:



Not sure whether this reflects the issues you discussed above.
I constructed the test cases below and tested with and without this patch;
only for "a+c" (which means store only) does the performance get worse with
this patch, due to the extra load/store (about 50% slower).
For the "v = vec_insert (i, v, n);" usage, v is always loaded after the store, so this
patch always gets a big benefit.
But for the "v[n % 4] = i;" usage, it depends on whether "v" is used immediately
inside the function or outside of the function soon.  Does this mean unifying the two
usages into the same gimple code is not a good idea sometimes?  Or is it possible to
detect whether the destination "v" of the generated IFN ".VEC_INSERT (&v, i_4(D), _1);" is
used not far away, inside or outside of the function?


#define TYPE int

vector TYPE v = {1, 2, 3, 4};   // a. global vector.
vector TYPE s = {4, 3, 2, 1};

__attribute__ ((noinline))
vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
   v[n % 4] = i;


^^^

this should be

u[n % 4] = i;

I guess.  Is the % 4 mandated by the vec_insert semantics btw?


Yes.  Segher pasted the builtin description in his reply.  "v = vec_insert (i, u, n);"
is a bit different from "u[n % 4] = i;" since it returns a copy of u instead of
modifying u.  The adjustment when lowering __builtin_vec_insert_xxx to gimple
makes a copy of u first to meet that requirement.
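
In C terms, a minimal sketch of the difference is something like this (my own
illustration of the semantics described above; the function names are made up):

#include <altivec.h>
#include <stddef.h>
#define TYPE int

__attribute__ ((noinline)) vector TYPE
via_builtin (vector TYPE u, TYPE i, size_t n)
{
  /* vec_insert returns a modified copy; the lowering therefore copies
     u into a temporary first, and the caller's u is left untouched.  */
  return vec_insert (i, u, n);
}

__attribute__ ((noinline)) vector TYPE
via_subscript (vector TYPE u, TYPE i, size_t n)
{
  /* The subscript form modifies (the local copy of) u in place.  */
  u[n % 4] = i;
  return u;
}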



If you tested with the above error you probably need to re-measure.


No, I did test with u as a local instead of a global before.  If u is not used very
soon, the performance is almost the same whether we generate a single store or an
IFN_SET insert with a variable index.

source:
__attribute__ ((noinline)) vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
 u[n % 4] = i;
 vector TYPE r = {0};
 for (long k = 0; k < 100; k++)
   {
 r += v;
   }
 return u+r;
}

=> store hit load is relieved due to long distance.

ASM:
0:  addis 2,12,.TOC.-.LCF0@ha
   addi 2,2,.TOC.-.LCF0@l
   .localentry test,.-test
   addis 10,2,.LANCHOR0@toc@ha
   li 8,50
   xxspltib 32,0
   addi 9,1,-16
   rldic 6,6,2,60
   stxv 34,-16(1)
   addi 10,10,.LANCHOR0@toc@l
   mtctr 8
   xxlor 33,32,32
   stwx 5,9,6  // short store
   lxv 45,0(10)
   .p2align 4,,15
.L2:
   vadduwm 0,0,13
   vadduwm 1,1,13
   bdnz .L2
   vadduwm 0,0,1
   lxv 33,-16(1)   // wide load
   vadduwm 2,0,1
   blr


Then I intend to use "v" there for global memory insert test to separate
the store and load into different function ("v" is stored in function,
but loaded out of function will get different performance for single store
or lxv+xxperm+xxsel+stxv, the inner function doesn't know the distance
between load and store across functions).

Do you mean that we don't need generate IFN_SET if "v" is global memory?
I only see VAR_DECL and PARM_DECL, is there any function to check the tree
variable is global? I added DECL_REGISTER, but the RTL still expands to stack:

gcc/internal-fn.c: rtx to_rtx = expand_expr (view_op0, NULL_RTX, VOIDmode, 
EXPAND_WRITE);

(gdb) p view_op0
$584 = 
(gdb) p DECL_REGISTER(view_op0)
$585 = 1
(gdb) p to_rtx
$586 = (rtx_def *) (mem/c:V4SI (reg/f:DI 112 virtual-stack-vars) [1 D.3190+0 
S16 A128])




On gimple the above function (after fixing it) looks like

   VIEW_CONVERT_EXPR(u)[_1] = i_4(D);

and the IFN idea I had would - for non-global memory 'u' only - transform
this to

   vector_register_2 = u;
   vector_register_3 = .IFN_VEC_SET (vector_register_2, _1, i_4(D));
   u = vector_register_3;

if vec_set can handle variable indexes.  This then becomes a
vec_set on a register and if that was the only variable indexing of 'u'
will also cause 'u' to be expanded as register rather than stack memory.

Note we can't use the direct-optab method here since the vec_set optab
modifies operand 0 which isn't possible in SSA form.  That might hint
at that we eventually want to extend BIT_INSERT_EXPR to handle
a non-constant bit position but for experiments using an alternate
internal function is certainly easier.



My current implementation does:

1)  v = vec_insert (i, u, n);

=>gimple:
{
 register __vector signed int D.3190;
 D.3190 = u;// *new decl and copy u first.*
 _1 = n & 3;
 VIEW_CONVERT_EXPR(D.3190)[_1] = i;   // *update op0 of 
VIEW_CONVERT_EXPR*
 _2 = D.3190;
 ...
}

=>isel:
{
 register __vector signed int D.3190;
 D.3190 = u_4(D);
 _1 = n_6(D) & 3;
 .VEC_SET (&D.3190, i_7(D), _1);
 _2 = D.3190;
 ...
}


2) u[n % 4] = i;

=>gimple:
{
 __vector signed int D.3191;
 _1 = n & 3;
 VIEW_CONVERT_EXPR(u)[_1] = i;   // *update op0 of VIEW_CONVERT_EXPR*
 D.3191 = u;   
...

}

=>isel:
{
 D.3190 = u_4(D);
 _1 = n_6(D) & 3;
 .VEC_SET (&D.3190, i_7(D), _1);
 _2 = D.3190;
 v = _2;
}

The IFN ".VEC_SET" behavio

Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-14 Thread luoxhu via Gcc-patches



On 2020/9/10 18:08, Richard Biener wrote:
> On Wed, Sep 9, 2020 at 6:03 PM Segher Boessenkool
>  wrote:
>>
>> On Wed, Sep 09, 2020 at 04:28:19PM +0200, Richard Biener wrote:
>>> On Wed, Sep 9, 2020 at 3:49 PM Segher Boessenkool
>>>  wrote:

 Hi!

 On Tue, Sep 08, 2020 at 10:26:51AM +0200, Richard Biener wrote:
> Hmm, yeah - I guess that's what should be addressed first then.
> I'm quite sure that in case 'v' is not on the stack but in memory like
> in my case a SImode store is better than what we get from
> vec_insert - in fact vec_insert will likely introduce a RMW cycle
> which is prone to inserting store-data-races?

 The other way around -- if it is in memory, and was stored as vector
 recently, then reading back something shorter from it is prone to
 SHL/LHS problems.  There is nothing special about the stack here, except
 of course it is more likely to have been stored recently if on the
 stack.  So it depends how often it has been stored recently which option
 is best.  On newer CPUs, although they can avoid SHL/LHS flushes more
 often, the penalty is relatively bigger, so memory does not often win.

 I.e.: it needs to be measured.  Intuition is often wrong here.
>>>
>>> But the current method would simply do a direct store to memory
>>> without a preceeding read of the whole vector.
>>
>> The problem is even worse the other way: you do a short store here, but
>> so a full vector read later.  If the store and read are far apart, that
>> is fine, but if they are close (that is on the order of fifty or more
>> insns), there can be problems.
> 
> Sure, but you can't simply load/store a whole vector when the code didn't
> unless you know it will not introduce data races and it will not trap (thus
> the whole vector needs to be at least naturally aligned).
> 
> Also if there's a preceeding short store you will now load the whole vector
> to avoid the short store ... catch-22
> 
>> There often are problems over function calls (where the compiler cannot
>> usually *see* how something is used).
> 
> Yep.  The best way would be to use small loads and larger stores
> which is what CPUs usually tend to handle fine (with alignment
> constraints, etc.).  Of course that's not what either of the "solutions"
> can do.
> 
> That said, since you seem to be "first" in having an instruction
> to insert into a vector at a variable position the idea that we'd
> have to spill anyway for this to be expanded and thus we expand
> the vector to a stack location in the first place falls down.  And
> that's where I'd first try to improve things.
> 
> So what can the CPU actually do?
> 

Not sure whether this reflects the issues you discussed above.
I constructed the test cases below and tested with and without this patch;
only for "a+c" (which means store only) does the performance get worse with
this patch, due to the extra load/store (about 50% slower).
For the "v = vec_insert (i, v, n);" usage, v is always loaded after the store, so this
patch always gets a big benefit.
But for the "v[n % 4] = i;" usage, it depends on whether "v" is used immediately
inside the function or outside of the function soon.  Does this mean unifying the two
usages into the same gimple code is not a good idea sometimes?  Or is it possible to
detect whether the destination "v" of the generated IFN ".VEC_INSERT (&v, i_4(D), _1);" is
used not far away, inside or outside of the function?


#define TYPE int

vector TYPE v = {1, 2, 3, 4};   // a. global vector.
vector TYPE s = {4, 3, 2, 1};

__attribute__ ((noinline))
vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
  v[n % 4] = i;
  return u;
}

int main ()
{
  //vector TYPE v = {1, 2, 3, 4};// b. local vector.
  //vector TYPE s = {4, 3, 2, 1};
  vector TYPE r = {0};
  for (long k = 0; k < 3989478945; k++)
{
r += test (s, 254.0f, k);  // c. access different vector. store only.
   //r += test (v, 254.0f, k);  // d. access same vector. load after store 
in callee function.
  //test (s, 254.0f, k); r += v;   //e. load after store out of function.
}
  return r[0];
}


Thanks,
Xionghu



Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-08 Thread luoxhu via Gcc-patches



On 2020/9/8 16:26, Richard Biener wrote:
>> Seems not only pseudo, for example "v = vec_insert (i, v, n);"
>> the vector variable will be store to stack first, then [r112:DI] is a
>> memory here to be processed.  So the patch loads it from stack(insn #10) to
>> temp vector register first, and store to stack again(insn #24) after
>> rs6000_vector_set_var.
> Hmm, yeah - I guess that's what should be addressed first then.
> I'm quite sure that in case 'v' is not on the stack but in memory like
> in my case a SImode store is better than what we get from
> vec_insert - in fact vec_insert will likely introduce a RMW cycle
> which is prone to inserting store-data-races?

Yes, for your case there is no stack operation, and to_rtx is expanded
with BLKmode instead of V4SImode.  Adding a to_rtx mode check could work around
it.  The ASM doesn't show a store-hit-load issue.
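
For reference, the case referred to here is presumably the struct example from
earlier in the thread, reconstructed here to match the dump below (my
reconstruction, not the verbatim test case):

struct X { __vector signed int v; __vector signed int u; } x;

void
test (int i, int a)
{
  x.u[i % 4] = a;   /* global memory: to_rtx is BLKmode, not V4SImode */
}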

optimized:

_1 = i_2(D) % 4;
VIEW_CONVERT_EXPR(x.u)[_1] = a_4(D);

expand:
2: r118:DI=%3:DI
3: r119:DI=%4:DI
4: NOTE_INSN_FUNCTION_BEG
7: r120:DI=unspec[`*.LANCHOR0',%2:DI] 47
  REG_EQUAL `*.LANCHOR0'
8: r122:SI=r118:DI#0
9: {r124:SI=r122:SI/0x4;clobber ca:SI;}
   10: r125:SI=r124:SI<<0x2
   11: r123:SI=r122:SI-r125:SI
  REG_EQUAL r122:SI%0x4
   12: r126:DI=sign_extend(r123:SI)
   13: r127:DI=r126:DI+0x4
   14: r128:DI=r127:DI<<0x2
   15: r129:DI=r120:DI+r128:DI
   16: [r129:DI]=r119:DI#0

 p to_rtx
$319 = (rtx_def *) (mem/c:BLK (reg/f:DI 120) [2 x+0 S32 A128])

asm:
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry test,.-test
srawi 9,3,2
addze 9,9
addis 10,2,.LANCHOR0@toc@ha
addi 10,10,.LANCHOR0@toc@l
slwi 9,9,2
subf 9,9,3
extsw 9,9
addi 9,9,4
sldi 9,9,2
stwx 4,10,9
blr


> 
> So - what we need to "fix" is cfgexpand.c marking variably-indexed
> decls as not to be expanded as registers (see
> discover_nonconstant_array_refs).
> 
> I guess one way forward would be to perform instruction
> selection on GIMPLE here and transform
> 
> VIEW_CONVERT_EXPR(D.3185)[_1] = i_6(D)
> 
> to a (direct) internal function based on the vec_set optab.  

I don't quite understand what you mean here.  Do you mean:
ALTIVEC_BUILTIN_VEC_INSERT -> VIEW_CONVERT_EXPR -> internal function -> vec_set
or ALTIVEC_BUILTIN_VEC_INSERT -> internal function -> vec_set?
And in which pass is it acceptable to put the selection and transform?
Why call it *based on* the vec_set optab?  The VIEW_CONVERT_EXPR or internal
function
is expanded to the vec_set optab.

I guess you suggest adding an internal function for VIEW_CONVERT_EXPR in gimple,
and doing the transform from the internal function to the vec_set optab in the expander?
I doubt my understanding, as this looks really over-complicated since we
transform from VIEW_CONVERT_EXPR to the vec_set optab directly so far...
IIUC, an internal function doesn't seem to help much here, as Segher said before.


> But then in GIMPLE D.3185 is also still memory (we don't have a variable
> index partial register set operation - BIT_INSERT_EXPR is
> currently specified to receive a constant bit position only).
> 
> At which point after your patch is the stack storage elided?
> 

Stack storage is elided by register reload pass in RTL.


Thanks,
Xionghu


Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-08 Thread luoxhu via Gcc-patches
Hi Richi,

On 2020/9/7 19:57, Richard Biener wrote:
> +  if (TREE_CODE (to) == ARRAY_REF)
> +   {
> + tree op0 = TREE_OPERAND (to, 0);
> + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
> + && expand_view_convert_to_vec_set (to, from, to_rtx))
> +   {
> + pop_temp_slots ();
> + return;
> +   }
> +   }
> 
> you're placing this at an awkward spot IMHO, after to_rtx expansion
> but disregading parts of  it and compensating just with 'to' matching.
> Is the pieces (offset, bitpos) really too awkward to work with for
> matching?
> 
> Because as written you'll miscompile
> 
> struct X { _vector signed int v; _vector singed int u; } x;
> 
> test(int i, int a)
> {
>x.u[i] = a;
> }
> 
> as I think you'll end up assigning to x.v.

Thanks for pointing that out; this case will be a problem for the patch.
I checked with optimize_bitfield_assignment_op: it returns very early
since mode1 is not VOIDmode, and this is actually not a "FIELD op= VAL"
operation?

To be honest, I am not quite familiar with this part of the code.  I put the new
function expand_view_convert_to_vec_set just after the to_rtx expansion because
adjust_address will change the V4SImode memory to SImode memory, but I need to
keep the target to_rtx in V4SImode to save the vector after calling
rs6000_vector_set_var, so it seems paradoxical here?

 p to_rtx
$264 = (rtx_def *) (mem/c:V4SI (reg/f:DI 112 virtual-stack-vars) [1 D.3186+0 
S16 A128])

=> to_rtx = adjust_address (to_rtx, mode1, 0);

p to_rtx
$265 = (rtx_def *) (mem/c:SI (reg/f:DI 112 virtual-stack-vars) [1 D.3186+0 S4 
A128])


> 
> Are we just interested in the case were we store to a
> pseudo or also when the destination is memory?  I guess
> only when it's a pseudo - correct?  In that case
> handling this all in optimize_bitfield_assignment_op
> is probably the best thing to try.
> 
> Note we possibly refrain from assigning a pseudo to
> such vector because we see a variable array-ref to it.

It seems to be not only pseudos: for example, for "v = vec_insert (i, v, n);"
the vector variable will be stored to the stack first, and then [r112:DI] is a
memory to be processed here.  So the patch loads it from the stack (insn #10) into a
temp vector register first, and stores it to the stack again (insn #24) after
rs6000_vector_set_var.

optimized:

D.3185 = v_3(D);
_1 = n_5(D) & 3;
VIEW_CONVERT_EXPR(D.3185)[_1] = i_6(D);
v_8 = D.3185;
return v_8;

=> expand without the patch:

2: r119:V4SI=%2:V4SI
3: r120:DI=%5:DI
4: r121:DI=%6:DI
5: NOTE_INSN_FUNCTION_BEG
8: [r112:DI]=r119:V4SI

9: r122:DI=r121:DI&0x3
   10: r123:DI=r122:DI<<0x2
   11: r124:DI=r112:DI+r123:DI
   12: [r124:DI]=r120:DI#0

   13: r126:V4SI=[r112:DI]
   14: r118:V4SI=r126:V4SI
   18: %2:V4SI=r118:V4SI
   19: use %2:V4SI

=> expand with the patch (replace #9~#12 to #10~#24):

2: r119:V4SI=%2:V4SI
3: r120:DI=%5:DI
4: r121:DI=%6:DI
5: NOTE_INSN_FUNCTION_BEG
8: [r112:DI]=r119:V4SI
9: r122:DI=r121:DI&0x3

   10: r123:V4SI=[r112:DI] // load from stack
   11: {r125:SI=0x3-r122:DI#0;clobber ca:SI;}
   12: r125:SI=r125:SI<<0x2
   13: {r125:SI=0x14-r125:SI;clobber ca:SI;}
   14: r128:DI=unspec[`*.LC0',%2:DI] 47
  REG_EQUAL `*.LC0'
   15: r127:V2DI=[r128:DI]
  REG_EQUAL const_vector
   16: r126:V16QI=r127:V2DI#0
   17: r129:V16QI=unspec[r120:DI#0] 61
   18: r130:V16QI=unspec[r125:SI] 151
   19: r131:V16QI=unspec[r129:V16QI,r129:V16QI,r130:V16QI] 232
   20: r132:V16QI=unspec[r126:V16QI,r126:V16QI,r130:V16QI] 232
   21: r124:V16QI=r123:V4SI#0
   22: r124:V16QI={(r132:V16QI!=const_vector)?r131:V16QI:r124:V16QI}
   23: r123:V4SI=r124:V16QI#0
   24: [r112:DI]=r123:V4SI   // store to stack.

   25: r134:V4SI=[r112:DI]
   26: r118:V4SI=r134:V4SI
   30: %2:V4SI=r118:V4SI
   31: use %2:V4SI


Thanks,
Xionghu


[PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-06 Thread luoxhu via Gcc-patches

Hi,

On 2020/9/4 18:23, Segher Boessenkool wrote:

diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 03b00738a5e..00c65311f76 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
/* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
@@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree 
fndecl,
   SET_EXPR_LOCATION (stmt, loc);
   stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
 }
-
-  innerptrtype = build_pointer_type (arg1_inner_type);
-
-  stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-  stmt = convert (innerptrtype, stmt);
-  stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-  stmt = build_indirect_ref (loc, stmt, RO_NULL);
-  stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-convert (TREE_TYPE (stmt), arg0));
+  stmt = build_array_ref (loc, stmt, arg2);
+  stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
return stmt;
  }

You should make a copy of the vector, not modify the original one in
place?  (If I read that correctly, heh.)  Looks good otherwise.



Segher, there is already existing code to make a copy of the vector, as we
discussed offline.  Thanks for the reminder.

cat pr79251.c.006t.gimple
__attribute__((noinline))
test (__vector signed int v, int i, size_t n)
{
 __vector signed int D.3192;
 __vector signed int D.3194;
 __vector signed int v1;
 v1 = v;
 D.3192 = v1;
 _1 = n & 3;
 VIEW_CONVERT_EXPR(D.3192)[_1] = i;
 v1 = D.3192;
 D.3194 = v1;
 return D.3194;
}

Attached is the v2 patch, which does:
1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n%4] = i to
unify the gimple code, so the expander can use vec_set_optab to expand it.
2) Recognize the pattern in the expander and use optabs to expand the
VIEW_CONVERT_EXPR vec_insert with variable index to fast instructions:
lvsl+xxperm+xxsel.
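
For reference, the kind of test this targets is sketched below (my sketch along
the lines of the pr79251 test; the actual testsuite file may differ):

#include <altivec.h>
#include <stddef.h>

/* Variable-index insert: with this patch, both the builtin form and the
   "v[n % 4] = i" subscript form lower to the same VIEW_CONVERT_EXPR gimple
   and are expanded through vec_set_optab instead of going through a stack
   store and reload.  */
__attribute__ ((noinline)) vector signed int
test (vector signed int v, int i, size_t n)
{
  return vec_insert (i, v, n);
}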


Thanks,
Xionghu
From 9a7a47b086579e26ae13f378732cea067b64a4e6 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo 
Date: Wed, 19 Aug 2020 03:54:17 -0500
Subject: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple
 [PR79251]

vec_insert accepts 3 arguments: arg0 is the input vector, arg1 is the value
to be inserted, arg2 is the place to insert arg1 into arg0.  The current expander
generates stxv+stwx+lxv if arg2 is a variable instead of a constant, which
causes a serious store-hit-load performance issue on Power.  This patch tries to
 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n%4] = i to
unify the gimple code, so the expander can use vec_set_optab to expand it.
 2) Recognize the pattern in the expander and use optabs to expand the
VIEW_CONVERT_EXPR vec_insert with variable index to fast instructions:
lvsl+xxperm+xxsel.
In this way, "vec_insert (i, v, n)" and "v[n%4] = i" won't be expanded too
early in the gimple stage if arg2 is a variable, avoiding generating
store-hit-load instructions.

For Power9 V4SI:
addi 9,1,-16
rldic 6,6,2,60
stxv 34,-16(1)
stwx 5,9,6
lxv 34,-16(1)
=>
addis 9,2,.LC0@toc@ha
addi 9,9,.LC0@toc@l
mtvsrwz 33,5
lxv 32,0(9)
sradi 9,6,2
addze 9,9
sldi 9,9,2
subf 9,9,6
subfic 9,9,3
sldi 9,9,2
subfic 9,9,20
lvsl 13,0,9
xxperm 33,33,45
xxperm 32,32,45
xxsel 34,34,33,32

Though the instruction count increases from 5 to 15, the performance is improved
by 60% in typical cases.
Tested with V2DI, V2DF, V4SI, V4SF, V8HI, V16QI on Power9-LE and
Power8-BE; bootstrap tests pass.

gcc/ChangeLog:

2020-09-07  Xionghu Luo  

* config/rs6000/altivec.md (altivec_lvsl_reg_2): Rename to
 (altivec_lvsl_reg_2) and extend to SDI mode.
* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
Adjust variable index vec_insert to VIEW_CONVERT_EXPR.
* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var):
New declare.
* config/rs6000/rs6000.c (rs6000_expand_vector_set_var):
New function.
* config/rs6000/rs6000.md (FQHS): New mode iterator.
(FD): New mode iterator.
p8_mtvsrwz_v16qi2: New define_insn.
p8_mtvsrd_v16qi2: New define_insn.
* config/rs6000/vector.md: Add register operand2 match for
vec_set index.
* config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2.
* expr.c (expand_view_convert_to_vec_set): New function.
(expand_assignment): Call expand_view_convert_to_vec_set.

gcc/testsuite/ChangeLog:

2020-09-07  Xionghu Luo  

* gcc.target/powerpc/pr79251.c: New test.
* gcc.target/powerpc/pr79251-run.c: New test.
* gcc.target/powerpc/pr79251.h: New header.
---
 gcc/config/rs6000/altivec.md  |   4 +-
 gcc/config/rs6000/rs6000-c.c  |  22 ++-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches



On 2020/9/4 15:23, Richard Biener wrote:
> On Fri, Sep 4, 2020 at 9:19 AM Richard Biener
>  wrote:
>>
>> On Fri, Sep 4, 2020 at 8:38 AM luoxhu  wrote:
>>>
>>>
>>>
>>> On 2020/9/4 14:16, luoxhu via Gcc-patches wrote:
>>>> Hi,
>>>>
>>>>
>>>> Yes, I checked and found that both vec_set and vec_extract doesn't support
>>>> variable index for most targets, store_bit_field_1 and extract_bit_field_1
>>>> would only consider use optabs when index is integer value.  Anyway, it
>>>> shouldn't be hard to extend depending on target requirements.
>>>>
>>>> Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with
>>>> different gimple code:
>>>>
>>>> {
>>>> _1 = n & 3;
>>>> VIEW_CONVERT_EXPR(v1)[_1] = i;
>>>> }
>>>>
>>>> vs:
>>>>
>>>> {
>>>> __vector signed int v1;
>>>> __vector signed int D.3192;
>>>> long unsigned int _1;
>>>> long unsigned int _2;
>>>> int * _3;
>>>>
>>>>  [local count: 1073741824]:
>>>> D.3192 = v_4(D);
>>>> _1 = n_7(D) & 3;
>>>> _2 = _1 * 4;
>>>> _3 =  + _2;
>>>> *_3 = i_8(D);
>>>> v1_10 = D.3192;
>>>> return v1_10;
>>>> }
>>>
>>> Just realized use convert_vector_to_array_for_subscript would generate
>>> "VIEW_CONVERT_EXPR(v1)[_1] = i;" before produce those instructions,
>>>your confirmation and comments will be highly appreciated...  Thanks in
>>> advance. :)
>>
>> I think what the GCC vector extensions produce is generally better
>> so wherever "code generation" for vec_insert resides it should be
>> adjusted to produce the same code.  Same for vec_extract.
> 
> Guess altivec.h, dispatching to __builtin_vec_insert.  Wonder why it wasn't
> 
> #define vec_insert(a,b,c) (a)[c]=(b)
> 
> anyway, you obviously have some lowering of the builtin somewhere in rs6000.c
> and thus can adjust that.
> 

Yes, altivec.h uses that style for all vector functions, not sure why.
But this could be adjusted by the patch below during front-end parsing,
which could also generate "VIEW_CONVERT_EXPR(D.3192)[_1] = i;"
in gimple, so both v[n&3]=i and vec_insert(v, i, n) could use optabs
in the expander:


diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 03b00738a5e..00c65311f76 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
   /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
@@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree 
fndecl,
  SET_EXPR_LOCATION (stmt, loc);
  stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
}
-
-  innerptrtype = build_pointer_type (arg1_inner_type);
-
-  stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-  stmt = convert (innerptrtype, stmt);
-  stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-  stmt = build_indirect_ref (loc, stmt, RO_NULL);
-  stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-convert (TREE_TYPE (stmt), arg0));
+  stmt = build_array_ref (loc, stmt, arg2);
+  stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
   stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
   return stmt;
 }


Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches




On 2020/9/4 14:16, luoxhu via Gcc-patches wrote:

Hi,


Yes, I checked and found that both vec_set and vec_extract doesn't support
variable index for most targets, store_bit_field_1 and extract_bit_field_1
would only consider use optabs when index is integer value.  Anyway, it
shouldn't be hard to extend depending on target requirements.

Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with
different gimple code:

{
_1 = n & 3;
VIEW_CONVERT_EXPR(v1)[_1] = i;
}

vs:

{
   __vector signed int v1;
   __vector signed int D.3192;
   long unsigned int _1;
   long unsigned int _2;
   int * _3;

[local count: 1073741824]:
   D.3192 = v_4(D);
   _1 = n_7(D) & 3;
   _2 = _1 * 4;
   _3 =  + _2;
   *_3 = i_8(D);
   v1_10 = D.3192;
   return v1_10;
}


Just realized that using convert_vector_to_array_for_subscript would generate
"VIEW_CONVERT_EXPR(v1)[_1] = i;" before producing those instructions;
your confirmation and comments would be highly appreciated...  Thanks in
advance. :)


Xionghu



If not use builtin for "vec_insert(v, i, n)", the pointer is "int*" instead
of vector type, will this be difficult for expander to capture so many
statements then call the optabs?  So shall we still keep the builtin style
for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs or expand both
with optabs???

Drafted a fast patch to expand "v[n&3]=i" with optabs as below, sorry that not
using existed vec_set yet as not quite sure, together with the first patch, both
cases could be handled as expected:


[PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index

v[n%4] = i has same semantic with vec_insert (i, v, n), but it will be
optimized to "VIEW_CONVERT_EXPR(v1)[_1] = i;" in gimple, this
patch tries to recognize the pattern in expander and use optabs to
expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel.

gcc/ChangeLog:

* config/rs6000/vector.md:
* expr.c (expand_assignment):
* optabs.def (OPTAB_CD):
---
  gcc/config/rs6000/vector.md | 13 +++
  gcc/expr.c  | 46 +
  gcc/optabs.def  |  1 +
  3 files changed, 60 insertions(+)

diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..46d21271e17 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1244,6 +1244,19 @@ (define_expand "vec_extract"
DONE;
  })

+(define_expand "vec_insert"
+  [(match_operand:VEC_E 0 "vlogical_operand")
+   (match_operand: 1 "register_operand")
+   (match_operand 2 "register_operand")]
+  "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
+{
+  rtx target = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]);
+  rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, 
V16QImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], sub_target));
+  DONE;
+})
+
  ;; Convert double word types to single word types
  (define_expand "vec_pack_trunc_v2df"
[(match_operand:V4SF 0 "vfloat_operand")
diff --git a/gcc/expr.c b/gcc/expr.c
index dd2200ddea8..ce2890c1a2d 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal)

to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE);

+  tree type = TREE_TYPE (to);
+  if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type))
+ && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
+ && tree_to_uhwi (TYPE_SIZE (type))
+* tree_to_uhwi (TYPE_SIZE_UNIT (type))
+  == 128)
+   {
+ tree op0 = TREE_OPERAND (to, 0);
+ tree op1 = TREE_OPERAND (to, 1);
+ if (TREE_CODE (op0) == VIEW_CONVERT_EXPR)
+   {
+ tree view_op0 = TREE_OPERAND (op0, 0);
+ mode = TYPE_MODE (TREE_TYPE (view_op0));
+ if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE)
+   {
+ rtx value
+   = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ rtx pos
+   = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ rtx temp_target = gen_reg_rtx (mode);
+ emit_move_insn (temp_target, to_rtx);
+
+ machine_mode outermode = mode;
+ scalar_mode innermode = GET_MODE_INNER (outermode);
+ class expand_operand ops[3];
+ enum insn_code icode
+   = convert_optab_handler (vec_insert_optab, innermode,
+outermode);
+
+ if (icode != CODE_FOR_nothing)
+   {
+ pos = convert_to_mode (E_SImode, pos, 0);
+
+   

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/3 18:29, Richard Biener wrote:
> On Thu, Sep 3, 2020 at 11:20 AM luoxhu  wrote:
>>
>>
>>
>> On 2020/9/2 17:30, Richard Biener wrote:
>>>> so maybe bypass convert_vector_to_array_for_subscript for special 
>>>> circumstance
>>>> like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert 
>>>> builtin
>>>> call a relative simpler method?
>>> I think you have it backward.  You need to work with what
>>> convert_vector_to_array_for_subscript
>>> gives and deal with it during RTL expansion / optimization to generate
>>> more optimal
>>> code for power.  The goal is to have as little target specific
>>> builtins during the GIMPLE
>>> optimization phase (because we cannot work out its semantics in optimizers).
>>
>> OK, got it, will add optabs vec_insert and expand 
>> "VIEW_CONVERT_EXPR(v1)[_1] = i_6(D);"
>> expressions to rs6000_expand_vector_insert instead of builtin call.
>> vec_extract already has optabs and "i = v[n%4]" should be in another patch
>> after this.
> 
> There is already vec_set and vec_extract - the question is whether the 
> expander
> tries those for variable index.
> 

Yes, I checked and found that neither vec_set nor vec_extract supports a
variable index for most targets; store_bit_field_1 and extract_bit_field_1
only consider using optabs when the index is a constant integer value.  Anyway, it
shouldn't be hard to extend depending on target requirements.

Another problem is that v[n&3]=i and vec_insert(v, i, n) generate
different gimple code:

{
_1 = n & 3;
VIEW_CONVERT_EXPR(v1)[_1] = i;
}

vs:

{
  __vector signed int v1;
  __vector signed int D.3192;
  long unsigned int _1;
  long unsigned int _2;
  int * _3;

   [local count: 1073741824]:
  D.3192 = v_4(D);
  _1 = n_7(D) & 3;
  _2 = _1 * 4;
  _3 = &D.3192 + _2;
  *_3 = i_8(D);
  v1_10 = D.3192;
  return v1_10;
}

If we do not use a builtin for "vec_insert(v, i, n)", the pointer is "int*" instead
of a vector type; will it be difficult for the expander to capture so many
statements and then call the optabs?  So shall we still keep the builtin style
for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs, or expand both
with optabs?

Drafted a quick patch to expand "v[n&3]=i" with optabs as below (sorry for not
using the existing vec_set yet, as I was not quite sure); together with the first
patch, both cases could be handled as expected:


[PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index

v[n%4] = i has the same semantics as vec_insert (i, v, n), but it will be
optimized to "VIEW_CONVERT_EXPR(v1)[_1] = i;" in gimple; this
patch tries to recognize the pattern in the expander and use optabs to
expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel.

gcc/ChangeLog:

* config/rs6000/vector.md:
* expr.c (expand_assignment):
* optabs.def (OPTAB_CD):
---
 gcc/config/rs6000/vector.md | 13 +++
 gcc/expr.c  | 46 +
 gcc/optabs.def  |  1 +
 3 files changed, 60 insertions(+)

diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..46d21271e17 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1244,6 +1244,19 @@ (define_expand "vec_extract"
   DONE;
 })
 
+(define_expand "vec_insert"
+  [(match_operand:VEC_E 0 "vlogical_operand")
+   (match_operand: 1 "register_operand")
+   (match_operand 2 "register_operand")]
+  "VECTOR_MEM_ALTIVEC_OR_VSX_P (mode)"
+{
+  rtx target = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]);
+  rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, 
V16QImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], sub_target));
+  DONE;
+})
+
 ;; Convert double word types to single word types
 (define_expand "vec_pack_trunc_v2df"
   [(match_operand:V4SF 0 "vfloat_operand")
diff --git a/gcc/expr.c b/gcc/expr.c
index dd2200ddea8..ce2890c1a2d 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal)
 
   to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE);
 
+  tree type = TREE_TYPE (to);
+  if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type))
+ && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
+ && tree_to_uhwi (TYPE_SIZE (type))
+* tree_to_uhwi (TYPE_SIZE_UNIT (type))
+  == 128)
+   {
+ tree op0 = TREE_OPERAND (to, 0);
+ tree op1 = TREE_OPERAND (to, 1);
+ if (TREE_CODE (op0) == V

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-03 Thread luoxhu via Gcc-patches



On 2020/9/2 17:30, Richard Biener wrote:
>> so maybe bypass convert_vector_to_array_for_subscript for special 
>> circumstance
>> like "i = v[n%4]" or "v[n&3]=i" to generate vec_extract or vec_insert builtin
>> call a relative simpler method?
> I think you have it backward.  You need to work with what
> convert_vector_to_array_for_subscript
> gives and deal with it during RTL expansion / optimization to generate
> more optimal
> code for power.  The goal is to have as little target specific
> builtins during the GIMPLE
> optimization phase (because we cannot work out its semantics in optimizers).

OK, got it, will add optabs vec_insert and expand 
"VIEW_CONVERT_EXPR(v1)[_1] = i_6(D);"
expressions to rs6000_expand_vector_insert instead of builtin call.  
vec_extract already has optabs and "i = v[n%4]" should be in another patch
after this.


Thanks,
Xionghu



Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-02 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 21:07, Richard Biener wrote:
> On Tue, Sep 1, 2020 at 10:11 AM luoxhu via Gcc-patches
>  wrote:
>>
>> Hi,
>>
>> On 2020/9/1 01:04, Segher Boessenkool wrote:
>>> Hi!
>>>
>>> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote:
>>>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
>>>> to be insert, arg2 is the place to insert arg1 to arg0.  This patch adds
>>>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to
>>>> not expand too early in gimple stage if arg2 is variable, to avoid generate
>>>> store hit load instructions.
>>>>
>>>> For Power9 V4SI:
>>>>   addi 9,1,-16
>>>>   rldic 6,6,2,60
>>>>   stxv 34,-16(1)
>>>>   stwx 5,9,6
>>>>   lxv 34,-16(1)
>>>> =>
>>>>   addis 9,2,.LC0@toc@ha
>>>>   addi 9,9,.LC0@toc@l
>>>>   mtvsrwz 33,5
>>>>   lxv 32,0(9)
>>>>   sradi 9,6,2
>>>>   addze 9,9
>>>>   sldi 9,9,2
>>>>   subf 9,9,6
>>>>   subfic 9,9,3
>>>>   sldi 9,9,2
>>>>   subfic 9,9,20
>>>>   lvsl 13,0,9
>>>>   xxperm 33,33,45
>>>>   xxperm 32,32,45
>>>>   xxsel 34,34,33,32
>>>
>>> For v a V4SI, x a SI, j some int,  what do we generate for
>>> v[j&3] = x;
>>> ?
>>> This should be exactly the same as we generate for
>>> vec_insert(x, v, j);
>>> (the builtin does a modulo 4 automatically).
>>
>> No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated 
>> currently.
>> Is it feasible and acceptable to expand some kind of pattern in expander 
>> directly without
>> builtin transition?
>>
>> I borrowed some of implementation from vec_extract.  For vec_extract, the 
>> issue also exists:
>>
>> source: gimple:  
>>expand:asm:
>> 1) i = vec_extract (v, n);  =>  i = __builtin_vec_ext_v4si (v, n);   => 
>> {r120:SI=unspec[r118:V4SI,r119:DI] 134;...} => slwi 9,6,2   vextuwrx 
>> 3,9,2
>> 2) i = vec_extract (v, 3);  =>  i = __builtin_vec_ext_v4si (v, 3);   => 
>> {r120:SI=vec_select(r118:V4SI,parallel)...} =>  li 9,12  vextuwrx 
>> 3,9,2
>> 3) i = v[n%4];   =>  _1 = n & 3;  i = VIEW_CONVERT_EXPR(v)[_1];  =>  
>>   ...=> stxv 34,-16(1);addi 9,1,-16; rldic 5,5,2,60; lwax 
>> 3,9,5
>> 4) i = v[3]; =>  i = BIT_FIELD_REF ;  =>  
>> {r120:SI=vec_select(r118:V4SI,parallel)...} => li 9,12;   vextuwrx 3,9,2
> 
> Why are 1) and 2) handled differently than 3)/4)?

1) and 2) call the builtin function vec_extract; it is defined as
 __builtin_vec_extract and will be resolved to ALTIVEC_BUILTIN_VEC_EXTRACT
 by resolve_overloaded_builtin, generating a call to __builtin_vec_ext_v4si
 that is expanded only at RTL.
3) accesses variable v as an array type with opcode VIEW_CONVERT_EXPR; I
 guess we should also generate a builtin call instead of calling
 convert_vector_to_array_for_subscript to generate the VIEW_CONVERT_EXPR
 expression for such a usage.
4) is translated to BIT_FIELD_REF with constant bitstart and bitsize;
variable v could also be accessed in a register instead of on the stack, so optabs
could match the rs6000_expand_vector_insert to generate the expected instruction
through extract_bit_field.

> 
>> Case 3) also couldn't handle the similar usage, and case 4) doesn't generate 
>> builtin as expected,
>> it just expand to vec_select by coincidence.  So does this mean both 
>> vec_insert and vec_extract
>> and all other similar vector builtins should use IFN as suggested by Richard 
>> Biener, to match the
>> pattern in gimple and expand both constant and variable index in expander?  
>> Will this also be
>> beneficial for other targets except power?  Or we should do that gradually 
>> after this patch
>> approved as it seems another independent issue?  Thanks:)
> 
> If the code generated for 3)/4) isn't optimal you have to figure why
> by tracing the RTL
> expansion code and looking for missing optabs.
> 
> Consider the amount of backend code you need to write if ever using
> those in constexpr
> context ...

It seems too complicated to expand the "i = VIEW_CONVERT_EXPR(v)[_1];"
or "VIEW_CONVERT_EXPR(v1)[_1] = i_6(D);" to
rs6000_expand_vector_inser

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-01 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 00:47, will schmidt wrote:
>> +  tmode = TYPE_MODE (TREE_TYPE (arg0));
>> +  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
>> +  mode2 = TYPE_MODE ((TREE_TYPE (arg2)));
>> +  gcc_assert (VECTOR_MODE_P (tmode));
>> +
>> +  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
>> +  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
>> +  op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL);
>> +
>> +  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
>> +op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
>> +
>> +  op0 = force_reg (tmode, op0);
>> +  op1 = force_reg (mode1, op1);
>> +  op2 = force_reg (mode2, op2);
>> +
>> +  target = gen_reg_rtx (V16QImode);
> Should that be tmode, or is V16QImode always correct here?


Thanks for the review.
Yes, the target should be TImode here, but the following call to
rs6000_expand_vector_insert needs a lot of emit_insns in it;
using V16QI could reuse most of the patterns in the existing md files.
After returning from this function, there will be a conversion from
V16QImode to TImode to make the types match:

expr.c: convert_move (target, temp, TYPE_UNSIGNED (TREE_TYPE (exp)));

and I've tested this with V2DI, V2DF, V4SI, V4SF, V8HI, V16QI on
Power9-LE and Power8-BE; the correctness of the results is ensured.
The other comments are addressed.  Will update it later if there are no
disagreements about the implementation.


Thanks,
Xionghu

> 
>> +  rs6000_expand_vector_insert (target, op0, op1, op2);
>> +
>> +  return target;
>> +}


Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-01 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 01:04, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote:
>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
>> to be insert, arg2 is the place to insert arg1 to arg0.  This patch adds
>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to
>> not expand too early in gimple stage if arg2 is variable, to avoid generate
>> store hit load instructions.
>>
>> For Power9 V4SI:
>>  addi 9,1,-16
>>  rldic 6,6,2,60
>>  stxv 34,-16(1)
>>  stwx 5,9,6
>>  lxv 34,-16(1)
>> =>
>>  addis 9,2,.LC0@toc@ha
>>  addi 9,9,.LC0@toc@l
>>  mtvsrwz 33,5
>>  lxv 32,0(9)
>>  sradi 9,6,2
>>  addze 9,9
>>  sldi 9,9,2
>>  subf 9,9,6
>>  subfic 9,9,3
>>  sldi 9,9,2
>>  subfic 9,9,20
>>  lvsl 13,0,9
>>  xxperm 33,33,45
>>  xxperm 32,32,45
>>  xxsel 34,34,33,32
> 
> For v a V4SI, x a SI, j some int,  what do we generate for
>v[j&3] = x;
> ?
> This should be exactly the same as we generate for
>vec_insert(x, v, j);
> (the builtin does a modulo 4 automatically).

No, even with my patch, "stxv 34,-16(1); stwx 5,9,6; lxv 34,-16(1)" is generated
currently.
Is it feasible and acceptable to expand some kind of pattern in the expander
directly, without the builtin transition?

I borrowed some of the implementation from vec_extract.  For vec_extract, the issue
also exists:

   source                      gimple                                          expand                                           asm
1) i = vec_extract (v, n);  =>  i = __builtin_vec_ext_v4si (v, n);          =>  {r120:SI=unspec[r118:V4SI,r119:DI] 134;...}  =>  slwi 9,6,2; vextuwrx 3,9,2
2) i = vec_extract (v, 3);  =>  i = __builtin_vec_ext_v4si (v, 3);          =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12; vextuwrx 3,9,2
3) i = v[n%4];              =>  _1 = n & 3;  i = VIEW_CONVERT_EXPR(v)[_1];  =>  ...                                          =>  stxv 34,-16(1); addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5
4) i = v[3];                =>  i = BIT_FIELD_REF ;                         =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12; vextuwrx 3,9,2

Case 3) also can't handle the similar usage, and case 4) doesn't generate a
builtin as expected; it just expands to vec_select by coincidence.  So does
this mean that both vec_insert and vec_extract and all other similar vector
builtins should use an IFN, as suggested by Richard Biener, to match the
pattern in gimple and expand both constant and variable indexes in the
expander?  Will this also be beneficial for targets other than Power?  Or
should we do that gradually after this patch is approved, as it seems to be
another independent issue?  Thanks :)


Xionghu


Re: [PATCH] ipa-inline: Improve growth accumulation for recursive calls

2020-08-14 Thread luoxhu via Gcc-patches
Hi,

On 2020/8/13 20:52, Jan Hubicka wrote:
>> Since there are no other callers outside of these specialized nodes, the
>> guessed profile count should be same equal?  Perf tool shows that even
>> each specialized node is called only once, none of them take same time for
>> each call:
>>
>>40.65%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.4
>>16.31%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.3
>>10.91%  exchange2_gcc.o  libgfortran.so.5.0.0 [.] 
>> _gfortran_mminloc0_4_i4
>> 5.41%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.6
>> 4.68%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __logic_MOD_new_solver
>> 3.76%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.5
>> 1.07%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.7
>> 0.84%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_brute.constprop.0
>> 0.47%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.2
>> 0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.1
>> 0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_covered.constprop.0
>> 0.11%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_reflected.constprop.0
>> 0.00%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_brute.constprop.1
>>
>>
>> digits_2.constprop.4 & digits_2.constprop.3 takes most of the execution time,
>> So profile count and frequency seem not very helpful for this case?
> Yep, you can not really determine the time spent on each of recursion
> levels from the recursive edge probability since you can not assume it
> to be the same on each level of recursion (here we know it is 0 at level
> 10 and it is slowly dropping as the recursion increases because there
> are fewer posiblities to fill up the partly filled sudoku:).
> Especially you can not assume it after the ipa-cp did the work to figure
> out that there is recursion depth counter that affect the function.
> 
> Thinking of the ipa-cp profile updating changes, I did not consider safe
> the approach of adding extra weight to the recursive call. The problem
> is that the recursive call frequency estimate, when comming from static
> profile stimation, is likely completely off, just like static profile
> estimator can not be used to predict realistically the number of
> iterations of loops.  However even with profile feedback this is hard to
> interpret.
> 
> I was wondering if we can not simply detect this scenario and avoid
> using the recursive edge probability in the profile updating.
> We could simply scale by the number of copies.
> This means that if we produce 10 clones, we could just set each clone to
> 1/10th of the frequency.  This implies that the hottest spot will not be
> underestimated expontentially much as can happen now, but just 10 times
> at worst, wich is probably still OK. We play similar games in loop
> optimizers: static profile estimators expect every loop to trip around 3
> times so unroller/vectorizer/etc would make no sense on them.
> 
> By scaling down according to number of copies we will keep the
> frequencies of calls to function called from clones "sane".  We will
> still run into problems when we inline one clone to another (sine its
> proflie will get scaled by the recursive edge frequency), but perhaps
> that can be special cases in profiler or make ipa-cp to adjust the
> recursive edges to get frequency close to 1 as a hack.
> 
> This is not pretty, but at least it looks safer to me than the original
> profile updating patch adding extra weight to the frequency.


I really appreciate your detailed explanation.  BTW, my previous patch
for the PGO build of exchange2 took a similar approach by setting each cloned
node to 1/10th of the frequency several months ago :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-June/546926.html

Xionghu


Re: [PATCH] ipa-inline: Improve growth accumulation for recursive calls

2020-08-13 Thread luoxhu via Gcc-patches
Hi,

On 2020/8/13 01:53, Jan Hubicka wrote:
> Hello,
> with Martin we spent some time looking into exchange2 and my
> understanding of the problem is the following:
> 
> There is the self recursive function digits_2 with the property that it
> has 10 nested loops and calls itself from the innermost.
> Now we do not do amazing job on guessing the profile since it is quite
> atypical. First observation is that the callback frequencly needs to be
> less than 1 otherwise the program never terminates, however with 10
> nested loops one needs to predict every loop to iterate just few times
> and conditionals guarding them as not very likely. For that we added
> PRED_LOOP_GUARD_WITH_RECURSION some time ago and I fixed it yesterday
> (causing regression in exhange since the bad profile turned out to
> disable some harmful vectorization) and I also now added a cap to the
> self recursive frequency so things to not get mispropagated by ipa-cp.

Thanks for the information :)  Tamar replied that there is another
regression on exchange2 of 11%.  I've also rebased my code and confirmed
it really is getting even slower than before (reverting the patch could pull the
performance back)...

> 
> Now if ipa-cp decides to duplicate digits few times we have a new
> problem.  The tree of recursion is orgnaized in a way that the depth is
> bounded by 10 (which GCC does not know) and moreover most time is not
> spent on very deep levels of recursion.
> 
> For that you have the patch which increases frequencies of recursively
> cloned nodes, however it still seems to me as very specific hack for
> exchange: I do not see how to guess where most of time is spent.
> Even for very regular trees, by master theorem, it depends on very
> little differences in the estimates of recursion frequency whether most
> of time is spent on the top of tree, bottom or things are balanced.

The build is not PGO, so I am not clear how the profile count will affect the
ipa-cp and ipa-inline decisions.
Since there are no other callers outside of these specialized nodes, the
guessed profile counts should be equal?  The perf tool shows that even though
each specialized node is called only once, none of them take the same time for
each call:

  40.65%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.4
  16.31%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.3
  10.91%  exchange2_gcc.o  libgfortran.so.5.0.0 [.] _gfortran_mminloc0_4_i4
   5.41%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.6
   4.68%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] __logic_MOD_new_solver
   3.76%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.5
   1.07%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.7
   0.84%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_brute.constprop.0
   0.47%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.2
   0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.1
   0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_covered.constprop.0
   0.11%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_reflected.constprop.0
   0.00%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_brute.constprop.1


digits_2.constprop.4 & digits_2.constprop.3 take most of the execution time,
so profile count and frequency seem not very helpful for this case?

> 
> With algorithms doing backtracing, like exhchange, the likelyness of
> recusion reduces with deeper recursion level, but we do not know how
> quickly and what the level is.


> 
>> From: Xiong Hu Luo 
>>
>> For SPEC2017 exchange2, there is a large recursive functiondigits_2(function
>> size 1300) generates specialized node from digits_2.1 to digits_2.8 with 
>> added
>> build option:
>>
>> --param ipa-cp-eval-threshold=1 --param ipa-cp-unit-growth=80
>>
>> ipa-inline pass will consider inline these nodes called only once, but these
>> large functions inlined too deeply will cause serious register spill and
>> performance down as followed.
>>
>> inlineA: brute (inline digits_2.1, 2.2, 2.3, 2.4) -> digits_2.5 (inline 2.6, 
>> 2.7, 2.8)
>> inlineB: digits_2.1 (inline digits_2.2, 2.3) -> call digits_2.4 (inline 
>> digits_2.5, 2.6) -> call digits_2.7 (inline 2.8)
>> inlineC: brute (inline digits_2) -> call 2.1 -> 2.2 (inline 2.3) -> 2.4 -> 
>> 2.5 -> 2.6 (inline 2.7 ) -> 2.8
>> inlineD: brute -> call digits_2 -> call 2.1 -> call 2.2 -> 2.3 -> 2.4 -> 2.5 
>> -> 2.6 -> 2.7 -> 2.8
>>
>> Performance diff:
>> inlineB is ~25% faster than inlineA;
>> inlineC is ~20% faster than inlineB;
>> inlineD is ~30% faster than inlineC.
>>
>> The master GCC code now generates inline sequence like inlineB, this patch
>> makes the ipa-inline pass behavior like inlineD by:
>>   1) The growth acumulation for recursive 

Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-05 Thread luoxhu via Gcc-patches

Hi Richard,

On 2020/8/3 22:01, Richard Sandiford wrote:

/* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
  
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)


Like I mentioned before, this should be the other way around:

 multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.

OK with that change, thanks, and sorry for the multiple review cycles.


Just had another question: do we want to backport this patch to the gcc-10
and gcc-9 branches? :)


Xionghu




Richard



Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-03 Thread luoxhu via Gcc-patches



On 2020/8/3 22:01, Richard Sandiford wrote:


/* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
  
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)


Like I mentioned before, this should be the other way around:

 multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.

OK with that change, thanks, and sorry for the multiple review cycles.


Appreciate your time and patience in making it better :).
Updated with that change plus the lp64 requirement from Segher, and committed as
r11-2526-g265d817b1eb4644c7a9613ad6920315d98e2e0a4.  Thank you all.
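
For reference, the corrected guard reads roughly as below (a fragment sketch of
the find_shift_sequence context from the patch, with the multiple_p arguments in
the order suggested above; illustrative only, not the verbatim committed hunk):

      /* For the subreg to be valid, the shift amount must be a multiple
         of the outer (new_mode) size in bits, and new_mode must not be
         wider than the stored value.  */
      if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
          && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
        {
          /* Implement the shift as a subreg of the stored RHS.  */
          poly_int64 offset
            = subreg_offset_from_lsb (new_mode, store_mode, shift);
          rtx rhs_subreg
            = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, offset);
          if (rhs_subreg)
            {
              read_reg
                = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
              break;
            }
        }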

Xionghu



Richard



[PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-03 Thread luoxhu via Gcc-patches
Thanks, the v5 update follows the comments:
 1. Move the const_rhs shift out of the loop;
 2. Start the mode iteration from the integer size of read_mode.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below, without a partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]
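
The kind of source this targets is a full store of an aggregate followed by a
read of its high half, roughly like the sketch below (illustrative only; the
actual test added by the patch is gcc.target/powerpc/pr71309.c, and the exact
statement bodies here are assumptions for illustration):

struct path { void *mnt; void *dentry; };
struct nameidata { struct path path; struct path root; };

void *
foo (struct nameidata *nd)
{
  nd->path = nd->root;      /* 16-byte copy: TImode load + full store (stxv).  */
  return nd->path.dentry;   /* 8-byte read of the high part of that store.     */
}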

gcc/ChangeLog:

2020-08-03  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-03  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 78 ++
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +
 2 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..b67b5c2ba35 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
   scalar_int_mode new_mode;
   rtx read_reg = NULL;
 
+  /* If a constant was stored into memory, try to simplify it here,
+ otherwise the cost of the shift might preclude this optimization
+ e.g. at -Os, even when no actual shift will be needed.  */
+  if (store_info->const_rhs)
+{
+  auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+  auto byte = subreg_lowpart_offset (new_mode, store_mode);
+  rtx ret
+   = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+  if (ret && CONSTANT_P (ret))
+   {
+ rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+ ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+shift_rtx);
+ if (ret && CONSTANT_P (ret))
+   {
+ byte = subreg_lowpart_offset (read_mode, new_mode);
+ ret = simplify_subreg (read_mode, ret, new_mode, byte);
+ if (ret && CONSTANT_P (ret)
+ && (set_src_cost (ret, read_mode, speed)
+ <= COSTS_N_INSNS (1)))
+   return ret;
+   }
+   }
+}
+
+  if (require_cst)
+return NULL_RTX;
+
   /* Some machines like the x86 have shift insns for each size of
  operand.  Other machines like the ppc or the ia-64 may only have
  shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
 
   opt_scalar_int_mode new_mode_iter;
   FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+ smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
   if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;
 
-  /* If a constant was stored into memory, try to simplify it here,
-otherwise the cost of the shift might preclude this optimization
-e.g. at -Os, even when no actual shift will be needed.  */
-  if (store_info->const_rhs)
-   {
- poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
- rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
-store_mode, byte);
- if (ret && CONSTANT_P (ret))
-   {
- rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
- ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
-ret, shift_rtx);
- if (ret && CONSTANT_P (ret))
-   {
- byte = subreg_lowpart_offset (read_mode, new_mode);
- ret = simplify_subreg (read_mode, ret, new_mode, byte);
- if (ret && CONSTANT_P (ret)
- && (set_src_cost (ret, read_mode, speed)
- <= COSTS_N_INSNS (1)))
-   return ret;
-   }
-   }
-   }
-
-  if (require_cst)
-   return NULL_RTX;
-
   /* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
   if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, 

Re: [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]

2020-07-28 Thread luoxhu via Gcc-patches

Gentle ping in case this mail was missed.  Thanks :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-July/550602.html

Xionghu


On 2020/7/24 18:47, luoxhu via Gcc-patches wrote:

Hi Richard,

This is the updated version that passes all regression tests on
Power9-LE.
Just need another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)" check
before generating the shift for store_info->const_rhs, to ensure the correct
constant is generated; take testsuite/gfortran1/equiv_2.x for example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg0 and shifting right by 8 bits gives 0x646362 in
insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code, get 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be >= access_size for the const_rhs shift.
It seems a bit more complicated now...



[PATCH v4] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize(works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
   ld 10,16(3)
   mr 9,3
   ld 3,24(3)
   std 10,0(9)
   std 3,8(9)
   blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
  gcc/dse.c  | 25 ++--
  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
  2 files changed, 56 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
   the machine.  */

opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
  {
rtx target, new_reg, new_lhs;
rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
 e.g. at -Os, even when no actual shift will be needed.  */
if (store_info->const_rhs)
{
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;

+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset = subreg_offset_from_lsb (new_mode,
+ store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
new_reg = gen_reg_rtx (new_mode);

start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define 

[PATCH v4] dse: Remove partial load after full store for high part access[PR71309]

2020-07-24 Thread luoxhu via Gcc-patches
Hi Richard,

This is the updated version that passes all regression tests on
Power9-LE.
Just need another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)" check
before generating the shift for store_info->const_rhs, to ensure the correct
constant is generated; take testsuite/gfortran1/equiv_2.x for example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg0 and shifting right by 8 bits gives 0x646362 in
insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code, get 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be >= access_size for the const_rhs shift.
It seems a bit more complicated now...



[PATCH v4] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize(works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 25 ++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
  the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
 e.g. at -Os, even when no actual shift will be needed.  */
   if (store_info->const_rhs)
{
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset = subreg_offset_from_lsb (new_mode,
+ store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+TYPE2 mnt;
+TYPE dentry;
+};
+
+struct nameidata {
+struct path path;
+struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = 

Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-23 Thread luoxhu via Gcc-patches



On 2020/7/23 04:30, Richard Sandiford wrote:
> 
> I now realise the reason is that the starting mode is too wide.
> I think we should fix that by doing:
> 
>FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
>  {
>…
> 
> and then add:
> 
>if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
>  continue;
> 
> after your optimisation, so that the shift code proper still only
> considers modes that are wide enough to hold the unshifted value.
> 
> I don't think there are any efficiency concerns with that, since
> smallest_int_mode_for_size does its own similar iteration internally.
> 
> Sorry for not picking up on that first time.
> 

Thanks:), I didn't make it clear that it starts from TImode first...

The updated patch uses "FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)"
to iterate from QImode now, but it still needs the size check "new_mode <= store_mode"
to get a legal subreg, otherwise "gfortran.dg/shift-kind.f90" will fail.
Even so, there are still the following execution failures in the regression tests:

gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fbounds-check
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 
-fomit-frame-pointer -finline-functions
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 
-fomit-frame-pointer -finline-functions -funroll-loops
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O3 -g
gfortran.fortran-torture/execute/equiv_2.f90 execution, -O2 -ftree-vectorize 
-maltivec 

Any clue why the execution outputs "STOP 2", please? Still investigating.

PS: if the order of GET_MODE_BITSIZE (new_mode) and shift in multiple_p
is switched, it generates incorrect ASM for PR71309.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-23  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-23  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 22 +--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..afc6ad30623 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
  the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1779,6 +1778,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset
+   = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg
+   = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, 
offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+TYPE2 mnt;
+TYPE dentry;
+};
+
+struct nameidata {
+struct path path;
+struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  

Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
>if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>   {
> /* Try to implement the shift using a subreg.  */
> poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
> shift);
> rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>   store_mode, offset);
> if (rhs_subreg)
>   {
> ...
> break;
>   }
>   }
> 
> where the rhs_subreg is from your original patch.
> 
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
> 
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
> 

Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
based on your prototype; shift should be put in op1, as multiple_p requires
op0 >= op1.

Then, new_mode is still TImode, the same as store_mode, and offset will return 8 when
shift is 64, so simplify_gen_subreg needs an additional inner_mode (DImode)
generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg,
otherwise it returns NULL when new_mode is equal to store_mode.

Lastly, move the BITS_PER_UNIT check after multiple_p, as it still needs to generate
shift_seq for other circumstances. :)


[PATCH v3] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 21 --
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
   int cost;
 
   new_mode = new_mode_iter.require ();
-  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-   break;
 
   /* If a constant was stored into memory, try to simplify it here,
 otherwise the cost of the shift might preclude this optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+   {
+ /* Try to implement the shift using a subreg.  */
+ scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+ poly_int64 offset
+   = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg = extract_low_bits (read_mode, inner_mode,
+  copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+   break;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define 

Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/21 23:30, Richard Sandiford wrote:
> Xiong Hu Luo  writes:>> @@ -1872,9 +1872,27 @@ 
> get_stored_val (store_info *store_info, machine_mode read_mode,
>>   {
>> poly_int64 shift = gap * BITS_PER_UNIT;
>> poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>> -  read_reg = find_shift_sequence (access_size, store_info, read_mode,
>> -  shift, optimize_bb_for_speed_p (bb),
>> -  require_cst);
>> +  rtx rhs_subreg = NULL;
>> +
>> +  if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>> +{
>> +  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>> +  poly_uint64 sub_off
>> += ((!BYTES_BIG_ENDIAN)
>> + ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>> + : 0);
>> +
>> +  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>> +store_mode, sub_off);
>> +  if (rhs_subreg)
>> +read_reg
>> +  = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>> +}
>> +
>> +  if (read_reg == NULL)
>> +read_reg
>> +  = find_shift_sequence (access_size, store_info, read_mode, shift,
>> + optimize_bb_for_speed_p (bb), require_cst);
> 
> Did you consider doing this in find_shift_sequence instead?
> ISTM that this is really using subregs to optimise:
> 
>/* In theory we could also check for an ashr.  Ian Taylor knows
>of one dsp where the cost of these two was not the same.  But
>this really is a rare case anyway.  */
>target = expand_binop (new_mode, lshr_optab, new_reg,
>gen_int_shift_amount (new_mode, shift),
>new_reg, 1, OPTAB_DIRECT);
> 
> I think everything up to:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
> is either neutral or helpful for the subreg case too, so maybe
> we could just add the optimisation after that.  (It probably isn't
> worth reusing any of the later loop body code though, since the
> subreg case is much simpler.)
> 
> I don't think we need to restrict this case to modes of size
> shift * 2.  We can just check whether the shift is a multiple of
> the new_mode calculated by find_shift_sequence (using multiple_p).
> 
> An easier way of converting the shift to a subreg byte offset
> is to use subreg_offset_from_lsb, which also handles
> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
> 

Thanks, I've updated the patch by moving it into find_shift_sequence.
Not sure whether it meets your comments precisely, though it still works :).
There is a comment mentioning that

  /* Some machines like the x86 have shift insns for each size of
 operand.  Other machines like the ppc or the ia-64 may only have
 shift insns that shift values within 32 or 64 bit registers.
 This loop tries to find the smallest shift insn that will right
 justify the value we want to read but is available in one insn on
 the machine.  */

So it will break early without some additional check, as the new_mode is
TImode here:

  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;



[PATCH v2] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
18: r122:TI=r119:TI
16: r123:TI#0=r122:TI#8 0>>0
17: r123:TI#8=0
19: r124:DI=r123:TI#0
7: [r118:DI]=r119:TI
8: r121:DI=r124:DI

Final ASM will be as below without partial load after full store(stxv+ld):
  mr 9,3
  ld 3,24(3)
  ld 10,16(3)
  std 3,8(9)
  std 10,0(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression testing on Power9-LE.

For AArch64, one ldr is replaced by mov:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 15 +-
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..e06a9fbb0cd 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 

Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-20 Thread luoxhu via Gcc-patches

On 2020/7/20 23:31, Segher Boessenkool wrote:

On Mon, Jul 13, 2020 at 02:30:28PM +0800, luoxhu wrote:

For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1



* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.


(That fits on one line).


--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */



+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */


I'm not sure that works on older cpus?  Please test there, and add
-mdejagnu-cpu=power8 to the dg-options if needed.  Also test on BE please.

Okay for trunk with those last details taking care of.  Thank you!


Thanks for the reminder.  Addressed the comments and committed in r11-2245.

Xionghu


Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-14 Thread luoxhu via Gcc-patches
Hi David,

On 2020/7/14 22:17, David Edelsohn wrote:
> Unfortunately this patch is eliciting a number of new testsuite
> failures, all like
> 
> error: unrecognizable insn:
> (insn 44 43 45 5 (parallel [
>  (set (reg:SI 199)
>  (unspec:SI [
>  (reg:SF 202)
>  ] UNSPEC_SI_FROM_SF))
>  (clobber (scratch:V4SF))
>  ]) 
> "/nasfarm/edelsohn/src/src/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c":70:1
> -1
>   (nil))
> during RTL pass: vregs
> 
> for
> 
> gcc.dg/vect/vect-alias-check-11.c
> gcc.dg/vect/vect-alias-check-12.c
> gcc.dg/vect/pr57741-2.c
> gcc.dg/vect/pr57741-3.c
> gcc.dg/vect/pr89440.c
> gcc.target/powerpc/sse-movss-1.c

This patch won't match an instruction with a "clobber (scratch:V4SF)";
it only matches "(clobber (match_scratch:DI 2 "=r"))".  I guess you are
replying to the other patch?

"[PATCH 2/2] rs6000: Define define_insn_and_split to split unspec sldi+or to 
rldimi"

Thanks for your fix patch! :)

This patch passed regression testing on Power8-LE.  I re-ran these cases on
Power8-LE and confirmed they pass; what is your platform, please?
BTW, TARGET_NO_SF_SUBREG ensures TARGET_POWERPC64 for this
define_insn_and_split.
Thanks.

Xionghu

> 
> Thanks, David
> 
> On Mon, Jul 13, 2020 at 2:30 AM luoxhu  wrote:
>>
>> Hi,
>>
>> On 2020/7/11 08:54, Segher Boessenkool wrote:
>>> Hi!
>>>
>>> On Fri, Jul 10, 2020 at 09:39:40AM +0800, luoxhu wrote:
>>>> OK, seems the md file needs a format tool too...
>>>
>>> Heh.  Just make sure it looks good (that is, does what it looks like),
>>> looks like the rest, etc.  It's hard to do anything nice with unspecs,
>>> [ ] lists do not format well.
>>>
>>>>>> +  "TARGET_NO_SF_SUBREG"
>>>>>> +  "#"
>>>>>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>>>>>
>>>>> Put this in the insn condition?  And since this is just a predicate,
>>>>> you can just use it instead of gpc_reg_operand.
>>>>>
>>>>> (The split condition becomes "&& 1" then, not "").
>>>>
>>>> OK, this seems a bit strange as movsi_from_sf, movsf_from_si and
>>>> movdi_from_sf_zero_ext all use it as condition...
>>>
>>> Since in your case you *always* split, the split condition should be
>>> "always".  There are no alternatives that do not split here.
>>>
>>>> And why vsx_reg_sfsubreg_ok allows "SF SUBREGS" and TARGET_NO_SF_SUBREG
>>>> "avoid (SUBREG:SF (REG:SI)", I guess they are not the same meaning? (The
>>>> TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.
>>>
>>> Good question.  I do not know.
>>>
>>> Well...  Since this define_insn* requires p8 *anyway*, we do not need
>>> any of these sf_subreg things?  We always know for each one if it should
>>> be true or false.
>>
>> Yes, removed the vsx_reg_sfsubreg_ok check.
>>
>>>
>>>> +  "TARGET_NO_SF_SUBREG"
>>>
>>> But here we should require p8 some other way, then.
>>
>> TARGET_NO_SF_SUBREG is defined to TARGET_DIRECT_MOVE_64BIT, and
>> TARGET_DIRECT_MOVE_64BIT is TARGET_DIRECT_MOVE && TARGET_P8_VECTOR && 
>> TARGET_POWERPC64
>> which means TARGET_P8_VECTOR must be true for TARGET_NO_SF_SUBREG.
>>
>>>
>>>> +  (set_attr "isa" "p8v")])
>>>
>>> (This isn't enough, unfortunately).
>>>
>>
>>
>> Updated patch to removed the vsx_reg_sfsubreg_ok and ICE fix:
>>
>>
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
>> PR67741, etc.
>>
>> srdi 3,3,32
>> sldi 9,3,32
>> mtvsrd 1,9
>> xscvspdpn 1,1
>>
>> =>
>>
>> rldicr 3,3,0,31
>> mtvsrd 1,3
>> xscvspdpn 1,1
>>
>> Bootstrap and regression tested pass on Power8-LE.
>>
>> gcc/ChangeLog:
>>
>> 2020-07-13  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * config/rs6000/rs6000.md (movsf_from_si2): New
>>  define_insn_and_split.
>&

Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-13 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/11 08:54, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, Jul 10, 2020 at 09:39:40AM +0800, luoxhu wrote:
>> OK, seems the md file needs a format tool too...
> 
> Heh.  Just make sure it looks good (that is, does what it looks like),
> looks like the rest, etc.  It's hard to do anything nice with unspecs,
> [ ] lists do not format well.
> 
>>>> +  "TARGET_NO_SF_SUBREG"
>>>> +  "#"
>>>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>>>
>>> Put this in the insn condition?  And since this is just a predicate,
>>> you can just use it instead of gpc_reg_operand.
>>>
>>> (The split condition becomes "&& 1" then, not "").
>>
>> OK, this seems a bit strange as movsi_from_sf, movsf_from_si and
>> movdi_from_sf_zero_ext all use it as condition...
> 
> Since in your case you *always* split, the split condition should be
> "always".  There are no alternatives that do not split here.
> 
>> And why vsx_reg_sfsubreg_ok allows "SF SUBREGS" and TARGET_NO_SF_SUBREG
>> "avoid (SUBREG:SF (REG:SI)", I guess they are not the same meaning? (The
>> TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.
> 
> Good question.  I do not know.
> 
> Well...  Since this define_insn* requires p8 *anyway*, we do not need
> any of these sf_subreg things?  We always know for each one if it should
> be true or false.

Yes, removed the vsx_reg_sfsubreg_ok check.

> 
>> +  "TARGET_NO_SF_SUBREG"
> 
> But here we should require p8 some other way, then.

TARGET_NO_SF_SUBREG is defined to TARGET_DIRECT_MOVE_64BIT, and
TARGET_DIRECT_MOVE_64BIT is TARGET_DIRECT_MOVE && TARGET_P8_VECTOR && 
TARGET_POWERPC64
which means TARGET_P8_VECTOR must be true for TARGET_NO_SF_SUBREG.
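
Roughly, the macro chain described above looks like this (a paraphrase, not the
literal rs6000.h definitions, which may carry extra conditions):

/* Paraphrase of the relation described above.  */
#define TARGET_DIRECT_MOVE_64BIT (TARGET_DIRECT_MOVE   \
                                  && TARGET_P8_VECTOR  \
                                  && TARGET_POWERPC64)
#define TARGET_NO_SF_SUBREG      TARGET_DIRECT_MOVE_64BIT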

> 
>> +  (set_attr "isa" "p8v")])
> 
> (This isn't enough, unfortunately).
> 


Updated patch to removed the vsx_reg_sfsubreg_ok and ICE fix:


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-13  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-13  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 31 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..480385ed4d2 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,37 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
+  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
+  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
+  DONE;
+}
+  [(set_attr "length" "12")
+  (set_attr "type" "vecfloat")
+  (set_attr "isa" "p8v")])
 
 ;; Move 64-bit binary/decimal floating point
 (define_expand "mov"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89310.c 
b/gcc/testsuite/gcc.target/powerpc/pr89310.c
new file mode 100644
index 000..15e78509246
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct s {
+  int i;
+  float f;
+};
+
+float
+foo (struct s arg)
+{
+  return arg.f;
+}
+
+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */
-- 
2.21.0.777.g83232e3864




Re: [PATCH 2/2] rs6000: Define define_insn_and_split to split unspec sldi+or to rldimi

2020-07-12 Thread luoxhu via Gcc-patches




On 2020/7/11 08:28, Segher Boessenkool wrote:

Hi!

On Thu, Jul 09, 2020 at 09:14:45PM -0500, Xiong Hu Luo wrote:

* config/rs6000/rs6000.md (rotl_unspec): New
define_insn_and_split.



+; rldimi with UNSPEC_SI_FROM_SF.
+(define_insn_and_split "*rotl_unspec"


Please have rotldi3_insert in the name.  "unspec" in the name doesn't
really mean much...  Can you put "sf" in the name, instead?  So something
like "*rotldi3_insert_sf"?


--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector_float.c
@@ -0,0 +1,14 @@
+/* { dg-do compile  } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */


This needs p9vector_ok (yes, that name doesn't make too much sense).


+vector float
+test (float *a, float *b, float *c, float *d)
+{
+  return (vector float){*a, *b, *c, *d};
+}
+
+/* { dg-final { scan-assembler-not {\mlxsspx\M} } } */
+/* { dg-final { scan-assembler-not {\mlfs\M} } } */


No lxssp or lfsx either...  or the update forms...

/* { dg-final { scan-assembler-not {\mlxssp} } } */
/* { dg-final { scan-assembler-not {\mlfs} } } */

works fine (there are no other mnemonics starting with those strings).


+/* { dg-final { scan-assembler-times {\mlwz\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */


Okay for trunk with those changes (or post again if you prefer).  Thanks!



Thanks.  The 2 patches are committed to trunk(r11-2043, r11-2044) after
modifications.

Xionghu


Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-10 Thread luoxhu via Gcc-patches



On 2020/7/10 03:25, Segher Boessenkool wrote:
> 
>> +  "TARGET_NO_SF_SUBREG"
>> +  "#"
>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
> 
> Put this in the insn condition?  And since this is just a predicate,
> you can just use it instead of gpc_reg_operand.
> 
> (The split condition becomes "&& 1" then, not "").
> 

+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& 1"
+  [(const_int 0)]

This change is invalid as it will cause an ICE on PR42475.c.  The reason is:
insn #29 gets combined into insn #40; although split1 succeeds, it then
causes an ICE in sched1 because op[0] is not SFmode.  Without this change, #29
won't be combined into #40, as gpc_reg_operand (operands[0], E_SFmode)
makes the match fail for (subreg:SF (reg:SI 155 [ _4 ]) 0).


pr42475.c.268r.combine:
Trying 29 -> 40:
   29: {r120:SF=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
   40: r155:SI#0=r120:SF
  REG_DEAD r120:SF
Successfully matched this instruction:
(set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SI (ashiftrt:DI (reg:DI 133)
(const_int 32 [0x20])) 0)
] UNSPEC_SF_FROM_SI))
allowing combination of insns 29 and 40
original costs 12 + 4 = 16
replacement cost 12
deferring deletion of insn with uid = 29.
modifying insn i340: {r155:SI#0=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
  REG_DEAD r133:DI
deferring rescan insn with uid = 40.


pr42475.c.273r.split1:
69: r172:DI=r133:DI&0x
70: r155:SI#0=unspec[r172:DI] 62
71: r155:SI#0=unspec[r155:SI#0] 103
41: NOTE_INSN_DELETED
42: r157:DI=r155:SI#0<<0x20


pr42475.c.280r.sched1:
pr42475.c: In function ‘bar’:
pr42475.c:20:1: error: unrecognizable insn:
   20 | }
  | ^
(insn 71 70 41 2 (set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SF (reg:SI 155 [ _4 ]) 0)
] UNSPEC_VSX_CVSPDPN)) -1
 (nil))
during RTL pass: sched1
dump file: pr42475.c.280r.sched1
pr42475.c:20:1: internal compiler error: in extract_insn, at recog.c:2294
Please submit a full bug report,
with preprocessed source if appropriate.
See  for instructions.


Versus not using vsx_reg_sfsubreg_ok as the condition:


pr42475.c.268r.combine:
Trying 29 -> 40:
   29: {r120:SF=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
   40: r155:SI#0=r120:SF
  REG_DEAD r120:SF
Failed to match this instruction:
(set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SI (ashiftrt:DI (reg:DI 133)
(const_int 32 [0x20])) 0)
] UNSPEC_SF_FROM_SI))


273r.split1:
69: r172:DI=r133:DI&0x
70: r120:SF=unspec[r172:DI] 62
71: r120:SF=unspec[r120:SF] 103
40: r155:SI#0=r120:SF
REG_DEAD r120:SF


Re: [PATCH 1/2] rs6000: Init V4SF vector without converting SP to DP

2020-07-09 Thread luoxhu via Gcc-patches
Updated the patch to keep the logic for non-TARGET_P8_VECTOR targets.
Please ignore the previous [PATCH 1/2], sorry!


Move V4SF to V4SI, init vector like V4SI and move to V4SF back.
Better instruction sequence could be generated on Power9:

lfs + xxpermdi + xvcvdpsp + vmrgew
=>
lwz + (sldi + or) + mtvsrdd

With the patch followed, it could be continue optimized to:

lwz + rldimi + mtvsrdd

The point is to use lwz to avoid converting single precision to
double precision upon load, and to pack four 32-bit values into one 128-bit
register directly.
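
A minimal example that exercises this path (essentially the vector_float.c test
discussed for the follow-up patch; the altivec.h include is added here just to
make the sketch self-contained):

#include <altivec.h>

vector float
test (float *a, float *b, float *c, float *d)
{
  /* Four independent 32-bit loads packed into one 128-bit register:
     lwz + sldi/or (rldimi with the follow-up patch) + mtvsrdd, instead
     of lfs + xxpermdi + xvcvdpsp + vmrgew.  */
  return (vector float){*a, *b, *c, *d};
}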

gcc/ChangeLog:

2020-07-10  Xionghu Luo  

* config/rs6000/rs6000.c (rs6000_expand_vector_init):
Move V4SF to V4SI, init vector like V4SI and move to V4SF back.
---
 gcc/config/rs6000/rs6000.c | 55 +-
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 58f5d780603..00972fb5165 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6423,29 +6423,48 @@ rs6000_expand_vector_init (rtx target, rtx vals)
}
   else
{
- rtx dbl_even = gen_reg_rtx (V2DFmode);
- rtx dbl_odd  = gen_reg_rtx (V2DFmode);
- rtx flt_even = gen_reg_rtx (V4SFmode);
- rtx flt_odd  = gen_reg_rtx (V4SFmode);
- rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
- rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
- rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
- rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
-
- /* Use VMRGEW if we can instead of doing a permute.  */
  if (TARGET_P8_VECTOR)
{
- emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2));
- emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3));
- emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
- emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
- if (BYTES_BIG_ENDIAN)
-   emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, 
flt_odd));
- else
-   emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, 
flt_even));
+ rtx tmpSF[4];
+ rtx tmpSI[4];
+ rtx tmpDI[4];
+ rtx mrgDI[4];
+ for (i = 0; i < 4; i++)
+   {
+ tmpSI[i] = gen_reg_rtx (SImode);
+ tmpDI[i] = gen_reg_rtx (DImode);
+ mrgDI[i] = gen_reg_rtx (DImode);
+ tmpSF[i] = force_reg (SFmode, XVECEXP (vals, 0, i));
+ emit_insn (gen_movsi_from_sf (tmpSI[i], tmpSF[i]));
+ emit_insn (gen_zero_extendsidi2 (tmpDI[i], tmpSI[i]));
+   }
+
+ if (!BYTES_BIG_ENDIAN)
+   {
+ std::swap (tmpDI[0], tmpDI[1]);
+ std::swap (tmpDI[2], tmpDI[3]);
+   }
+
+ emit_insn (gen_ashldi3 (mrgDI[0], tmpDI[0], GEN_INT (32)));
+ emit_insn (gen_iordi3 (mrgDI[1], mrgDI[0], tmpDI[1]));
+ emit_insn (gen_ashldi3 (mrgDI[2], tmpDI[2], GEN_INT (32)));
+ emit_insn (gen_iordi3 (mrgDI[3], mrgDI[2], tmpDI[3]));
+
+ rtx tmpV2DI = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vsx_concat_v2di (tmpV2DI, mrgDI[1], mrgDI[3]));
+ emit_move_insn (target, gen_lowpart (V4SFmode, tmpV2DI));
}
  else
{
+ rtx dbl_even = gen_reg_rtx (V2DFmode);
+ rtx dbl_odd  = gen_reg_rtx (V2DFmode);
+ rtx flt_even = gen_reg_rtx (V4SFmode);
+ rtx flt_odd  = gen_reg_rtx (V4SFmode);
+ rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
+ rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
+ rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
+ rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
+
  emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
  emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
  emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-- 
2.27.0.90.geebb51ba8c




Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-09 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/10 03:25, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Jul 09, 2020 at 11:09:42AM +0800, luoxhu wrote:
>>> Maybe change it back to just SI?  It won't match often at all for QI or
>>> HI anyway, it seems.  Sorry for that detour.  Should be good with the
>>> above nits fixed :-)
>>
>> OK, if I see correctly, subreg of DImode should be SImode and I used
>> subreg:SI to match only SI, so no need to consider mask for QI and HI? :)
> 
> My problem with it was that the shift amounts won't be 32 for QI etc.?

But if the subreg:SI only accepts SImode coming in, why do we need to handle the
shift amounts for QI/HI?  It is not using QHSI here...

> 
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>> +(unspec:SF [
>> + (subreg:SI (
>> + ashiftrt:DI (
> 
> Opening parens *start* a line, they never end it.  So:
> 
> (define_insn_and_split "movsf_from_si2"
>[(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
> (unspec:SF
>   [(subreg:SI
>  (ashiftrt:DI
>(match_operand:DI 1 "input_operand" "r")
>(const_int 32))
> 0)]
>   UNSPEC_SF_FROM_SI))
> (clobber (match_scratch:DI 2 "=r"))]
> 
> or something like that.  There aren't really any real rules...  The
> important points are that nested things should be indented, and things
> at the same level should have the same indent (like the outer set and
> clobber).  The [ for an unspec is sometimes put at the end of a line,
> that reads a little better perhaps.

OK, seems the md file needs a format tool too...

> 
>> +  "TARGET_NO_SF_SUBREG"
>> +  "#"
>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
> 
> Put this in the insn condition?  And since this is just a predicate,
> you can just use it instead of gpc_reg_operand.
> 
> (The split condition becomes "&& 1" then, not "").

OK, this seems a bit strange as movsi_from_sf, movsf_from_si and
movdi_from_sf_zero_ext all use it as condition...

And why vsx_reg_sfsubreg_ok allows "SF SUBREGS" and TARGET_NO_SF_SUBREG
"avoid (SUBREG:SF (REG:SI)", I guess they are not the same meaning? (The 
TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.


Updated the patch and corrected the title:


Define movsf_from_si2 to extract high part SF element from DImode[PR89310]


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 32 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 49 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..2331c84dcbd 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,38 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBRE

Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-08 Thread luoxhu via Gcc-patches



On 2020/7/9 06:43, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Jul 08, 2020 at 11:19:21AM +0800, luoxhu wrote:
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
>> PR67741, etc.
> 
>> 2020-07-08  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * config/rs6000/rs6000.md (movsf_from_si2): New
>>  define_insn_and_split.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2020-07-08  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * gcc.target/powerpc/pr89310.c: New test.
> 
>> +;; For extracting high part element from DImode register like:
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>> +(unspec:SF [
>> + (subreg:SI (ashiftrt:DI
> 
> Put the  (ashiftrt:DI  on a separate line as well?  With indent changes,
> etc.
> 
>> +   (match_operand:DI 1 "input_operand" "r")
>> +   (const_int 32))
>> +  0)]
>> + UNSPEC_SF_FROM_SI))
>> +(clobber (match_scratch:DI 2 "=r"))]
>> +  "TARGET_NO_SF_SUBREG"
>> +  "@
>> +  #"
> 
> @ with only one alternative doesn't do anything; so just write
>"#"
> please.
> 
>> +
> 
> And no empty line here.
> 
>> +  "&& !reload_completed
> 
> Why this?  Why will this not work after reload?  In the very few cases
> where you do need this, you usually also need to check for
> lra_in_progress.
> 
>> +   && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>> +  [(const_int 0)]
>> +{
>> +  if (GET_CODE (operands[2]) == SCRATCH)
>> +operands[2] = gen_reg_rtx (DImode);
>> +
>> +  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
> 
> The mask should be different for QI and HI.
> 
>> +  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
>> +  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
>> +  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
>> +  DONE;
>> +}
>> +  [(set_attr "length" "12")
>> +  (set_attr "type" "vecfloat")
>> +  (set_attr "isa" "p8v")])
>> +
> 
> No extra whiteline please.
> 
> 
> Maybe change it back to just SI?  It won't match often at all for QI or
> HI anyway, it seems.  Sorry for that detour.  Should be good with the
> above nits fixed :-)

OK, if I see correctly, the subreg of DImode should be SImode, and I used
subreg:SI to match only SI, so there is no need to consider a mask for QI and HI? :)

Others are updated: removed reload_completed and adjusted the format.


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 31 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..a493dfd4596 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,37 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 b

Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-07 Thread luoxhu via Gcc-patches



On 2020/7/8 05:31, Segher Boessenkool wrote:
> Hi!
> 
> On Tue, Jul 07, 2020 at 04:39:58PM +0800, luoxhu wrote:
>>> Lots of questions, sorry!
>>
>> Thanks for the nice suggestions; the initial patch contains many issues :),
> 
> Pretty much all of it should *work*, it just can be improved and
> simplified quite a bit :-)
> 
>> For this case, %1:SF matches with "=wa"?  And how to construct cases to
>> match("=?r", "wa") and ("=!r", "r") combinations, please?
> 
> operands[0], not operands[1]?
> 
> Simple testcases will not put the output into a GPR, unless you force
> the compiler to do that, because of the ? and !.
> 
> Often you can just do
> 
>asm("#" : "+r"(x));
> 
> to force "x" into a GPR at that point of the program.  But there is
> nothing stopping the compiler from copying it back to a VSR where it
> thinks that is cheaper ;-)
> 
> So maybe this pattern should just have the GPR-to-VSR alternative?  It
> does not look like the GPR destination variants are useful?
> 
>> +  rtx op0 = operands[0];
>> +  rtx op1 = operands[1];
>> +  rtx op2 = operands[2];
> 
> (Please just write out operands[N] everywhere).
> 
>> +  if (GET_CODE (operands[2]) == SCRATCH)
>> +op2 = gen_reg_rtx (DImode);
>> +
>> +  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
>> +  emit_insn (gen_anddi3 (op2, op1, mask));
> 
> Groovy :-)
> 
> So, it looks like you can remove the ? and ! alternatives, leaving just
> the first alternative?
> 

Thanks.

V3 update: leave only the GPR-to-VSR alternative and use operands[N].
Bootstrap and regression tests pass on Power8-LE.


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with an "and mask" to avoid generating a shift right by
32 bits followed by a shift left by 32 bits.  This pattern also shows up in
PR42475 and PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 34 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 +++
 2 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..02c5171378c 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,40 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
+   (unspec:SF [
+(subreg:SI (ashiftrt:DI
+  (match_operand:DI 1 "input_operand" "r")
+  (const_int 32))
+ 0)]
+UNSPEC_SF_FROM_SI))
+   (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "@
+  #"
+
+  "&& !reload_completed
+   && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
+  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
+  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
+  DONE;
+}
+  [(set_attr "length" "12")
+  (set_attr "type" "vecfloat")
+  (set_attr "isa" "p8v")])
+
 
 ;; Move 64-bit binary/decimal floating point
 (define_expand "mov"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89310.c 
b/gcc/testsuite/gcc.target/powerpc/pr89310.c
new file mode 100644
index 000..15e78509246
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct s {
+  int i;
+  float f;
+};
+
+float
+foo (struct s arg)
+{
+  return arg.f;
+}
+
+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */
-- 
2.21.0.777.g83232e3864



Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-07 Thread luoxhu via Gcc-patches



On 2020/7/7 08:18, Segher Boessenkool wrote:
> Hi!
> 
> On Sun, Jul 05, 2020 at 09:17:57PM -0500, Xionghu Luo wrote:
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.
>>
>> srdi 3,3,32
>> sldi 9,3,32
>> mtvsrd 1,9
>> xscvspdpn 1,1
>>
>> =>
>>
>> rldicr 3,3,0,31
>> mtvsrd 1,3
>> xscvspdpn 1,1
> 
> Great :-)
> 
>> +;; For extracting high part element from DImode register like:
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "nonimmediate_operand"
>> +"=!r,   f, v, wa,m, Z,
>> + Z, wa,?r,!r")
>> +(unspec:SF [
>> + (subreg:SI (ashiftrt:DI
>> +   (match_operand:DI 1 "input_operand"
>> +   "m, m, wY,Z, r, f,
>> +   wa,r, wa,r")
>> +  (const_int 32)) 0)]
>> +   UNSPEC_SF_FROM_SI))
>> +  (clobber (match_scratch:DI 2
>> +"=X,X, X, X, X, X,
>> +X, r, X, X"))]
>> +  "TARGET_NO_SF_SUBREG
>> +   && (register_operand (operands[0], SFmode)
>> +   && register_operand (operands[1], DImode))"
> 
> If the insn condition requires operands 0 and 1 to be register_operands,
> it can ask for that in the predicates, instead: not nonimmediate_operand
> and input_operand, but just gpc_reg_operand instead.  You can leave out
> the impossible alternatives as well (0, 1, 2, 3, 4, 5, 6), leaving just
> 
> (define_insn_and_split "movsf_from_si2"
>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa,?r,!r")
>   (unspec:SF
> [(subreg:SI (ashiftrt:DI
>   (match_operand:DI 1 "input_operand" "r,wa,r")
>   (const_int 32))
> 0)]
> UNSPEC_SF_FROM_SI)))]
>"TARGET_NO_SF_SUBREG"
>"@
> #
> mfvsrwz %0,%x1
> mr %0,%1"
> 
>"&& !reload_completed
> && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>[(const_int 0)]
> {
>rtx op0 = operands[0];
>rtx op1 = operands[1];
>rtx tmp = gen_reg_rtx (DImode);
> 
> You cannot call gen_reg_rtx too late in the pass pipeline.  What we
> usually do for such cases is put it as a match_scratch in the pattern,
> and then do code like
> 
>if (GET_CODE (operands[2]) == SCRATCH)
>  operands[2] = gen_reg_rtx (DImode);
> 
> so that it will work both before and after reload.
> 
>/* Avoid split {r155:SI#0=unspec[r133:DI>>0x20#0] 86;clobber scratch;} 
> from PR42745.  */
> 
> (This line is too long, btw.)
> 
>if (!SUBREG_P (operands[0]))
>  {
>rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
>emit_insn (gen_anddi3 (tmp, op1, mask));
>emit_insn (gen_p8_mtvsrd_sf (op0, tmp));
>emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>DONE;
>  }
>else
>  FAIL;
> }
>[(set_attr "length" "12,*,*")
> (set_attr "type" "vecfloat,mffgpr,*")
> (set_attr "isa" "p8v,p8v,*")])
> 
> I wonder what will happen if you actually do FAIL here...  There then is
> no insn alternative that can match, so we ICE?  In that case, just leave
> out the whole FAIL thing, it is useless?  You can do a gcc_assert if you
> want to check something.
> 
> Oh, and maybe you only want to handle GPRs here, not VSRs?  So just the
> "r", not the "wa" at all?  What code would it generate for vector regs?
> 
> Lots of questions, sorry!

Thanks for the nice suggestions; the initial patch contained many issues :).

For this case, %1:SF matches with "=wa"?  And how can I construct cases that
match the ("=?r", "wa") and ("=!r", "r") combinations, please?

Removed lots of copy-paste from "movsf_from_si" and updated the patch with
your comments:


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with an "and mask" to avoid generating a shift right by
32 bits followed by a shift left by 32 bits.  This pattern also shows up in
PR42475 and PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-07  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-07  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 40 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 +
 2 files changed, 57 insertions(+)
 create mode 100644 

Ping^1 : [PATCH] [stage1] ipa-cp: Fix PGO regression caused by r278808

2020-06-15 Thread luoxhu via Gcc-patches

Gentle ping...


On 2020/6/1 09:45, Xionghu Luo wrote:

resend the patch for stage1:
https://gcc.gnu.org/pipermail/gcc-patches/2020-January/538186.html

The performance of exchange2 built with PGO decreases by ~28% with r278808
because the profile counts are set incorrectly.  The cloned nodes are updated
to a very small count, which causes the later cunroll pass to fail to unroll
the recursive function in exchange2.

digits_2 ->
digits_2.constprop.0, digits_2.constprop.1, etc.

1. Proportion orig_sum into the new nodes for a self-recursive node
(k is the number of self-recursive calls; see the sketch after this list):
new_sum = (orig_sum + new_sum) * 1 / k
  * self_recursive_probability * (1 / param_ipa_cp_max_recursive_depth).
2. Two mutually recursive functions are not supported by the self-recursive
cloning yet, so they are also not supported in update_profiling_info here.
3. Improve the value range of param_ipa_cp_max_recursive_depth to (0, 8).
If the function calls itself with two different argument sets, the recursive
boundary limit usually stops the specialization first; otherwise it is slow
even without recursive specialization.
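
As a rough illustration of the scaling in points 1 and 3 (hypothetical numbers
and plain C integers; the real code in update_profiling_info works on
profile_count and apply_scale):

#include <stdio.h>

int main (void)
{
  long long orig_sum = 800000;  /* count summed over the original node.  */
  long long new_sum  = 200000;  /* count summed over callers of the clones.  */
  long long self_rec = 600000;  /* count on the self-recursive edge(s).  */
  long long k = 2;              /* number of self-recursive call sites.  */
  long long depth = 8;          /* param_ipa_cp_max_recursive_depth.  */

  self_rec /= k;                                         /* 300000 */
  new_sum = (orig_sum + new_sum) * self_rec / orig_sum;  /* 375000 */
  new_sum /= depth;                                      /* 46875  */

  printf ("scaled new_sum = %lld\n", new_sum);
  return 0;
}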

gcc/ChangeLog:

2020-05-29  Xionghu Luo  

* ipa-cp.c (update_profiling_info): Check self_scc node.
* params.opt (ipa-cp-max-recursive-depth): Set param range.

gcc/testsuite/ChangeLog:

2020-05-29  Xionghu Luo  

* gcc.dg/ipa/ipa-clone-3.c: Update count.
---
  gcc/ipa-cp.c   | 33 +-
  gcc/params.opt |  2 +-
  gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c |  2 +-
  3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index c64e9104a94..919c741ecfa 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -2046,7 +2046,7 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
/* Recursively generate lattice values with a limited count.  */
FOR_EACH_VEC_ELT (val_seeds, i, src_val)
{
- for (int j = 1; j < max_recursive_depth; j++)
+ for (int j = 0; j < max_recursive_depth; j++)
{
  tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
 src_val, res_type);
@@ -4345,6 +4345,37 @@ update_profiling_info (struct cgraph_node *orig_node,
  false);
new_sum = stats.count_sum;
  
+  class ipa_node_params *info = IPA_NODE_REF (orig_node);

+  if (info && info->node_is_self_scc)
+{
+  profile_count self_recursive_count = profile_count::zero ();
+  unsigned k = 0;
+
+  /* The self recursive edge is on the orig_node.  */
+  for (cs = orig_node->callees; cs; cs = cs->next_callee)
+   if (ipa_edge_within_scc (cs))
+ {
+   cgraph_node *callee = cs->callee->function_symbol ();
+   if (callee && cs->caller == cs->callee)
+ {
+   self_recursive_count += cs->count;
+   k++;
+ }
+ }
+
+  /* Proportion count for multiple self recursive node.  */
+  if (k)
+   self_recursive_count.apply_scale (1, k);
+
+  /* Proportion count for self recursive node from all callers.  */
+  new_sum
+   = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
+
+  /* Proportion count for specialized cloned node.  */
+  if (param_ipa_cp_max_recursive_depth)
+   new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
+}
+
if (orig_node_count < orig_sum + new_sum)
  {
if (dump_file)
diff --git a/gcc/params.opt b/gcc/params.opt
index 4aec480798b..ad000145dfb 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -199,7 +199,7 @@ Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) 
Init(64) Param Optimiza
  Compile-time bonus IPA-CP assigns to candidates which make loop bounds or 
strides known.
  
  -param=ipa-cp-max-recursive-depth=

-Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param 
Optimization
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(7) 
IntegerRange(0, 8) Param Optimization
  Maximum depth of recursive cloning for self-recursive function.
  
  -param=ipa-cp-min-recursive-probability=

diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c 
b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
index a73cb8b63fc..1efa23ae109 100644
--- a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
@@ -41,4 +41,4 @@ int main ()
return recur_fn ();
  }
  
-/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 8 "cp" } } */

+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of 
recur_fn/\[0-9\]*\\." 9 "cp" } } */



Re: [PATCH] rs6000: Use REAL_TYPE to copy when block move array in structure[PR65421]

2020-06-08 Thread luoxhu via Gcc-patches
Hi,

On 2020/6/3 04:32, Segher Boessenkool wrote:
> Hi Xiong Hu,
> 
> On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:
>> Double array in structure as function arguments or return value is accessed
>> by BLKmode, they are stored to stack and load from stack with redundant
>> conversion from DF->DI->DF.  This patch checks the homogeneous type and
>> use the actual element type to do block move to by pass the conversions.
> 
>> @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)
>> rtx loads[MAX_MOVE_REG];
>> rtx stores[MAX_MOVE_REG];
>> int num_reg = 0;
>> +  machine_mode elt_mode = DImode;
>>   
>> /* If this is not a fixed size move, just call memcpy */
>> if (! constp)
>> @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)
>> if (bytes > rs6000_block_move_inline_limit)
>>   return 0;
>>   
>> +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));
> 
> Declare elt_mode here as well?
> 
>> +  if (TREE_CODE (type) == RECORD_TYPE
>> +  && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, 
>> NULL,
>> +NULL))
>> +{
>> +  tree field_type = TREE_TYPE (first_field (type));
>> +  if (field_type && TREE_CODE (field_type) == ARRAY_TYPE
>> +  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)
>> +elt_mode = TYPE_MODE (TREE_TYPE (field_type));
>> +}
> 
> Homogeneous aggregates only exist in the ELFv2 ABI, while the problem
> here is the SP float things.  You also noticed (elsewhere) that if the
> struct contains (say) SI, SF, SI, SF, then this does not help.
> 
> Is there some better condition this could use, and maybe an expansion
> that works in more cases as well?
> 
> And, it would be lovely if generic code could expand to something better
> already (not expand to a block move at all, certainly not for something
> as tiny as this).

This pr65421 case is an array in a structure; the assignment is just struct to
struct and won't be split by SROA into element assignments, unlike a struct
that contains scalar fields.

pr65421.c.236t.optimized:
foo (const struct A a)
{
  struct A D.2909;

   [local count: 1073741824]:
  D.2909 = a;  // struct to struct.
  return D.2909;
}

pr69143.c.234t.optimized:
blah1 (struct foo1 a)
{
  struct foo1 D.2909;
  float _1;
  float _2;

   [local count: 1073741824]:
  _1 = a.y;
  _2 = a.x;
  D.2909.x = _1;// element to element.
  D.2909.y = _2;// element to element.
  return D.2909;
}

So the expander chooses a different path to expand each of them, as sketched below...
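
A sketch of the two source shapes, reconstructed from the dumps above (the
field names and array size are guesses, not the exact testcase sources):

/* pr65421-like: a homogeneous double array in a struct; the whole aggregate
   is copied, so it stays a BLKmode block move.  */
struct A { double d[4]; };
struct A foo (struct A a) { return a; }

/* pr69143-like: scalar float fields; SROA splits the copy into element
   assignments, so no block move is emitted.  */
struct foo1 { float x, y; };
struct foo1 blah1 (struct foo1 a)
{
  struct foo1 r = { a.y, a.x };
  return r;
}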

For pr65421, the arguments and return value are accessed in BLKmode after
gimplification; since no IPA pass changes this, it stays that way from gimple
all the way to expand.

In the expander, the type conversion only happens in expand_assignment of
"D.2909 = a;" (the arguments assigned to a local variable, stack to stack,
generated by expand_block_move, insns #13~#20 below).  expand_function_start
(insns #2~#9) already loads each element as DF, so the DF to DI conversion in
insns #13~#20 causes the later RTL passes to fail to do forward propagation in
246r.fwprop1.  So my patch tries to use the actual element type for an array in
a structure here.  If rs6000_discover_homogeneous_aggregate is not allowed to
be used here, how about exposing and calling rs6000_aggregate_candidate
directly?  It is not clear to me why "homogeneous aggregates only exist in the
ELFv2 ABI", since a double array in a structure is a common usage?

pr65421.c.238r.expand:
1: NOTE_INSN_DELETED
   11: NOTE_INSN_BASIC_BLOCK 2
2: r121:DF=%1:DF
3: r122:DF=%2:DF
4: r123:DF=%3:DF
5: r124:DF=%4:DF
6: [r112:DI+0x20]=r121:DF
7: [r112:DI+0x28]=r122:DF
8: [r112:DI+0x30]=r123:DF
9: [r112:DI+0x38]=r124:DF
   10: NOTE_INSN_FUNCTION_BEG
   13: r125:DI=[r112:DI+0x20]
   15: r126:DI=[r112:DI+0x28]
   17: r127:DI=[r112:DI+0x30]
   19: r128:DI=[r112:DI+0x38]
   14: [r112:DI]=r125:DI
   16: [r112:DI+0x8]=r126:DI
   18: [r112:DI+0x10]=r127:DI
   20: [r112:DI+0x18]=r128:DI
   21: r129:DF=[r112:DI]
   22: r130:DF=[r112:DI+0x8]
   23: r131:DF=[r112:DI+0x10]
   24: r132:DF=[r112:DI+0x18]
   25: r117:DF=r129:DF
   26: r118:DF=r130:DF
   27: r119:DF=r131:DF
   28: r120:DF=r132:DF
   32: %1:DF=r117:DF
   33: %2:DF=r118:DF
   34: %3:DF=r119:DF
   35: %4:DF=r120:DF
   36: use %1:DF
   37: use %2:DF
   38: use %3:DF
   39: use %4:DF

Bypassing the block move would require a very generic code change: BLKmode is
determined very early in gimple, and removing BLKmode looks like a huge project
across stor-layout.c, function.c, expr.c, etc., and other targets may not like
it.  ARM64 uses an OImode register to avoid the BLKmode/stack operations, while
X86 expands to two TImode register assignments and returns the result via a
pointer.

Or do you mean some workaround that avoids calling emit_block_move (and thus
expand_block_move in rs6000-string.c) when expand_assignment handles
"D.2909 = a;" below?
rtx
store_expr (tree exp, rtx target, int call_param_p,
bool nontemporal, bool reverse)
{
...
  else if 

Re: [PATCH v2] Fold (add -1; zero_ext; add +1) operations to zero_ext when not overflow (PR37451, part of PR61837)

2020-05-13 Thread luoxhu via Gcc-patches
On 2020/5/13 02:24, Richard Sandiford wrote:
> luoxhu  writes:
>> +  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
>> +
>> + 73: r145:SI=r123:DI#0-0x1
>> + 74: r144:DI=zero_extend (r145:SI)
>> + 75: r143:DI=r144:DI+0x1
>> + ...
>> + 31: r135:CC=cmp (r123:DI,0)
>> + 72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
>> + scratch;clobber scratch;}
> 
> Minor, but it might be worth stubbing out the clobbers, since they're
> not really necessary to understand the comment:
> 
>72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;...}
> 
>> +
>> + r123:DI#0-0x1 is param count derived from loop->niter_expr equal to the
>> + loop iterations, if loop iterations expression doesn't overflow, then
>> + (zero_extend (r123:DI#0-1))+1 could be simplified to zero_extend only.
>> +   */
>> +  bool simplify_zext = false;
> 
> I think it'd be easier to follow if this was split out into
> a subroutine, rather than having the simplify_zext variable.
> 
>> +  rtx extop0 = XEXP (count, 0);
>> +  if (GET_CODE (count) == ZERO_EXTEND && GET_CODE (extop0) == PLUS)
> 
> This isn't valid: we can only do XEXP (count, 0) *after* checking
> for a ZERO_EXTEND.  (It'd be good to test the patch with
> --enable-checking=yes,extra,rtl , which hopefully would have
> caught this.)
> 
>> +{
>> +  rtx addop0 = XEXP (extop0, 0);
>> +  rtx addop1 = XEXP (extop0, 1);
>> +
>> +  int nonoverflow = 0;
>> +  unsigned int_mode
>> += GET_MODE_PRECISION (as_a GET_MODE (addop0));
> 
> Heh.  I wondered at first how on earth this compiled.  It looked like
> there was a missing "(...)" around the GET_MODE.  But of course,
> GET_MODE adds its own parentheses, so it all works out. :-)
> 
> Please add the "(...)" anyway though.  We shouldn't rely on that.
> 
> "int_mode" seems a bit of a confusing name, since it's actually a precision
> in bits rather than a mode.
> 
>> +  unsigned HOST_WIDE_INT int_mode_max
>> += (HOST_WIDE_INT_1U << (int_mode - 1) << 1) - 1;
>> +  if (get_max_loop_iterations (loop, )
>> +  && wi::ltu_p (iterations, int_mode_max))
> 
> You could use GET_MODE_MASK instead of int_mode_max here.
> 
> For extra safety, it would be good to add a HWI_COMPUTABLE_P test,
> to make sure that using HWIs is valid.
> 
>> +nonoverflow = 1;
>> +
>> +  if (nonoverflow
> 
> Having the nonoverflow variable doesn't seem necessary.  We could
> just fuse the two "if" conditions together.
> 
>> +  && CONST_SCALAR_INT_P (addop1)
>> +  && GET_MODE_PRECISION (mode) == int_mode * 2
> 
> This GET_MODE_PRECISION condition also shouldn't be necessary.
> If we can prove that the subtraction doesn't wrap, we can extend
> to any wider mode, not just to double the width.
> 
>> +  && addop1 == GEN_INT (-1))
> 
> This can just be:
> 
> addop1 == constm1_rtx
> 
> There's then no need for the CONST_SCALAR_INT_P check.
> 
> Thanks,
> Richard
> 

Thanks for all your great comments; I addressed them all in the update below.
"--enable-checking=yes,extra,rtl" did catch the ICE, at the cost of some
compile-time performance.


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, part of PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

We can still do the simplification from "subtract/zero_ext/add" to "zero_ext"
when the number of loop iterations is known to be LT MODE_MAX (only simplify
when counter+0x1 cannot overflow).
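
A tiny self-contained sketch (my own illustration, not part of the patch) of
the case the overflow check guards against, for the SImode-to-DImode fold
(zero_extend (x - 1)) + 1 -> zero_extend (x):

#include <stdio.h>

int main (void)
{
  /* For x == 0 the SImode subtraction wraps, so the folded and unfolded
     forms differ; for every other x they agree.  */
  unsigned int x = 0;
  unsigned long long folded   = (unsigned long long) x;                    /* 0 */
  unsigned long long unfolded = (unsigned long long) (unsigned int) (x - 1) + 1;
  printf ("unfolded = %#llx, folded = %#llx\n", unfolded, folded);
  return 0;
}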

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-05-14  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* loop-doloop.c (doloop_simplify_count): New function.  Simplify
(add -1; zero_ext; add +1) to zero_ext when not wrapping.
(doloop_modify): Call doloop_simplify_count.

gcc/testsuite/ChangeLog

2020-05-14  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* gcc.target/powerpc/doloop-2.c: New test.
---
 gcc/loop-doloop.c   | 38 -
 gcc/testsuite/gcc.target/powerpc/doloop-2.c | 29 
 2 

[PATCH v2] Fold (add -1; zero_ext; add +1) operations to zero_ext when not overflow (PR37451, part of PR61837)

2020-05-12 Thread luoxhu via Gcc-patches
Minor refinement of the iteration non-overflow check, plus a testcase, for stage 1.


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, part of PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

We can do the simplification from "subtract/extend/add" to only the extend
when the number of loop iterations is known to be LT MODE_MAX-1 (do NOT
simplify when counter+0x1 can overflow).

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-05-12  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext when not wrapping overflow.

gcc/testsuite/ChangeLog

2020-05-12  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* gcc.target/powerpc/doloop-2.c: New test.
---
 gcc/loop-doloop.c   | 46 -
 gcc/testsuite/gcc.target/powerpc/doloop-2.c | 14 +++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/doloop-2.c

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..16372382a22 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,51 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend (r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp (r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is param count derived from loop->niter_expr equal to the
+loop iterations, if loop iterations expression doesn't overflow, then
+(zero_extend (r123:DI#0-1))+1 could be simplified to zero_extend only.
+   */
+  bool simplify_zext = false;
+  rtx extop0 = XEXP (count, 0);
+  if (GET_CODE (count) == ZERO_EXTEND && GET_CODE (extop0) == PLUS)
+   {
+ rtx addop0 = XEXP (extop0, 0);
+ rtx addop1 = XEXP (extop0, 1);
+
+ int nonoverflow = 0;
+ unsigned int_mode
+   = GET_MODE_PRECISION (as_a GET_MODE (addop0));
+ unsigned HOST_WIDE_INT int_mode_max
+   = (HOST_WIDE_INT_1U << (int_mode - 1) << 1) - 1;
+ if (get_max_loop_iterations (loop, )
+ && wi::ltu_p (iterations, int_mode_max))
+   nonoverflow = 1;
+
+ if (nonoverflow
+ && CONST_SCALAR_INT_P (addop1)
+ && GET_MODE_PRECISION (mode) == int_mode * 2
+ && addop1 == GEN_INT (-1))
+   {
+ count = simplify_gen_unary (ZERO_EXTEND, mode, addop0,
+ GET_MODE (addop0));
+ simplify_zext = true;
+   }
+   }
+
+  if (!simplify_zext)
+   count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+}
 
   /* Insert initialization of the count register into the loop header.  */
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/doloop-2.c 
b/gcc/testsuite/gcc.target/powerpc/doloop-2.c
new file mode 100644
index 000..dc8516bb0ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/doloop-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O2 -fno-unroll-loops" } */
+
+int f(int l, int *a)
+{
+int i;
+  for(i = 0;i < l; i++)
+   a[i] = i;
+return l;
+}
+
+/* { dg-final { scan-assembler-not "-1" } } */
+/* { dg-final { scan-assembler "bdnz" } } */
+/* { dg-final { scan-assembler-times "mtctr" 1 } } */
-- 
2.21.0.777.g83232e3864





Re: [PATCH v2] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine combination(PR83403)

2020-05-11 Thread luoxhu via Gcc-patches

On 2020-05-06 20:09, Richard Biener wrote:

On Thu, 30 Apr 2020, luoxhu wrote:

Update the patch with overflow check.  Bootstrap and regression tested 
PASS on Power8-LE.



Use determine_value_range to get value range info and fold convert expressions
with an inner PLUS_EXPR/MINUS_EXPR/MULT_EXPR operation when it does not
overflow in the wrapping-overflow inner type, i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion can detect more
independence between address references and promote some memory expressions to
registers within the loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.


This is OK for trunk if bootstrapped / tested properly.



Bootstrap and regression tests pass on power8LE; committed as
r11-259-g0447929f11e6a3e1b076841712b90a8b6bc7d33a.  Is it necessary
to backport it to gcc-10?


Thanks,
Xionghu




Thanks,
Richard.


gcc/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 
+++

 gcc/tree-affine.c | 24 ++
 4 files changed, 60 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c

new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 
"lim2" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c

new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 
"lim2" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ 
B,

+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..5620e6bf28f 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,24 +343,28 @@ expr_to_aff_combination (aff_tree *comb, 
tree_code code, tree type,

wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-	   if X - CST doesn't overflow by range information.  Also 
handle

-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, , ) == VR_RANGE)
+   

[PATCH v2] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine combination(PR83403)

2020-04-30 Thread luoxhu via Gcc-patches
Update the patch with overflow check.  Bootstrap and regression tested PASS on 
Power8-LE.


Use determine_value_range to get value range info and fold convert expressions
with an inner PLUS_EXPR/MINUS_EXPR/MULT_EXPR operation when it does not
overflow in the wrapping-overflow inner type, i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion can detect more
independence between address references and promote some memory expressions to
registers within the loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.

gcc/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 +++
 gcc/tree-affine.c | 24 ++
 4 files changed, 60 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ B,
+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..5620e6bf28f 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,24 +343,28 @@ expr_to_aff_combination (aff_tree *comb, tree_code code, 
tree type,
wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-  if X - CST doesn't overflow by range information.  Also handle
-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, , ) == VR_RANGE)
+   && determine_value_range (op0, , ) == VR_RANGE)
  {
+   wi::overflow_type overflow = wi::OVF_NONE;
+   signop sign = UNSIGNED;
if (icode == PLUS_EXPR)
- op1 = wide_int_to_tree (itype, -wi::to_wide (op1));
-   if (wi::geu_p (minv, wi::to_wide (op1)))
+ wi::add (maxv, wi::to_wide (op1), sign, );
+   else if (icode == MULT_EXPR)
+ wi::mul (maxv, wi::to_wide (op1), sign, );
+ 

Re: [PATCH] Add value range info for affine combination to improve store motion (PR83403)

2020-04-29 Thread luoxhu via Gcc-patches



On 2020/4/28 18:30, Richard Biener wrote:

> 
> OK, I guess instead of get_range_info expr_to_aff_combination could
> simply use determine_value_range (op0, , ) == VR_RANGE
> (the && TREE_CODE (op0) == SSA_NAME check can then be removed)?
> 

I tried determine_value_range; it works and is much more straightforward.
Thanks for your suggestion.  The patch is updated below with some tests and is
under regression testing.


[PATCH] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine 
combination(PR83403)

Use determine_value_range to get value range info and fold convert expressions
with an inner PLUS_EXPR/MINUS_EXPR/MULT_EXPR operation when it does not
overflow in the wrapping-overflow inner type, i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned  int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion can detect more
independence between address references and promote some memory expressions to
registers within the loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.

gcc/ChangeLog

2020-04-29  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-29  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 +++
 gcc/tree-affine.c | 22 +
 4 files changed, 58 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ B,
+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..86bb64fe245 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,23 +343,25 @@ expr_to_aff_combination (aff_tree *comb, tree_code code, 
tree type,
wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-  if X - CST doesn't overflow by range information.  Also handle
-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, , ) == VR_RANGE)
+   && determine_value_range (op0, , ) == VR_RANGE)
  {
-   if 

Re: [PATCH] Add value range info for affine combination to improve store motion (PR83403)

2020-04-28 Thread luoxhu via Gcc-patches



On 2020/4/28 15:01, Richard Biener wrote:
> On Tue, 28 Apr 2020, Xionghu Luo wrote:
> 
>> From: Xionghu Luo 
>>
>> Get and propagate value range info to convert expressions with convert
>> operation on PLUS_EXPR/MINUS_EXPR/MULT_EXPR when not overflow.  i.e.:
>>
>> (long unsigned int)((unsigned  int)n * 10 + 1)
>> =>
>> (long unsigned int)((unsigned  int) n * (long unsigned int)10 + (long 
>> unsigned int)1)
>>
>> With this patch for affine combination, load/store motion could detect
>> more address refs independency and promote some memory expressions to
>> registers within loop.
>>
>> PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
>> to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.
>>
>> Bootstrap and regression tested pass on Power8-LE.  Any comments?
>> Thanks.
> 
> So the core of the patch is adding handling of MULT_EXPR and the rest
> is a no-op?  It's not clear from the patch what passing down a
> value-range does and your description doesn't say anything about this
> either.

This patch handles MULT_EXPR first, and it also handles
(long unsigned int)(n_93 * 10 + 1), where "n_93 * 10" is not an SSA name, by
using the value_range passed down: [1, 81] is the range info of
"n_93 * 10 + 1", so wi::leu_p (maxv, wi::to_wide (TYPE_MAX_VALUE (itype))) in
expr_to_aff_combination can ensure that this conversion has no wrapping
overflow.  Not sure if it's better described now...  Some debug messages
follow; a small worked check comes after them.

131t.lim2:
# n_93 = PHI <0(2), n_36(7)>
_118 = n_93 * 10;
_120 = (long unsigned int) _118;
_121 = _120 * 8;
_122 = [0][0] + _121;
*_122 = 0.0;
_128 = _118 + 1;
_129 = (long unsigned int) _128;
_130 = _129 * 8;
_131 = [0][0] + _130;
*_131 = 0.0;
_137 = _118 + 2;
_138 = (long unsigned int) _137;
_139 = _138 * 8;
_140 = [0][0] + _139;
*_140 = 0.0;


Breakpoint 28, expr_to_aff_combination (comb=0x7fffc068, code=NOP_EXPR, type=, op0=, op1=, vr=0x7fffc490) at ../../gcc-master/gcc/tree-affine.c:350
92: itype = 
93: otype = 
94: icode = PLUS_EXPR
95: code = NOP_EXPR
(gdb) ptree op0
 
unit-size 
align:32 warn_if_not_align:0 symtab:0 alias-set -1 canonical-type 
0x753c0690 precision:32 min <
integer_cst 0x752f1230 0> max  
context 
pointer_to_this >

arg:0 
visited var 
def_stmt n_93 = PHI <0(2), n_36(7)>
version:93
ptr-info 0x7530b680>
arg:1  constant 10>>
(gdb) ptree op1
  
constant 1>
(gdb) p minv
$580 = {
   = {
val = {1, 0, 140737310886240},
len = 1,
precision = 32
  },
  members of generic_wide_int:
  static is_sign_extended = true
}
(gdb) p maxv
$581 = {
   = {
val = {81, 65, 40},
len = 1,
precision = 32
  },
  members of generic_wide_int:
  static is_sign_extended = true
}
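
As a small worked check of that range reasoning (an illustration, not GCC code):

#include <assert.h>

int main (void)
{
  /* With n in [0, 8], n * 10 + 1 stays in [1, 81], far below UINT_MAX,
     so widening the whole expression equals widening n first.  */
  for (unsigned int n = 0; n <= 8; n++)
    assert ((unsigned long) (n * 10 + 1) == (unsigned long) n * 10 + 1);
  return 0;
}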


Thanks,
Xionghu

> 
> Richard.
> 
>> gcc/ChangeLog
>>
>>  2020-04-28  Xiong Hu Luo  
>>
>>  PR tree-optimization/83403
>>  * tree-affine.c (aff_combination_convert): New parameter
>>  value_range.
>>  (expr_to_aff_combination): Use range info to check CONVERT
>>  expr on PLUS_EXPR/MINUS_EXPR/MULT_EXPR.
>>  (tree_to_aff_combination): Likewise.
>>  (aff_combination_expand): Get and propagate range info.
>>  * tree-affine.h: Include value-range.h.
>>  (aff_combination_convert): New parameter value_range.
>>  (tree_to_aff_combination): Likewise.
>>  (aff_combination_expand): Likewise.
>> ---
>>   gcc/tree-affine.c | 66 +++
>>   gcc/tree-affine.h |  8 +++---
>>   2 files changed, 43 insertions(+), 31 deletions(-)
>>
>> diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
>> index 0eb8db1b086..63f8acd4c73 100644
>> --- a/gcc/tree-affine.c
>> +++ b/gcc/tree-affine.c
>> @@ -220,7 +220,7 @@ aff_combination_add (aff_tree *comb1, aff_tree *comb2)
>>   /* Converts affine combination COMB to TYPE.  */
>>   
>>   void
>> -aff_combination_convert (aff_tree *comb, tree type)
>> +aff_combination_convert (aff_tree *comb, tree type, value_range *vr)
>>   {
>> unsigned i, j;
>> tree comb_type = comb->type;
>> @@ -228,7 +228,7 @@ aff_combination_convert (aff_tree *comb, tree type)
>> if  (TYPE_PRECISION (type) > TYPE_PRECISION (comb_type))
>>   {
>> tree val = fold_convert (type, aff_combination_to_tree (comb));
>> -  tree_to_aff_combination (val, type, comb);
>> +  tree_to_aff_combination (val, type, comb, vr);
>> return;
>>   }
>>   
>> @@ -263,8 +263,8 @@ aff_combination_convert (aff_tree *comb, tree type)
>>  true when that was successful and returns the combination in COMB.  */
>>   
>>   static bool
>> -expr_to_aff_combination (aff_tree *comb, tree_code code, tree type,
>> - tree op0, tree op1 = NULL_TREE)
>> +expr_to_aff_combination (aff_tree *comb, tree_code code, tree type, tree 
>> op0,
>> + tree op1 = NULL_TREE, value_range *vr = NULL)
>>   {
>> aff_tree tmp;
>> poly_int64 bitpos, bitsize, bytepos;
>> @@ -279,8 +279,8 

Re: [PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-20 Thread luoxhu via Gcc-patches

Tiny update to accommodate unsigned int compare.

On 2020/4/20 16:21, luoxhu via Gcc-patches wrote:

Hi,

On 2020/4/18 00:32, Segher Boessenkool wrote:

On Thu, Apr 16, 2020 at 08:21:40PM -0500, Segher Boessenkool wrote:

On Wed, Apr 15, 2020 at 10:18:16AM +0100, Richard Sandiford wrote:

luoxhu--- via Gcc-patches  writes:

-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext based on addop0
+is never zero, as gimple pass loop ch will do optimization to simplify
+the loop to NO loop for loop condition is false.  */


IMO the code needs to prove this, rather than just assume that previous
passes have made it so.


Well, it should gcc_assert it, probably.

It is the left-hand side of a+b...  it cannot be 0, because niter always
is simplified!


Scratch that...  it cannot be *constant* 0, but that isn't the issue here.


Sorry, my comment in the code is a bit misleading; it is actually not
related to loop-ch at all.  The instruction sequence at 255r.loop2_invariant:

25: NOTE_INSN_BASIC_BLOCK 5
26: r133:SI=r123:DI#0-0x1
   REG_DEAD r123:DI
27: r123:DI=zero_extend(r133:SI)
   REG_DEAD r133:SI
28: r124:DI=r124:DI+0x4
30: r134:CC=cmp(r123:DI,0)
31: pc={(r134:CC!=0)?L69:pc}

And 257r.loop2_doloop (inserted #72,#73,#74, and #31 is replaced by #71):

;; Determined upper bound -1.
Loop 2 is simple:
   simple exit 6 -> 7
   number of iterations: (plus:SI (subreg:SI (reg:DI 123 [ doloop.6 ]) 0)
 (const_int -1 [0x]))
   upper bound: 2147483646
   likely upper bound: 2147483646
   realistic bound: -1
...
72: r144:SI=r123:DI#0-0x1
73: r143:DI=zero_extend(r144:SI)
74: r142:DI=r143:DI+0x1
...
25: NOTE_INSN_BASIC_BLOCK 5
26: r133:SI=r123:DI#0-0x1
   REG_DEAD r123:DI
27: r123:DI=zero_extend(r133:SI)
   REG_DEAD r133:SI
28: r124:DI=r124:DI+0x4
30: r134:CC=cmp(r123:DI,0)
71: {pc={(r142:DI!=0x1)?L69:pc};r142:DI=r142:DI-0x1;clobber scratch;clobber 
scratch;}

increment_count being true ensures the condition is NE const1_rtx.
r123:DI#0-0x1 is the number of loop iterations in doloop; it is definitely
>= 0, and r123:DI#0 also shouldn't be zero, as the loop upper bound is
2147483646 (0x7ffffffe)???

Since this simplification is in doloop_modify, there are already doloop form
checks like !desc->simple_p || desc->assumptions || desc->infinite in
doloop_valid_p, so it seems unnecessary to repeat the check here again?
Maybe we just need to check that the loop upper bound is LEU than 0x7ffffffe
to avoid overflow of instruction #26?

0xfffffffe




Updated patch, thanks:


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years;
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr since r125:SI#0 is not SImode, so it is not a valid doloop
and no transform is done in doloop anymore.  Thus we can do the simplification
from "subtract/extend/add" to only the extend when the loop upper bound is
known to be LE SINT_MAX-1 (do not simplify when r120:DI#0-0x1 can overflow).

UINT_MAX-1



Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-04-20  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
  gcc/loop-doloop.c | 41 -
  1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..da537aff60f 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,46 @@ doloop_modify (class loop *loop, class niter_desc *desc,
  }

if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend(r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp(r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is the loop iterations be GE than 0, r143 is the loop
+count be saved to ctr, if this loop's upper bound is known, r123:DI#0
+won't be zero, then the expressions could be simplified to zero_extend
+only.  */
+  bool simplify_zext = false;
+  rtx ext

Re: [PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-20 Thread luoxhu via Gcc-patches
Hi,

On 2020/4/18 00:32, Segher Boessenkool wrote:
> On Thu, Apr 16, 2020 at 08:21:40PM -0500, Segher Boessenkool wrote:
>> On Wed, Apr 15, 2020 at 10:18:16AM +0100, Richard Sandiford wrote:
>>> luoxhu--- via Gcc-patches  writes:
>>>> -count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
>>>> +{
>>>> +  /* Fold (add -1; zero_ext; add +1) operations to zero_ext based on 
>>>> addop0
>>>> +   is never zero, as gimple pass loop ch will do optimization to simplify
>>>> +   the loop to NO loop for loop condition is false.  */
>>>
>>> IMO the code needs to prove this, rather than just assume that previous
>>> passes have made it so.
>>
>> Well, it should gcc_assert it, probably.
>>
>> It is the left-hand side of a+b...  it cannot be 0, because niter always
>> is simplified!
> 
> Scratch that...  it cannot be *constant* 0, but that isn't the issue here.

Sorry, my comment in the code is a bit misleading; it is actually not
related to loop-ch at all.  The instruction sequence at 255r.loop2_invariant:

   25: NOTE_INSN_BASIC_BLOCK 5
   26: r133:SI=r123:DI#0-0x1
  REG_DEAD r123:DI
   27: r123:DI=zero_extend(r133:SI)
  REG_DEAD r133:SI
   28: r124:DI=r124:DI+0x4
   30: r134:CC=cmp(r123:DI,0)
   31: pc={(r134:CC!=0)?L69:pc}

And 257r.loop2_doloop (inserted #72,#73,#74, and #31 is replaced by #71):   

;; Determined upper bound -1.
Loop 2 is simple:
  simple exit 6 -> 7
  number of iterations: (plus:SI (subreg:SI (reg:DI 123 [ doloop.6 ]) 0)
(const_int -1 [0x]))
  upper bound: 2147483646
  likely upper bound: 2147483646
  realistic bound: -1
...
   72: r144:SI=r123:DI#0-0x1
   73: r143:DI=zero_extend(r144:SI)
   74: r142:DI=r143:DI+0x1
...
   25: NOTE_INSN_BASIC_BLOCK 5
   26: r133:SI=r123:DI#0-0x1
  REG_DEAD r123:DI
   27: r123:DI=zero_extend(r133:SI)
  REG_DEAD r133:SI
   28: r124:DI=r124:DI+0x4
   30: r134:CC=cmp(r123:DI,0)
   71: {pc={(r142:DI!=0x1)?L69:pc};r142:DI=r142:DI-0x1;clobber scratch;clobber 
scratch;}

increment_count being true ensures the condition is NE const1_rtx.
r123:DI#0-0x1 is the number of loop iterations in doloop; it is definitely
>= 0, and r123:DI#0 also shouldn't be zero, as the loop upper bound is
2147483646 (0x7ffffffe)???

Since this simplification is in doloop_modify, there are already doloop form
checks like !desc->simple_p || desc->assumptions || desc->infinite in
doloop_valid_p, so it seems unnecessary to repeat the check here again?
Maybe we just need to check that the loop upper bound is LEU than 0x7ffffffe
to avoid overflow of instruction #26?


Updated patch, thanks:


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years;
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr since r125:SI#0 is not SImode, so it is not a valid doloop
and no transform is done in doloop anymore.  Thus we can do the simplification
from "subtract/extend/add" to only the extend when the loop upper bound is
known to be LE SINT_MAX-1 (do not simplify when r120:DI#0-0x1 can overflow).

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-04-20  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
 gcc/loop-doloop.c | 41 -
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..da537aff60f 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,46 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend(r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp(r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is the loop iterations be GE than 0, r143 is the loop
+count be saved to ctr, if this loop's upper bound is known, r123:DI#0
+won't be zero, then the expressions could be simplified to zero_extend
+only.  */
+   

Re: [PATCH v2] rs6000: Don't use HARD_FRAME_POINTER_REGNUM if it's not live in pro_and_epilogue (PR91518)

2020-04-16 Thread luoxhu via Gcc-patches



On 2020/4/17 08:52, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Apr 13, 2020 at 10:11:43AM +0800, luoxhu wrote:
>> frame_pointer_needed is set to true in reload pass setup_can_eliminate,
>> but regs_ever_live[31] is false, pro_and_epilogue uses it without live
>> check causing CPU2006 465.tonto segment fault of loading from invalid
>> addresses due to r31 not saved/restored.  Thus, add HARD_FRAME_POINTER_REGNUM
>> live check with frame_pointer_needed_indeed_p when generating 
>> pro_and_epilogue
>> instructions.
> 
> I see.
> 
> Can you instead make a boolean variable "frame_pointer_needed_indeed",
> that you set somewhere early in *logue processing?  So that we can be
> sure that it will not change behind our backs.


Thanks, rs6000_emit_prologue seems the proper place to set the 
frame_pointer_needed_indeed,
but it's strange that hard_frame_pointer_rtx will be marked USE in 
make_prologue_seq, also
need check here though not causing segfault? PS, this piece of code is in 
different file.

function.c 
static rtx_insn *
make_prologue_seq (void)
{
  if (!targetm.have_prologue ())
return NULL;

  start_sequence ();
  rtx_insn *seq = targetm.gen_prologue ();
  emit_insn (seq);

  /* Insert an explicit USE for the frame pointer
 if the profiling is on and the frame pointer is required.  */
  if (crtl->profile && frame_pointer_needed)
emit_use (hard_frame_pointer_rtx);
...



Anyway, the patch is updated below with your previous comments:



This bug is exposed by the FRE refactoring in r263875.  Comparing the fre dump
files shows no obvious change in the function that segfaults, which proves it
to be a target issue.
frame_pointer_needed is set to true in the reload pass (setup_can_eliminate),
but regs_ever_live[31] is false; pro_and_epilogue uses it without a liveness
check, causing the CPU2006 465.tonto segmentation fault from loads of invalid
addresses because r31 is not saved/restored.  Thus, add a
HARD_FRAME_POINTER_REGNUM liveness check via frame_pointer_needed_indeed when
generating the pro_and_epilogue instructions.

Bootstrap and regression tests pass on Power8-LE.  A backport to gcc-9 is
needed once approved.

gcc/ChangeLog

2020-04-17  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (frame_pointer_needed_indeed):
New variable.
(rs6000_emit_prologue_components):
Check with frame_pointer_needed_indeed.
(rs6000_emit_epilogue_components): Likewise.
(rs6000_emit_epilogue): Likewise.
(rs6000_emit_prologue): Set frame_pointer_needed_indeed.
---
 gcc/config/rs6000/rs6000-logue.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..2213d1fa227 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -58,6 +58,8 @@ static bool rs6000_save_toc_in_prologue_p (void);
 
 static rs6000_stack_t stack_info;
 
+/* Set if HARD_FRAME_POINTER_REGNUM is really needed.  */
+static bool frame_pointer_needed_indeed = false;
 
 /* Label number of label created for -mrelocatable, to call to so we can
get the address of the GOT section */
@@ -2735,9 +2737,9 @@ void
 rs6000_emit_prologue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2815,9 +2817,9 @@ void
 rs6000_emit_epilogue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2996,7 +2998,10 @@ rs6000_emit_prologue (void)
&& (lookup_attribute ("no_split_stack",
  DECL_ATTRIBUTES (cfun->decl))
== NULL));
- 
+
+  frame_pointer_needed_indeed
+= frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM);
+
   /* Offset to top of frame for frame_reg and sp respectively.  */
   HOST_WIDE_INT frame_off = 0;
   HOST_WIDE_INT sp_off = 0;
@@ -3658,7 +3663,7 @@ rs6000_emit_prologue (void)
 }
 
   /

[PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-15 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andrew:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years:
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr, since r125:SI#0 is not SImode, so it is not a valid doloop
and no transform is done in doloop anymore.  Thus we can do the
simplification from "subtract/extend/add" to only the extend, as the loop
condition in doloop will never be false thanks to loop-ch's optimization.
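For illustration, a hypothetical loop shape (not the actual testcase) where
this shows up on powerpc64: the doloop count is derived from a 32-bit value,
so it tends to be materialized as zero_extend (n - 1) + 1 rather than plain
zero_extend (n):

/* Hypothetical example: 32-bit trip count feeding the 64-bit ctr register.  */
void
foo (unsigned int n, int *a)
{
  unsigned int i;
  for (i = 0; i < n; i++)
    a[i] = i;
}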
What's more, this patch is slightly different from Andrew's implementation:
the check for ZERO_EXTEND and SImode guards against changing the count for
the char/short cases that previously caused timeouts on slow platforms.
Any comments?  Thanks.

doloop-1.c.257r.loop2_doloop
...
12: [r129:DI]=r123:SI
  REG_DEAD r129:DI
  REG_DEAD r123:SI
13: r125:SI=r120:DI#0-0x1
  REG_DEAD r120:DI
14: r120:DI=zero_extend(r125:SI#0)
  REG_DEAD r125:SI
16: r126:CC=cmp(r120:DI,0)
17: pc={(r126:CC!=0)?L43:pc}
  REG_DEAD r126:CC
...

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-04-15  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
 gcc/loop-doloop.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..9f967fa3a0b 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,31 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext, based on addop0
+     never being zero: the gimple loop-ch pass simplifies the loop away when
+     the loop condition is false.  */
+  bool simplify_zext = false;
+  rtx extop0 = XEXP (count, 0);
+  if (mode == E_DImode
+ && GET_CODE (count) == ZERO_EXTEND
+ && GET_CODE (extop0) == PLUS)
+   {
+ rtx addop0 = XEXP (extop0, 0);
+ rtx addop1 = XEXP (extop0, 1);
+ if (CONST_SCALAR_INT_P (addop1)
+ && GET_MODE (addop0) == E_SImode
+ && addop1 == GEN_INT (-1))
+   {
+ count = simplify_gen_unary (ZERO_EXTEND, mode, addop0,
+ GET_MODE (addop0));
+ simplify_zext = true;
+   }
+   }
+
+  if (!simplify_zext)
+   count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+}
 
   /* Insert initialization of the count register into the loop header.  */
   start_sequence ();
-- 
2.21.0.777.g83232e3864



[PATCH v2] rs6000: Don't use HARD_FRAME_POINTER_REGNUM if it's not live in pro_and_epilogue (PR91518)

2020-04-12 Thread luoxhu via Gcc-patches
This bug is exposed by the FRE refactoring of r263875.  Comparing the fre
dump files shows no obvious change in the function that segfaults, which
proves it to be a target issue.
frame_pointer_needed is set to true in the reload pass setup_can_eliminate,
but regs_ever_live[31] is false; pro_and_epilogue uses r31 without a
liveness check, causing a CPU2006 465.tonto segmentation fault from loads
at invalid addresses because r31 is not saved/restored.  Thus, add a
HARD_FRAME_POINTER_REGNUM liveness check with frame_pointer_needed_indeed_p
when generating pro_and_epilogue instructions.

Bootstrap and regression tested pass on Power8-LE.  Backport to gcc-9
required once approved.

gcc/ChangeLog

2020-04-13  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (frame_pointer_needed_indeed_p):
New function.
(rs6000_emit_prologue_components):
Check with frame_pointer_needed_indeed_p.
(rs6000_emit_epilogue_components): Likewise.
(rs6000_emit_prologue): Likewise.
(rs6000_emit_epilogue): Likewise.
---
 gcc/config/rs6000/rs6000-logue.c | 24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..d17876ac0fb 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -2730,14 +2730,22 @@ rs6000_disqualify_components (sbitmap components, edge e,
 }
 }
 
+/* Determine whether HARD_FRAME_POINTER_REGNUM is really needed.  */
+static bool
+frame_pointer_needed_indeed_p (void)
+{
+  return frame_pointer_needed
+&& df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM);
+}
+
 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
 void
 rs6000_emit_prologue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed_p ()
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2815,9 +2823,9 @@ void
 rs6000_emit_epilogue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed_p ()
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -3658,7 +3666,7 @@ rs6000_emit_prologue (void)
 }
 
   /* Set frame pointer, if needed.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed_indeed_p ())
 {
   insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
 sp_reg_rtx);
@@ -4534,7 +4542,7 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
 }
   /* If we have a frame pointer, we can restore the old stack pointer
  from it.  */
-  else if (frame_pointer_needed)
+  else if (frame_pointer_needed_indeed_p ())
 {
   frame_reg_rtx = sp_reg_rtx;
   if (DEFAULT_ABI == ABI_V4)
-- 
2.21.0.777.g83232e3864



Re: [PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-04-03 Thread luoxhu via Gcc-patches



On 2020/4/3 06:16, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Mar 30, 2020 at 11:59:57AM +0800, luoxhu wrote:
>>> Do we want something later in the RTL pipeline to make "addi"s etc. again?
> 
> (This would be a good thing to consider -- maybe a define_insn_and_split
> will work.  But see below).
> 
>> [PATCH] rs6000: Don't split constant operator add before reload, move to 
>> temp register for future optimization
>>
>> Don't split code from add3 for SDI to allow a later pass to split.
>> This allows later logic to hoist out constant load in add instructions.
>> In loop, lis+ori could be hoisted out to improve performance compared with
>> previous addis+addi (About 15% on typical case), weak point is
>> one more register is used and one more instruction is generated.  i.e.:
>>
>> addis 3,3,0x8765
>> addi 3,3,0x4321
>>
>> =>
>>
>> lis 9,0x8765
>> ori 9,9,0x4321
>> add 3,3,9
> 
> (This patch will of course have to wait for stage 1).
> 
> Such a define_insn_and_split could be for an add of a (signed) 32-bit
> immediate.  combine will try to combine the three insns (lis;ori;add),
> and match the new pattern.

Currently 286r.split2 will split "12:%9:DI=0x87654321" into lis+ori via
rs6000_emit_set_const from a define_split; do you mean adding a new
define_insn_and_split to do the split?  Should that be another patch, after
this one goes upstream in stage 1?

> 
> This also links in with Alan's work on big immediates, and with paddi
> insns, etc.

Do you mean PR94393?  Yes, rs6000_emit_set_const calls
rs6000_emit_set_long_const for DImode.
I tried unsigned long constants like 0xabcd87654321, 0xabcd87654321 and
0xc000ULL; all of them are hoisted outside of the loop even without my patch.
No difference with or without Alan's patch.

0xabcd87654321: li 9,0   ori 9,9,0xabcd   sldi 9,9,32   oris 9,9,0x8765   ori 9,9,0x4321
0xabcd87654321: lis 9,0xabcd   ori 9,9,0x8765   sldi 9,9,16   ori 9,9,0x4321
0xc000ULL:   li 9,-1   rldicr 9,9,0,1


Thanks,
Xionghu

> 
> 
> Segher
> 



Re: [PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-29 Thread luoxhu via Gcc-patches



On 2020/3/28 00:04, Segher Boessenkool wrote:

Hi!

On Fri, Mar 27, 2020 at 09:34:00AM +0800, luoxhu wrote:

On 2020/3/27 07:59, Segher Boessenkool wrote:

On Wed, Mar 25, 2020 at 11:15:22PM -0500, luo...@linux.ibm.com wrote:

frame_pointer_needed is set to true in reload pass setup_can_eliminate,
but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
r31 even it is used actually, causing CPU2006 465.tonto segment fault of
loading from invalid addresses.


If df_regs_ever_live_p(31) is false there is no hard frame pointer
anywhere in the program.  How can it be used then?


There is a piece of code in pro_and_epilogue that emits a move instruction
to r31 even when df_regs_ever_live_p(31) is false.


Can you point out where (in rs6000-logue.c or similar)?  We should fix
*that*.


As frame_pointer_needed is true and is widely used in this function, I
propose to save r31 in save_reg_p instead of checking (frame_pointer_needed
&& df_regs_ever_live_p (31)); I haven't verified whether this works yet.
Is this reasonable?  Thanks.


frame_pointer_needed is often true when the backend can figure out we do
not actually need it.


rs6000-logue.c
void
rs6000_emit_prologue (void)
{
...
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26840)   /* Set frame 
pointer, if needed.  */
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26841)   if 
(frame_pointer_needed)
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26842) {
0d6c02bf24e4 (jakub  2005-06-30 14:26:32 + 26843)   insn = 
emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26844)   
 sp_reg_rtx);
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26845)   
RTX_FRAME_RELATED_P (insn) = 1;
6b02f2a5c61e (meissner   1995-11-30 20:02:16 + 26846) }
d1bd513ed578 (kenner 1992-02-09 19:26:21 + 26847)
...
}


Ah, so this you mean.  I see.  It looks like if you change this to

   if (frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
 {
   insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
 sp_reg_rtx);
   RTX_FRAME_RELATED_P (insn) = 1;
 }

(so just that "if" clause changes), it'll all be fine.  Could you test
that please?


Tried with the below patch, it still fails at the same place; I guess this
is not enough.
The instruction at 100dc034 changes r31 from 0x105fdd70 to 0x7fffbbf0 and
doesn't restore it before returning.


100dc020 <__atomvec_module_MOD_atom_index_from_pos>:
   100dc020:   51 10 40 3c lis     r2,4177
   100dc024:   00 7f 42 38 addi    r2,r2,32512
   100dc028:   28 00 e3 e8 ld      r7,40(r3)
   100dc02c:   e1 ff 21 f8 stdu    r1,-32(r1)
   100dc030:   00 00 27 2c cmpdi   r7,0
   100dc034:   78 0b 3f 7c mr      r31,r1
   100dc038:   08 00 82 40 bne     100dc040 <__atomvec_module_MOD_atom_index_from_pos+0x20>
   100dc03c:   01 00 e0 38 li      r7,1
   100dc040:   38 00 43 e9 ld      r10,56(r3)
   100dc044:   30 00 23 e9 ld      r9,48(r3)
   100dc048:   00 00 03 e9 ld      r8,0(r3)


Diff:

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..28fda1866d8 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -3658,7 +3658,7 @@ rs6000_emit_prologue (void)
}

  /* Set frame pointer, if needed.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
{
  insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
sp_reg_rtx);
@@ -4534,7 +4534,8 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
}
  /* If we have a frame pointer, we can restore the old stack pointer
 from it.  */
-  else if (frame_pointer_needed)
+  else if (frame_pointer_needed
+  && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
{
  frame_reg_rtx = sp_reg_rtx;
  if (DEFAULT_ABI == ABI_V4)
@@ -4873,7 +4874,8 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
a REG_CFA_DEF_CFA note, but that's OK;  A duplicate is
discarded by dwarf2cfi.c/dwarf2out.c, and in any case would
be harmless if emitted.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed
+ && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
   {
 insn = get_last_insn ();
 add_reg_note (insn, REG_CFA_DEF_CFA,



Program received signal SIGSEGV: Segmentation fault - invalid memory reference.

Backtrace for this error:
#0  0x200a2243 in ???
#1  0x200a0abb in ???
#2  0x200604d7 in ???
#3  0x1027418c in __mol_module_MOD_make_image_of_shell
   at 
/home/luoxhu/workspace/cpu2006/benchspec/CPU2006/465.tonto/build/build_peak_none./mol.fppized.f90:9376
#4

Re: [PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-03-29 Thread luoxhu via Gcc-patches



On 2020/3/27 22:33, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Mar 26, 2020 at 05:06:43AM -0500, luo...@linux.ibm.com wrote:
>> Remove split code from add3 to allow a later pass to split.
>> This allows later logic to hoist out constant load in add instructions.
>> In loop, lis+ori could be hoisted out to improve performance compared with
>> previous addis+addi (About 15% on typical case), weak point is
>> one more register is used and one more instruction is generated.  i.e.:
>>
>> addis 3,3,0x8765
>> addi 3,3,0x4321
>>
>> =>
>>
>> lis 9,0x8765
>> ori 9,9,0x4321
>> add 3,3,9
> 
> What does it do overall?  Say, to SPEC.  What does it do to execution
> time, and what does it do to binary size?
> 
> Do we want something later in the RTL pipeline to make "addi"s etc. again?
> 
>> 2020-03-26  Xiong Hu Luo  
>>
>>  * config/rs6000/rs6000.md (add3): Remove split code, move constant
>>to temp register before add.
> 
> This should not be indented, so just:
>   * config/rs6000/rs6000.md (add3): Remove split code, move constant
>   to temp register before add.
> 
> We have six separate add<mode>3 patterns (three of those are in rs6000.md,
> too).  You can write something like
>   (add<mode>3 for SDI):
> to show which iterator (or mode) this one is for.  This is helpful with
> any <mode> or <MODE> or the like, even if there (currently) is only one
> pattern you could mean.
> 

Thanks, it is necessary to re-enable splitting the add, as some later RTL
passes like final will still need to generate addis+addi (case:
g++.dg/opt/thunk1.C).
Updated the patch as below:


[PATCH] rs6000: Don't split constant operator add before reload, move to temp 
register for future optimization

Don't split code from add<mode>3 for SDI, to allow a later pass to do the
split.
This allows later logic to hoist the constant load out of add instructions.
In a loop, lis+ori can be hoisted out to improve performance compared with
the previous addis+addi (about 15% on a typical case); the weak point is that
one more register is used and one more instruction is generated, i.e.:

addis 3,3,0x8765
addi 3,3,0x4321

=>

lis 9,0x8765
ori 9,9,0x4321
add 3,3,9

No obvious performance and binary size change to SPEC2017.

gcc/ChangeLog:

2020-03-30  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3 for SDI): Don't split before reload,
move constant to temp register for add.

gcc/testsuite/ChangeLog:

2020-03-26  Xiong Hu Luo  

* gcc.target/powerpc/add-const.c: New.
---
 gcc/config/rs6000/rs6000.md  | 51 +---
 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 +++
 2 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad88b6783af..76af3d5b1d9 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1729,34 +1729,31 @@ (define_expand "add<mode>3"
 
   if (CONST_INT_P (operands[2]) && !add_operand (operands[2], mode))
 {
-  rtx tmp = ((!can_create_pseudo_p ()
- || rtx_equal_p (operands[0], operands[1]))
-? operands[0] : gen_reg_rtx (mode));
-
-  /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], mode);
- emit_insn (gen_add3 (operands[0], operands[1], operands[0]));
- DONE;
-   }
-
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0x) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, mode);
-
-  if (mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
+  if (can_create_pseudo_p ())
+  {
+   rtx tmp = rtx_equal_p (operands[0], operands[1])
+ ? operands[0] : gen_reg_rtx (mode);
 
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add3 (operands[0], tmp, GEN_INT (low)));
-  DONE;
+   rs6000_emit_move (tmp, operands[2], mode);
+   emit_insn (gen_add3 (operands[0], operands[1], tmp));
+   DONE;
+  }
+  else
+  {
+   HOST_WIDE_INT val = INTVAL (operands[2]);
+   HOST_WIDE_INT low = ((val & 0x) ^ 0x8000) - 0x8000;
+   HOST_WIDE_INT rest = trunc_int_for_mode (val - low, mode);
+
+   if (mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
+ FAIL;
+
+   /* The ordering here is important for the prolog expander.
+  When space is allocated from the stack, adding 'low' first may
+  produce a temporary deallocation (which would be bad).  */
+   emit_insn 

Re: [PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-26 Thread luoxhu via Gcc-patches



On 2020/3/27 07:59, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Mar 25, 2020 at 11:15:22PM -0500, luo...@linux.ibm.com wrote:
>> frame_pointer_needed is set to true in reload pass setup_can_eliminate,
>> but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
>> r31 even it is used actually, causing CPU2006 465.tonto segment fault of
>> loading from invalid addresses.
> 
> If df_regs_ever_live_p(31) is false there is no hard frame pointer
> anywhere in the program.  How can it be used then?

There is a piece of code in pro_and_epilogue that emits a move instruction
to r31 even when df_regs_ever_live_p(31) is false.  As frame_pointer_needed
is true and is widely used in this function, I propose to save r31 in
save_reg_p instead of checking (frame_pointer_needed &&
df_regs_ever_live_p (31)); I haven't verified whether this works yet.
Is this reasonable?  Thanks.
 
rs6000-logue.c
void
rs6000_emit_prologue (void)
{
...
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26840)   /* Set frame 
pointer, if needed.  */
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26841)   if 
(frame_pointer_needed)
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26842) {
0d6c02bf24e4 (jakub  2005-06-30 14:26:32 + 26843)   insn = 
emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26844)   
 sp_reg_rtx);
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 + 26845)   
RTX_FRAME_RELATED_P (insn) = 1;
6b02f2a5c61e (meissner   1995-11-30 20:02:16 + 26846) }
d1bd513ed578 (kenner 1992-02-09 19:26:21 + 26847)
...
}

Xionghu

> 
> 
> Segher
> 



[PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-03-26 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

Remove the split code from add<mode>3, to allow a later pass to do the split.
This allows later logic to hoist the constant load out of add instructions.
In a loop, lis+ori can be hoisted out to improve performance compared with
the previous addis+addi (about 15% on a typical case); the weak point is that
one more register is used and one more instruction is generated, i.e.:

addis 3,3,0x8765
addi 3,3,0x4321

=>

lis 9,0x8765
ori 9,9,0x4321
add 3,3,9

gcc/ChangeLog:

2020-03-26  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3): Remove split code, move constant
to temp register before add.

gcc/testsuite/ChangeLog:

2020-03-26  Xiong Hu Luo  

* gcc.target/powerpc/add-const.c: New.
---
 gcc/config/rs6000/rs6000.md  | 25 ++--
 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 ++
 2 files changed, 20 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad88b6783af..72f3f604e0d 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1733,29 +1733,8 @@ (define_expand "add<mode>3"
  || rtx_equal_p (operands[0], operands[1]))
 ? operands[0] : gen_reg_rtx (mode));
 
-  /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], mode);
- emit_insn (gen_add3 (operands[0], operands[1], operands[0]));
- DONE;
-   }
-
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0x) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, mode);
-
-  if (mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
-
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add3 (operands[0], tmp, GEN_INT (low)));
+  rs6000_emit_move (tmp, operands[2], mode);
+  emit_insn (gen_add3 (operands[0], operands[1], tmp));
   DONE;
 }
 })
diff --git a/gcc/testsuite/gcc.target/powerpc/add-const.c 
b/gcc/testsuite/gcc.target/powerpc/add-const.c
new file mode 100644
index 000..e1007247b32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/add-const.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O3 -fno-unroll-loops" } */
+
+/* Ensure the lis,ori are generated, which indicates they have
+   been hoisted outside of the loop.  */
+
+typedef unsigned long ulong;
+ulong
+foo (ulong n, ulong h)
+{
+  int i;
+  for (i = 0; i < n; i++)
+h = ((h + 8) | h) + 0x87654321;
+  return h;
+}
+
+/* { dg-final { scan-assembler-times {\mlis\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mori\M} 1 } } */
-- 
2.21.0.777.g83232e3864



[PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-25 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

This P1 bug is exposed by the FRE refactoring of r263875.  Comparing the fre
dump files shows no obvious change in the function that segfaults, which
proves it to be a target issue.
frame_pointer_needed is set to true in the reload pass setup_can_eliminate,
but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
r31 even though it is actually used, causing a CPU2006 465.tonto segmentation
fault from loads at invalid addresses.

Bootstrap and regression tested pass on Power8-LE.  Backport to gcc-9
required once approved.

gcc/ChangeLog

2020-03-26  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (save_reg_p): Save r31 if
frame_pointer_needed.
---
 gcc/config/rs6000/rs6000-logue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..7b62ff03afd 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -116,6 +116,9 @@ save_reg_p (int reg)
return true;
 }
 
+  if (reg == HARD_FRAME_POINTER_REGNUM && frame_pointer_needed)
+return true;
+
   return !call_used_or_fixed_reg_p (reg) && df_regs_ever_live_p (reg);
 }
 
-- 
2.21.0.777.g83232e3864



Re: [PATCH] Backport to gcc-9: PR92398: Fix testcase failure of pr72804.c

2020-03-09 Thread luoxhu



On 2020/3/10 05:28, Segher Boessenkool wrote:

On Thu, Mar 05, 2020 at 02:21:58AM -0600, luo...@linux.ibm.com wrote:

From: Xionghu Luo 

Backport the patch to fix failures on P9 and P8BE, P7LE for PR94036.


No changes were needed?


Yes, no conflicts in the patch, and the instruction counts are the same as
on master.  Committed in r9-8357 (85c08558c66dd8e2000a4ad282ca03368028fce3).


Thanks
Xionghu




Tested pass on P9/P8/P7, ok to commit?


Yes, okay for 9.  Thanks!


Segher



2020-03-05  Luo Xiong Hu  

backport from master.
PR testsuite/94036

2019-12-02  Luo Xiong Hu  

PR testsuite/92398
* gcc.target/powerpc/pr72804.c: Split the store function to...
* gcc.target/powerpc/pr92398.h: ... this one.  New.
* gcc.target/powerpc/pr92398.p9+.c: New.
* gcc.target/powerpc/pr92398.p9-.c: New.
* lib/target-supports.exp (check_effective_target_p8): New.
(check_effective_target_p9+): New.




[PATCH] Backport to gcc-9: PR92398: Fix testcase failure of pr72804.c

2020-03-05 Thread luoxhu
From: Xionghu Luo 

Backport the patch to fix failures on P9 and P8BE, P7LE for PR94036.
Tested pass on P9/P8/P7, ok to commit?
(gcc-8 is not needed as the test doesn't exist.)

The P9LE generated instructions are not worse than P8LE:
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
It can have longer latency, but latency via memory is not so critical,
and this does save decode and other resources.  It's hard to choose
which is best.  Update the test case to fix failures.

gcc/testsuite/ChangeLog:

2020-03-05  Luo Xiong Hu  

backport from master.
PR testsuite/94036

2019-12-02  Luo Xiong Hu  

PR testsuite/92398
* gcc.target/powerpc/pr72804.c: Split the store function to...
* gcc.target/powerpc/pr92398.h: ... this one.  New.
* gcc.target/powerpc/pr92398.p9+.c: New.
* gcc.target/powerpc/pr92398.p9-.c: New.
* lib/target-supports.exp (check_effective_target_p8): New.
(check_effective_target_p9+): New.
---
 gcc/testsuite/gcc.target/powerpc/pr72804.c| 16 +++
 gcc/testsuite/gcc.target/powerpc/pr92398.h| 17 
 .../gcc.target/powerpc/pr92398.p9+.c  | 12 +++
 .../gcc.target/powerpc/pr92398.p9-.c  | 10 ++
 gcc/testsuite/lib/target-supports.exp | 20 +++
 5 files changed, 62 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index b83b6350d75..c0711499ae5 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -9,17 +9,7 @@ foo (__int128_t *src)
   return ~*src;
 }
 
-void
-bar (__int128_t *dst, __int128_t src)
-{
-  *dst =  ~src;
-}
 
-/* { dg-final { scan-assembler-times "not " 4 } } */
-/* { dg-final { scan-assembler-times "std " 2 } } */
-/* { dg-final { scan-assembler-times "ld " 2 } } */
-/* { dg-final { scan-assembler-not "lxvd2x" } } */
-/* { dg-final { scan-assembler-not "stxvd2x" } } */
-/* { dg-final { scan-assembler-not "xxpermdi" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mnot\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlxvd2x\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.h 
b/gcc/testsuite/gcc.target/powerpc/pr92398.h
new file mode 100644
index 000..5a4a8bcab80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.h
@@ -0,0 +1,17 @@
+/* This test code is included into pr92398.p9-.c and pr92398.p9+.c.
+   The two files have the tests for the number of instructions generated for
+   P9- versus P9+.
+
+   store generates different instructions as below:
+   P9+: mtvsrdd;xxlnot;stxv.
+   P8/P7/P6 LE: not;not;std;std.
+   P8 BE: mtvsrd;mtvsrd;xxpermdi;xxlnor;stxvd2x.
+   P7/P6 BE: std;std;addi;lxvd2x;xxlnor;stxvd2x.
+   P9+ and P9- LE are expected, P6/P7/P8 BE are unexpected.  */
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c 
b/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
new file mode 100644
index 000..a819c3f16af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { lp64 && p9+ } } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 1 } } */
+/* { dg-final { scan-assembler-not {\mld\M} } } */
+/* { dg-final { scan-assembler-not {\mnot\M} } } */
+
+/* Source code for the test in pr92398.h */
+#include "pr92398.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c 
b/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c
new file mode 100644
index 000..065ae73f267
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { lp64 && {! p9+} } } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* { dg-final { scan-assembler-times {\mnot\M} 2 { xfail be } } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 { xfail { p8 && be } } } } */
+
+/* Source code for the test in pr92398.h */
+#include "pr92398.h"
+
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index cda0f3d350b..ea9a50ccb27 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2555,6 +2555,26 @@ proc check_effective_target_le { } {
 }]
 }
 
+# Return 1 if we're generating code for only power8 

*Ping^1* [PATCH v3] ipa-cp: Fix PGO regression caused by r278808

2020-03-03 Thread luoxhu
Hi Honza,

The performance regression still exists. For exchange2, the performance
is about 28% slower for option:
"-fprofile-generate/-fprofile-use --param ipa-cp-eval-threshold=0 --param 
ipa-cp-unit-growth=80 -fno-inline".

r278808:
commit ad06966f6677d55c11214d9c7b6d5518f915e341
Author: hubicka 
Date:   Thu Nov 28 14:16:29 2019 +

* ipa-cp.c (update_profiling_info): Fix scaling.


Fix v3 patch and logs are here:
https://gcc.gnu.org/ml/gcc-patches/2020-01/msg00764.html


Thanks
Xionghu

On 2020/1/14 14:45, luoxhu wrote:
> Hi,
> 
> On 2020/1/3 00:58, Jan Hubicka wrote:
>>> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
>>> index 14064ae0034..947bf7c7199 100644
>>> --- a/gcc/ipa-cp.c
>>> +++ b/gcc/ipa-cp.c
>>> @@ -4272,6 +4272,31 @@ update_profiling_info (struct cgraph_node 
>>> *orig_node,
>>>     false);
>>>     new_sum = stats.count_sum;
>>> +  class ipa_node_params *info = IPA_NODE_REF (orig_node);
>>> +  if (info && info->node_is_self_scc)
>>> +    {
>>> +  profile_count self_recursive_count;
>>> +
>>> +  /* The self recursive edge is on the orig_node.  */
>>> +  for (cs = orig_node->callees; cs; cs = cs->next_callee)
>>> +    if (ipa_edge_within_scc (cs))
>>> +  {
>>> +    cgraph_node *callee = cs->callee->function_symbol ();
>>> +    if (callee && cs->caller == cs->callee)
>>> +  {
>>> +    self_recursive_count = cs->count;
>>> +    break;
>>> +  }
>>> +  }
>>
>> What happens here when there are multiple self recursive calls or when
>> the SCC has two mutually recursive functions?
>>
>> I am still confused by the logic of this function.  I will take a deeper
>> look at your previous email.
>>> +
>>> +  /* Proportion count for self recursive node from all callers.  */
>>> +  new_sum
>>> +    = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
>>> +
>>> +  /* Proportion count for specialized cloned node.  */
>>> +  new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
>>> +    }
>>> +
>>>     if (orig_node_count < orig_sum + new_sum)
>>>   {
>>>     if (dump_file)
>>> diff --git a/gcc/params.opt b/gcc/params.opt
>>> index d88ae0c468b..40a03b04580 100644
>>> --- a/gcc/params.opt
>>> +++ b/gcc/params.opt
>>> @@ -199,7 +199,7 @@ Common Joined UInteger 
>>> Var(param_ipa_cp_loop_hint_bonus) Init(64) Param
>>>   Compile-time bonus IPA-CP assigns to candidates which make loop bounds 
>>> or strides known.
>>>   -param=ipa-cp-max-recursive-depth=
>>> -Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param
>>> +Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) 
>>> IntegerRange(1, 10) Param
>>>   Maximum depth of recursive cloning for self-recursive function.
>>
>> The values stats from 0 but I also wonder why 10 is a good upper bound?
>> If the function calls itself with one type of value (like depth-1) then
>> we may clone well over 10 times, if it calls itself with two different
>> sets then 8 is already quite high I would say...
>>
>> I suppose the call probabilities will eventually drop to be very low,
>> but I am not quite sure about that because we do not handle any IP
>> frequency propagation.  Do we have some way to treat wide trees? Or do
>> we clone only all self recursive calls are the same?
>>
>> Honza
> 
> Update v3 patch.  This regression could only be reproduced when built with
> "-fprofile-generate/-fprofile-use --param ipa-cp-eval-threshold=0 --param
> ipa-cp-unit-growth=80" on exchange_2, on some platforms -fno-inline may be
> also needed, I attached 3 files (compressed to exchange.tar.gz)
> exchange2_gcc.use.orig.wpa.071i.cp.old, exchange2_gcc.use.orig.wpa.071i.cp.new
> and cp.diff to show the profile count difference of specialized node
> digits_2.constprop/152 to digits_2.constprop/159 without/with this patch.
> 
> Profile count is decreasing slowly with this patch instead of keeping very
> small from the first clone (very low count as cold will cause complete unroll
> not working), it still differs from really execution (exchange2.png), but this
> average model takes the recursive edge as feedback.  Thanks.
> 
> 
> v3:
> 1. Enable proportion orig_sum to the new nodes for self recursive node (k 
> means
>    multiple self recursive calls):

Re: [RFC] Run store-merging pass once more before pass fre/pre

2020-02-26 Thread luoxhu
On 2020/2/18 17:57, Richard Biener wrote:
> On Tue, 18 Feb 2020, Xionghu Luo wrote:
> 
>> Store-merging pass should run twice, the reason is pass fre/pre will do
>> some kind of optimizations to instructions by:
>>1. Converting the load from address to load from function arguments
>>(store_merging_30.c:foo1).
>>2. Converting the byte access to BIT_FIELD_REF(store_merging_30.c:foo2).
>>3. Other bitfield combinations or potential interference optimizations 
>> etc.
>> These optimizations will break the store chain, store-merging pass fails
>> to catch such kind of pattern so stores are not merged in middle end,
>> then consecutive stb/sth instructions(should be merged to stw) are emitted
>> finally.
>>
>> And why not directly move store-merging pass(numbered 194) just before
>> fre1(numbered 35) is for case store_merging_14.c, 5 merges are done by
>> store_merging1, and 4 merges are done fore store_merge2. So, keep the
>> original store_merge as store_merge2 as store merge may be still available
>> after other pass optimizations.  Most of the 30 store_merging_N.c test
>> case dg-final pass name would be updated from store-merging to
>> store-merging1 once this RFC patch idea got confirmed.
>> Any comments?  Thanks.
> 
> Generally trying to solve a pass ordering issue by re-ordering/duplicating
> passes might help a single testcase but will surely pessimize others.
> So that's a no-go.
> 
> What the testcase shows is that store-merging needs to operate
> similar to bswap when trying to find a "common" source for a combined
> store.  That is, it should appropriately follow defs.  For foo2 I expect
> it to be not too difficult, for foo1 I'd say we miss a FRE opportunity
> here (but Jakub is working on that one IIRC).

Thanks Richard, I'm not sure about my understanding, so please correct me if
I'm wrong.

I tried Jakub's latest patch "sccvn: Handle bitfields in push_partial_def".
I learned that the fre pass checks the load instruction's vuse chain and does
the constant bitfield combining in push_partial_def; then
native_encode_expr/native_interpret_expr can decode and encode the constant
content and shift/combine the data.
This requires one change to my test case (adding "return page->counters;") to
trigger the fre pass to push all the SSA names' partial_defs.  Currently, for
SSA variables, this encode/interpret is not supported yet; I suppose this is
the opportunity you mean.
As this case's input is not constant, Jakub's patch doesn't fix it.

struct page
{
  union
  {
unsigned counters;
struct
{
  union
  {
struct
{
  unsigned inuse : 16;
  unsigned inuse2 : 8;
  unsigned objects : 5;
  unsigned frozen : 3;
};
  };
};
  };
};

unsigned
foo1 (struct page *page, unsigned long counters_new)
{
struct page tmp;
tmp.counters = counters_new;
page->inuse   = tmp.inuse;
page->inuse2  = tmp.inuse2;
page->objects = tmp.objects;
page->frozen  = tmp.frozen;
return page->counters;
}

If "return page->counters;" is removed, this case won't match the fre's current 
code path
in vn_reference_lookup_3 without a load(_14 = page_9(D)->D.2912.counters) to 
walk all the vuses.
So it seems not a fre opportunity but exactly a store merging issue(Also 
checked llvm that it 
doesn't generate byte store and short store so it only produces 1 stw for all 
patterns).
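For clarity, merging the four bitfield stores is semantically just a single
word store, since the fields together cover all 32 bits of the union; i.e.
foo1 is equivalent to roughly this (sketch reusing the struct above, function
name made up):

unsigned
foo1_merged (struct page *page, unsigned long counters_new)
{
  /* One word store replaces the separate bitfield stores.  */
  page->counters = (unsigned) counters_new;
  return page->counters;
}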

Optimizing this case in the store-merging pass is reasonable, just as you
said, "trying to find a "common" source for a combined store."; it requires
changing the store-merging pass's behavior: so far the store-merging pass
tracks every store instruction's RHS and stops when the RHS is a load
instruction, and the convert instructions and bitfield instructions generated
by pass fre are ignored in pass_store_merging::process_store.
Teaching process_store to capture the real common source requires refactoring
handled_load to continue tracking even when the RHS is a load instruction,
and to support the convert instructions and bitfield instructions; this seems
to be a big change.

Regarding Jakub's comments, merging could reduce many byte stores and
halfword stores to improve performance for this type of case.  There is
already an 033t.esra pass running before, and I'm not sure whether SRA should
replace such kinds of bitfield operations.
Adding another store-merging pass is quite simple, and many passes already
run more than once...
So which would be best for this optimization, please?  Thanks again :)

Xionghu

> 
> Richard.
> 
>> PS:
>> Before this patch, store_merging_30.c.035t.fre1:
>>
>> ... foo1:
>> Inserted _13 = (short unsigned int) counters_new_5(D);
>> Replaced tmp.D.2912.D.2911.D.2910.D.2909.inuse with _13 in all uses of
>> _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse;
>> Removing dead stmt _1 = tmp.D.2912.D.2911.D.2910.D.2909.inuse;
>> ... foo2:
>> Inserted _17 = BIT_FIELD_REF <_1, 8, 16>;
>> Replaced tmp.D.2926.D.2925.D.2924.D.2923.objects with _17 in all uses of
>> _3 = 

Ping^1: [PATCH v3] ipa-cp: Fix PGO regression caused by r278808

2020-02-09 Thread luoxhu
Ping,

attachment of 
https://gcc.gnu.org/ml/gcc-patches/2020-01/msg00764/exchange2.tar.gz
shows the profile count difference on cloned nodes digits_2.constprop.[0...8]
without/with this patch.  Thanks!

Xiong Hu


On 2020/1/14 14:45, luoxhu wrote:
> Hi,
> 
> On 2020/1/3 00:58, Jan Hubicka wrote:
>>> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
>>> index 14064ae0034..947bf7c7199 100644
>>> --- a/gcc/ipa-cp.c
>>> +++ b/gcc/ipa-cp.c
>>> @@ -4272,6 +4272,31 @@ update_profiling_info (struct cgraph_node 
>>> *orig_node,
>>>     false);
>>>     new_sum = stats.count_sum;
>>> +  class ipa_node_params *info = IPA_NODE_REF (orig_node);
>>> +  if (info && info->node_is_self_scc)
>>> +    {
>>> +  profile_count self_recursive_count;
>>> +
>>> +  /* The self recursive edge is on the orig_node.  */
>>> +  for (cs = orig_node->callees; cs; cs = cs->next_callee)
>>> +    if (ipa_edge_within_scc (cs))
>>> +  {
>>> +    cgraph_node *callee = cs->callee->function_symbol ();
>>> +    if (callee && cs->caller == cs->callee)
>>> +  {
>>> +    self_recursive_count = cs->count;
>>> +    break;
>>> +  }
>>> +  }
>>
>> What happens here when there are multiple self recursive calls or when
>> the SCC has two mutually recursive functions?
>>
>> I am still confused by the logic of this function.  I will take a deeper
>> look at your previous email.
>>> +
>>> +  /* Proportion count for self recursive node from all callers.  */
>>> +  new_sum
>>> +    = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
>>> +
>>> +  /* Proportion count for specialized cloned node.  */
>>> +  new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
>>> +    }
>>> +
>>>     if (orig_node_count < orig_sum + new_sum)
>>>   {
>>>     if (dump_file)
>>> diff --git a/gcc/params.opt b/gcc/params.opt
>>> index d88ae0c468b..40a03b04580 100644
>>> --- a/gcc/params.opt
>>> +++ b/gcc/params.opt
>>> @@ -199,7 +199,7 @@ Common Joined UInteger 
>>> Var(param_ipa_cp_loop_hint_bonus) Init(64) Param
>>>   Compile-time bonus IPA-CP assigns to candidates which make loop bounds 
>>> or strides known.
>>>   -param=ipa-cp-max-recursive-depth=
>>> -Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param
>>> +Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) 
>>> IntegerRange(1, 10) Param
>>>   Maximum depth of recursive cloning for self-recursive function.
>>
>> The values stats from 0 but I also wonder why 10 is a good upper bound?
>> If the function calls itself with one type of value (like depth-1) then
>> we may clone well over 10 times, if it calls itself with two different
>> sets then 8 is already quite high I would say...
>>
>> I suppose the call probabilities will eventually drop to be very low,
>> but I am not quite sure about that because we do not handle any IP
>> frequency propagation.  Do we have some way to treat wide trees? Or do
>> we clone only all self recursive calls are the same?
>>
>> Honza
> 
> Update v3 patch.  This regression could only be reproduced when built with
> "-fprofile-generate/-fprofile-use --param ipa-cp-eval-threshold=0 --param
> ipa-cp-unit-growth=80" on exchange_2, on some platforms -fno-inline may be
> also needed, I attached 3 files (compressed to exchange.tar.gz)
> exchange2_gcc.use.orig.wpa.071i.cp.old, exchange2_gcc.use.orig.wpa.071i.cp.new
> and cp.diff to show the profile count difference of specialized node
> digits_2.constprop/152 to digits_2.constprop/159 without/with this patch.
> 
> Profile count is decreasing slowly with this patch instead of keeping very
> small from the first clone (very low count as cold will cause complete unroll
> not working), it still differs from really execution (exchange2.png), but this
> average model takes the recursive edge as feedback.  Thanks.
> 
> 
> v3:
> 1. Enable proportion orig_sum to the new nodes for self recursive node (k 
> means
>    multiple self recursive calls):
>    new_sum = (orig_sum + new_sum) * 1 / k \
>    * self_recursive_probability * (1 / param_ipa_cp_max_recursive_depth).
> 2. Two mutually recursive functions are not supported in self recursive
>    clone yet so also not supported in update_profiling_info here.
> 3. Improve value range for

Re: [PATCH] Add Optimization for various IPA parameters.

2020-01-13 Thread luoxhu




On 2020/1/11 20:20, Tamar Christina wrote:

Hi Martin,

This change (r280099) is causing a major performance regression on exchange2 in 
SPEC2017 dropping the benchmark by more than 30%.

It seems the parameters no longer do anything. i.e. -flto --param 
ipa-cp-eval-threshold=1 --param ipa-cp-unit-growth=80 doesn't have any effect 
anymore.


Strange that you don't need -fno-inline to achieve the 30% performance
boost; I found that some specialized nodes are recursively inlined, causing
register spills and no performance improvement.  What's your platform, and
could you please try it with -fno-inline and see if there is any difference?


Thanks
Xiong Hu


Thanks,
Tamar


-Original Message-
From: gcc-patches-ow...@gcc.gnu.org 
On Behalf Of Martin Jambor
Sent: Wednesday, January 8, 2020 4:14 PM
To: Martin Liška ; gcc-patches@gcc.gnu.org
Cc: hubi...@ucw.cz; Richard Biener 
Subject: Re: [PATCH] Add Optimization for various IPA parameters.

Hi,

On Fri, Jan 03 2020, Martin Liška wrote:

Hi.

This is similar transformation for IPA passes. This time, one needs to
use opt_for_fn in order to get the right parameter values.

@Martin, Honza:
There are last few remaining parameters which should use
opt_for_fn:

param_ipa_cp_unit_growth


So as we discussed, picking this one from one particular node is not what one
would expect to happen, but inlining does it too and so anyway:


IPA-CP: Always access param_ipcp_unit_growth through opt_for_fn

2020-01-07  Martin Jambor  

* params.opt (param_ipcp_unit_growth): Mark as Optimization.
* ipa-cp.c (max_new_size): Removed.
(orig_overall_size): New variable.
(get_max_overall_size): New function.
(estimate_local_effects): Use it.  Adjust dump.
(decide_about_value): Likewise.
(ipcp_propagate_stage): Do not calculate max_new_size, just store
orig_overall_size.  Adjust dump.
(ipa_cp_c_finalize): Clear orig_overall_size instead of max_new_size.
---
  gcc/ipa-cp.c   | 39 ++-
  gcc/params.opt |  2 +-
  2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c index 9e20e278eff..c2572e3e0e8
100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -375,7 +375,7 @@ static profile_count max_count;

  /* Original overall size of the program.  */

-static long overall_size, max_new_size;
+static long overall_size, orig_overall_size;

  /* Node name to unique clone suffix number map.  */  static
hash_map *clone_num_suffixes; @@ -3395,6
+3395,23 @@ perform_estimation_of_a_value (cgraph_node *node,
vec known_csts,
val->local_size_cost = size;
  }

+/* Get the overall limit of growth based on parameters extracted from growth.
+   It does not really make sense to mix functions with different overall
+   growth limits, but it is possible, and if it happens, we do not want to
+   select one limit at random.  */
+
+static long
+get_max_overall_size (cgraph_node *node) {
+  long max_new_size = orig_overall_size;
+  long large_unit = opt_for_fn (node->decl, param_large_unit_insns);
+  if (max_new_size < large_unit)
+max_new_size = large_unit;
+  int unit_growth = opt_for_fn (node->decl, param_ipcp_unit_growth);
+  max_new_size += max_new_size * unit_growth / 100 + 1;
+  return max_new_size;
+}
+
  /* Iterate over known values of parameters of NODE and estimate the local
 effects in terms of time and size they have.  */

@@ -3457,7 +3474,7 @@ estimate_local_effects (struct cgraph_node *node)
   stats.freq_sum, stats.count_sum,
   size))
{
- if (size + overall_size <= max_new_size)
+ if (size + overall_size <= get_max_overall_size (node))
{
  info->do_clone_for_all_contexts = true;
  overall_size += size;
@@ -3467,8 +3484,8 @@ estimate_local_effects (struct cgraph_node *node)
 "known contexts, growth deemed beneficial.\n");
}
  else if (dump_file && (dump_flags & TDF_DETAILS))
-   fprintf (dump_file, "   Not cloning for all contexts because "
-"max_new_size would be reached with %li.\n",
+   fprintf (dump_file, "  Not cloning for all contexts because "
+"maximum unit size would be reached with %li.\n",
 size + overall_size);
}
else if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3860,14
+3877,10 @@ ipcp_propagate_stage (class ipa_topo_info *topo)
  max_count = max_count.max (node->count.ipa ());
}

-  max_new_size = overall_size;
-  if (max_new_size < param_large_unit_insns)
-max_new_size = param_large_unit_insns;
-  max_new_size += max_new_size * param_ipcp_unit_growth / 100 + 1;
+  orig_overall_size = overall_size;

if (dump_file)
-fprintf (dump_file, "\noverall_size: %li, max_new_size: %li\n",
-overall_size, max_new_size);
+fprintf (dump_file, 

Re: [PATCH v7] Missed function specialization + partial devirtualization

2020-01-12 Thread luoxhu
On 2020/1/10 19:08, Jan Hubicka wrote:
> OK. You will need to do the obvious updates for Martin's patch
> which turned some member functions into static functions.
> 
> Honza

Thanks a lot!  Rebased & updated, will commit below patch shortly when git push 
is ready.


v8:
 1. Rebase to master, merging with Martin's static function changes (r280043).
Bootstrap/testsuite/SPEC2017 tests pass on Power8-LE.
 2. TODO:
2.1. C++ devirt for multiple speculative call targets.
2.2. ipa-icf ipa_merge_profiles refine with COMDAT inline testcase.


This patch aims to fix PR69678 caused by PGO indirect call profiling
performance issues.
The bug that profiling data is never working was fixed by Martin's pull
back of topN patches, performance got GEOMEAN ~1% improvement(+24% for
511.povray_r specifically).
Still, currently the default profile only generates SINGLE indirect target
that called more than 75%.  This patch leverages MULTIPLE indirect
targets use in LTO-WPA and LTO-LTRANS stage, as a result, function
specialization, profiling, partial devirtualization, inlining and
cloning could be done successfully based on it.
Performance can get improved from 0.70 sec to 0.38 sec on simple tests.
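At the source level, the multi-target speculative transformation amounts to
roughly the following (hand-written sketch, names made up; the real transform
is done on gimple by gimple_ic, as described in the details below):

void (*fp) (int);
void hot1 (int);
void hot2 (int);

void
call_it (int x)
{
  if (fp == hot1)
    hot1 (x);        /* first speculative direct call, may be inlined  */
  else if (fp == hot2)
    hot2 (x);        /* second speculative target enabled by this patch  */
  else
    fp (x);          /* fallback indirect call  */
}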
Details are:
  1.  PGO with topn is enabled by default now, but only one indirect
  target edge will be generated in ipa-profile pass, so add variables to enable
  multiple speculative edges through passes, speculative_id will record the
  direct edge index bind to the indirect edge, indirect_call_targets length
  records how many direct edges owned by the indirect edge, postpone gimple_ic
  to ipa-profile like default as inline pass will decide whether it is benefit
  to transform indirect call.
  2.  Use speculative_id to track and search the reference node matched
  with the direct edge's callee for multiple targets.  Actually, it is the
  caller's responsibility to handle the direct edges mapped to same indirect
  edge.  speculative_call_info will return one of the direct edge specified,
  this will leverage current IPA edge process framework mostly.
  3.  Enable LTO WPA/LTRANS stage multiple indirect call targets analysis for
  profile full support in ipa passes and cgraph_edge functions.  speculative_id
  can be set by make_speculative id when multiple targets are binded to
  one indirect edge, and cloned if new edge is cloned.  speculative_id
  is streamed out and stream int by lto like lto_stmt_uid.
  4.  Create and duplicate all speculative direct edge's call summary
  in ipa-fnsummary.c with auto_vec.
  5.  Add 1 in module testcase and 2 cross module testcases.
  6.  Bootstrap and regression test passed on Power8-LE.  No function
  and performance regression for SPEC2017.

gcc/ChangeLog

2020-01-13  Xiong Hu Luo  

PR ipa/69678
* cgraph.c (symbol_table::create_edge): Init speculative_id and
target_prob.
(cgraph_edge::make_speculative): Add param for setting speculative_id
and target_prob.
(cgraph_edge::speculative_call_info): Update comments and find reference
by speculative_id for multiple indirect targets.
(cgraph_edge::resolve_speculation): Decrease the speculations
for indirect edge, drop it's speculative if not direct target
left. Update comments.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_node::dump): Print num_speculative_call_targets.
(cgraph_node::verify_node): Don't report error if speculative
edge not include statement.
(cgraph_edge::num_speculative_call_targets_p): New function.
* cgraph.h (int common_target_id): Remove.
(int common_target_probability): Remove.
(num_speculative_call_targets): New variable.
(make_speculative): Add param for setting speculative_id.
(cgraph_edge::num_speculative_call_targets_p): New declare.
(target_prob): New variable.
(speculative_id): New variable.
* ipa-fnsummary.c (analyze_function_body): Create and duplicate
  call summaries for multiple speculative call targets.
* cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
* ipa-profile.c (struct speculative_call_target): New struct.
(class speculative_call_summary): New class.
(class speculative_call_summaries): New class.
(call_sums): New variable.
(ipa_profile_generate_summary): Generate indirect multiple targets 
summaries.
(ipa_profile_write_edge_summary): New function.
(ipa_profile_write_summary): Stream out indirect multiple targets 
summaries.
(ipa_profile_dump_all_summaries): New function.
(ipa_profile_read_edge_summary): New function.
(ipa_profile_read_summary_section): New function.
(ipa_profile_read_summary): Stream in indirect multiple targets 
summaries.
(ipa_profile): Generate num_speculative_call_targets from
profile summaries.
* ipa-ref.h (speculative_id): 

Re: [PATCH] Use cgraph_node::dump_{asm_},name where possible.

2020-01-08 Thread luoxhu




On 2020/1/8 22:54, Martin Liška wrote:

diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c
index bd44063a1ac..789564ba335 100644
--- a/gcc/cgraphclones.c
+++ b/gcc/cgraphclones.c
@@ -1148,8 +1148,7 @@ symbol_table::materialize_all_clones (void)
  if (symtab->dump_file)
{
  fprintf (symtab->dump_file, "cloning %s to %s\n",
-  xstrdup_for_dump (node->clone_of->name ()),
-  xstrdup_for_dump (node->name ()));
+  node->clone_of->dump_name (), node->name ());


Also node->dump_name () here?



Re: [PATCH] ipa-inline: Adjust condition for caller_growth_limits

2020-01-07 Thread luoxhu


On 2020/1/7 23:40, Jan Hubicka wrote:
>>
>>
>> On 2020/1/7 16:40, Jan Hubicka wrote:
 On Mon, 2020-01-06 at 01:03 -0600, Xiong Hu Luo wrote:
> Inline should return failure either (newsize > param_large_function_insns)
> OR (newsize > limit).  Sometimes newsize is larger than
> param_large_function_insns, but smaller than limit, inline doesn't return
> failure even if the new function is a large function.
> Therefore, param_large_function_insns and param_large_function_growth 
> should be
> OR instead of AND, otherwise --param large-function-growth won't
> work correctly with --param large-function-insns.
>
> gcc/ChangeLog:
>
>   2020-01-06  Luo Xiong Hu  
>
>   ipa-inline-analysis.c (estimate_growth): Fix typo.
>   ipa-inline.c (caller_growth_limits): Use OR instead of AND.
 OK
>>>
>>> This patch also seems wrong.  Inliner should return failure when
>>>newsize > param_large_function_insns && newsize > limit
>>> The reason is same as with large_unit_insns.  We used to have only
>>> growth limits but it has turned out that this gets too
>>> restrictive/random for very small units/growths.
>>>
>>> So the logic is meant to be that inlining is OK either
>>>- if function is reasonably small (large_function_insns limit is not
>>>  met)
>>>- or it does not grow too much (large-function-growth isnot met)
>>
>> Sorry for checking too fast.  Will revert once confirmed.
> 
> Yes, i can confirm that the origina test was as intended and it is OK to
> revert the patch (I am currently traveling)

Partially reverted in r279986 and remained the typo fix as it is obvious.

> 
>> large_function_insns is default to 2700, and large-function-growth is
>> default to 100, so inline a large function with insn 2700 to another large
>> function with insn 2700 is allowed here (limit reach to 5400), I observed
>> HUGE performance drop in some cases for this behavior(large function inline
>> large function due to register spilling) compared with not inlined, that's
>> why this patch comes out.
>>
>>  From performance point of view,it seems benefitial to inline small function 
>> to
>> large function, but negative when inline large function to large 
>> function(causing
>> register spill)?  2700 seems too large for defining a function to be large 
>> and and
>> 5400 also seems too big for growth limit?
>>
>> This inline happens at "Inlining one function called once", it's out of
>> inline_small_functions and since newsize > param_large_function_insns, so I
>> suppose it won't change behavior for inlining small functions?
> 
> large-function-growth/insns is not meant to improve performance.  The
> reason why they exist is the fact that the compiler is non-linear
> in some cases, and thus producing very large functions makes it slow or
> makes it give up on some optimizations.
> 
> The problem that inlining a larger function into a cold path of another
> function sometimes leads to a slowdown is a rather old problem (see PR49194).
> In general most of the logic in the inliner is about determining when inlining
> is going to be useful.  It is hard to tell when it is *not* going to be
> useful, so I am a bit unsure what to do about these cases in general.
> 
> Recently I experimented with a patch which disables inlining functions called
> once when callee and caller frequency differ (i.e. one is hot, the other is
> cold) etc., but it did not lead to consistent performance improvements.
> 
> What situation did you run into?

Thanks.  So a caller could be {hot, cold} + {large, small}, and the same for
the callee, which produces up to 4 * 4 = 16 combinations.  Agreed that it is
hard to define "useful", and "useful" certainly doesn't always translate into
performance improvements. :)

My case is A1(1) calls A2(2), and A2(2) calls A3(3).  A1, A2 and A3 are
specialized cloned nodes with different inputs; they are all hot, called once,
and each has about 1000+ insns.  By default, neither large-function-growth nor
large-function-insns is reached, so A1 will inline A2 and A2 will inline A3,
which is 40% slower than not inlining.

If I adjust large-function-growth/insns to allow only A2 to inline A3, the
performance is 20% slower than not inlining.
All these inlinings happen in functions called once.
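
For reference, a minimal sketch of the check in caller_growth_limits as the
intended logic was described above (a simplified illustration only; the exact
variable names, the CIF code and where the check sits in the real function are
assumptions):

  /* Reject only when the combined body is both "really large" (over
     --param large-function-insns) AND also exceeds the growth limit
     derived from --param large-function-growth; small units stay
     unrestricted.  */
  if (newsize > param_large_function_insns && newsize > limit)
    {
      e->inline_failed = CIF_LARGE_FUNCTION_GROWTH_LIMIT;
      return false;
    }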


Xionghu

> 
> Honza
> 



Re: [PATCH] ipa-inline: Adjust condition for caller_growth_limits

2020-01-07 Thread luoxhu



On 2020/1/7 16:40, Jan Hubicka wrote:
>> On Mon, 2020-01-06 at 01:03 -0600, Xiong Hu Luo wrote:
>>> Inline should return failure either (newsize > param_large_function_insns)
>>> OR (newsize > limit).  Sometimes newsize is larger than
>>> param_large_function_insns, but smaller than limit, inline doesn't return
>>> failure even if the new function is a large function.
>>> Therefore, param_large_function_insns and param_large_function_growth 
>>> should be
>>> OR instead of AND, otherwise --param large-function-growth won't
>>> work correctly with --param large-function-insns.
>>>
>>> gcc/ChangeLog:
>>>
>>> 2020-01-06  Luo Xiong Hu  
>>>
>>> ipa-inline-analysis.c (estimate_growth): Fix typo.
>>> ipa-inline.c (caller_growth_limits): Use OR instead of AND.
>> OK
> 
> This patch also seems wrong.  Inliner should return failure when
>   newsize > param_large_function_insns && newsize > limit
> The reason is same as with large_unit_insns.  We used to have only
> growth limits but it has turned out that this gets too
> restrictive/random for very small units/growths.
> 
> So the logic is meant to be that inlining is OK either
>   - if function is reasonably small (large_function_insns limit is not
> met)
>   - or it does not grow too much (large-function-growth is not met)

Sorry for checking this in too fast.  Will revert once confirmed.
large_function_insns defaults to 2700, and large-function-growth defaults
to 100, so inlining a large function with 2700 insns into another large
function with 2700 insns is allowed here (the limit reaches 5400).  I observed
a HUGE performance drop in some cases from this behavior (a large function
inlining another large function causes register spilling) compared with not
inlining; that's why this patch came about.

From a performance point of view, it seems beneficial to inline a small
function into a large function, but negative to inline a large function into
a large function (causing register spills)?  2700 seems too large for defining
a function as large, and 5400 also seems too big for the growth limit?

This inlining happens at "Inlining one function called once"; it is outside
inline_small_functions, and since newsize > param_large_function_insns, I
suppose it won't change behavior for inlining small functions?

> 
> @item large-function-insns
> The limit specifying really large functions.  For functions larger than
> this limit after inlining, inlining is constrained by @option{--param
> large-function-growth}.  This parameter is useful primarily to avoid
> extreme compilation time caused by non-linear algorithms used by the
> back end.
> 
> 
> Honza
>> jeff
>>



Re: [PATCH] ipa-inline: Adjust condition for caller_growth_limits

2020-01-06 Thread luoxhu

On 2020/1/7 02:01, Jeff Law wrote:

On Mon, 2020-01-06 at 01:03 -0600, Xiong Hu Luo wrote:

Inlining should return failure when either (newsize > param_large_function_insns)
OR (newsize > limit).  Sometimes newsize is larger than
param_large_function_insns but smaller than limit, and inlining doesn't return
failure even though the new function is a large function.
Therefore, param_large_function_insns and param_large_function_growth should be
combined with OR instead of AND, otherwise --param large-function-growth won't
work correctly with --param large-function-insns.

gcc/ChangeLog:

2020-01-06  Luo Xiong Hu  

ipa-inline-analysis.c (estimate_growth): Fix typo.
ipa-inline.c (caller_growth_limits): Use OR instead of AND.

OK
jeff


Thanks, committed in r279942.

XiongHu






Re: [PATCH v2] ipa-cp: Fix PGO regression caused by r278808

2019-12-30 Thread luoxhu



On 2019/12/31 14:43, Feng Xue OS wrote:

One comment: it's better to allow zero value for 
param_ipa_cp_max_recursive_depth,
this can be used to disable recursive cloning.


Thanks, "1" means no recursive cloning but only constant propagation from
caller to callee in your code?

ipa-cp.c, line 2019:

  /* Recursively generate lattice values with a limited count.  */
  FOR_EACH_VEC_ELT (val_seeds, i, src_val)
{
  for (int j = 1; j < param_ipa_cp_max_recursive_depth; j++)
{
  tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
 src_val, res_type);
  if (!cstval)
break;

  ret |= dest_lat->add_value (cstval, cs, src_val, src_idx,
  src_offset, &src_val, true);
  gcc_checking_assert (src_val);
}
}
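
A small note on the question above, based only on this loop (an illustrative
comment, not a code change):

  /* With param_ipa_cp_max_recursive_depth == 1 the loop above iterates
     zero times (j starts at 1 and 1 < 1 is false), so no recursively
     propagated lattice values are added here; with the default of 8 it
     can chain at most 7 such values per seed.  */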

XiongHu



Feng
________
From: luoxhu 
Sent: Monday, December 30, 2019 4:11 PM
To: Jan Hubicka; Martin Jambor
Cc: Martin Liška; gcc-patches@gcc.gnu.org; seg...@kernel.crashing.org; 
wschm...@linux.ibm.com; guoji...@linux.ibm.com; li...@gcc.gnu.org; Feng Xue OS
Subject: [PATCH v2] ipa-cp: Fix PGO regression caused by r278808

v2 Changes:
1. Enable proportioning orig_sum to the new nodes for a self-recursive node:
new_sum = (orig_sum + new_sum) \
* self_recursive_probability * (1 / param_ipa_cp_max_recursive_depth).
2. Add value range for param_ipa_cp_max_recursive_depth.

The performance of exchange2 built with PGO decreases ~28% with r278808
due to profile counts being set incorrectly.  The cloned nodes are updated to a
very small count, causing the later pass cunroll to fail to unroll the
recursive function in exchange2.

digits_2 ->
digits_2.constprop.0, digits_2.constprop.1, etc.
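
To make the scaling concrete, a tiny helper with made-up numbers (all values
hypothetical; it only mirrors the two apply_scale steps in the hunk below,
where apply_scale (num, den) multiplies by num/den):

  static long
  scaled_clone_count (long orig_sum, long new_sum,
                      long self_recursive_count, int max_depth)
  {
    /* Step 1: proportion the combined count by the self-recursive share.  */
    long sum = (orig_sum + new_sum) * self_recursive_count / orig_sum;
    /* Step 2: split it evenly among the cloned nodes.  */
    return sum / max_depth;   /* e.g. 1000, 200, 800, 8  ->  120.  */
  }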

gcc/ChangeLog:

 2019-12-26  Luo Xiong Hu  

 * ipa-cp.c (update_profiling_info): Check self_scc node.
 * params.opt (ipa-cp-max-recursive-depth): Set param range.
---
  gcc/ipa-cp.c   | 25 +
  gcc/params.opt |  2 +-
  2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 14064ae0034..947bf7c7199 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -4272,6 +4272,31 @@ update_profiling_info (struct cgraph_node *orig_node,
   false);
new_sum = stats.count_sum;

+  class ipa_node_params *info = IPA_NODE_REF (orig_node);
+  if (info && info->node_is_self_scc)
+{
+  profile_count self_recursive_count;
+
+  /* The self recursive edge is on the orig_node.  */
+  for (cs = orig_node->callees; cs; cs = cs->next_callee)
+   if (ipa_edge_within_scc (cs))
+ {
+   cgraph_node *callee = cs->callee->function_symbol ();
+   if (callee && cs->caller == cs->callee)
+ {
+   self_recursive_count = cs->count;
+   break;
+ }
+ }
+
+  /* Proportion count for self recursive node from all callers.  */
+  new_sum
+   = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
+
+  /* Proportion count for specialized cloned node.  */
+  new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
+}
+
if (orig_node_count < orig_sum + new_sum)
  {
if (dump_file)
diff --git a/gcc/params.opt b/gcc/params.opt
index d88ae0c468b..40a03b04580 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -199,7 +199,7 @@ Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) 
Init(64) Param
  Compile-time bonus IPA-CP assigns to candidates which make loop bounds or 
strides known.

  -param=ipa-cp-max-recursive-depth=
-Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) 
IntegerRange(1, 10) Param
  Maximum depth of recursive cloning for self-recursive function.

  -param=ipa-cp-min-recursive-probability=
--
2.21.0.777.g83232e3864





[PATCH v2] ipa-cp: Fix PGO regression caused by r278808

2019-12-30 Thread luoxhu
v2 Changes:
1. Enable proportioning orig_sum to the new nodes for a self-recursive node:
   new_sum = (orig_sum + new_sum) \
   * self_recursive_probability * (1 / param_ipa_cp_max_recursive_depth).
2. Add value range for param_ipa_cp_max_recursive_depth.

The performance of exchange2 built with PGO decreases ~28% with r278808
due to profile counts being set incorrectly.  The cloned nodes are updated to a
very small count, causing the later pass cunroll to fail to unroll the
recursive function in exchange2.

digits_2 ->
digits_2.constprop.0, digits_2.constprop.1, etc.

gcc/ChangeLog:

2019-12-26  Luo Xiong Hu  

* ipa-cp.c (update_profiling_info): Check self_scc node.
* params.opt (ipa-cp-max-recursive-depth): Set param range.
---
 gcc/ipa-cp.c   | 25 +
 gcc/params.opt |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 14064ae0034..947bf7c7199 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -4272,6 +4272,31 @@ update_profiling_info (struct cgraph_node *orig_node,
  false);
   new_sum = stats.count_sum;
 
+  class ipa_node_params *info = IPA_NODE_REF (orig_node);
+  if (info && info->node_is_self_scc)
+{
+  profile_count self_recursive_count;
+
+  /* The self recursive edge is on the orig_node.  */
+  for (cs = orig_node->callees; cs; cs = cs->next_callee)
+   if (ipa_edge_within_scc (cs))
+ {
+   cgraph_node *callee = cs->callee->function_symbol ();
+   if (callee && cs->caller == cs->callee)
+ {
+   self_recursive_count = cs->count;
+   break;
+ }
+ }
+
+  /* Proportion count for self recursive node from all callers.  */
+  new_sum
+   = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
+
+  /* Proportion count for specialized cloned node.  */
+  new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
+}
+
   if (orig_node_count < orig_sum + new_sum)
 {
   if (dump_file)
diff --git a/gcc/params.opt b/gcc/params.opt
index d88ae0c468b..40a03b04580 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -199,7 +199,7 @@ Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) 
Init(64) Param
 Compile-time bonus IPA-CP assigns to candidates which make loop bounds or 
strides known.
 
 -param=ipa-cp-max-recursive-depth=
-Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) 
IntegerRange(1, 10) Param
 Maximum depth of recursive cloning for self-recursive function.
 
 -param=ipa-cp-min-recursive-probability=
-- 
2.21.0.777.g83232e3864



[PATCH v7] Missed function specialization + partial devirtualization

2019-12-25 Thread luoxhu
>>>   profile_count indir_cnt = indirect->count;
>>>   indirect = indirect->clone (id->dst_node, call_stmt,
>>>   gimple_uid (stmt),
>>>   num, den,
>>>   true);
>>>
>>>   profile_probability prob
>>>  = indir_cnt.probability_in (old_cnt + indir_cnt);
>>>   indirect->count
>>>  = copy_basic_block->count.apply_probability (prob);
> 
> The updating of indirect edges needs to be done in a loop as well so we
> update all of them.

One indirect edge may have multiple speculative direct edges; the indirect
edge must be handled outside the loop, after all direct edges are cloned.
Added comments in the code.

> 
> I think you are also not removing the common_target and
> common_target_probability from cgraph_edge.
> There is now code in ipa-utils merging the histograms. You will need to
> update that to your representation. It should not be hard - it either
> copies all the values from target function or merges them with given
> probability scales.

This seems a bit tricky to me.  ipa-icf runs after ipa-profile; the reason why
common_target_id and common_target_probability were removed is that they are
never used in later passes, per the previous discussion.

I added a new variable target_prob to save it in the direct edge when calling
make_speculative; since target_prob is assigned in ipa-profile, there is no
need to stream it out and in through LTO like speculative_id.

Even so, scaling in ipa-icf with multiple direct edges is more complicated
than with a single direct edge: it needs nested loops to find each src-to-dst
edge mapping and do the scaling one by one.

BTW, the breaking patch r279373 "Fix merging of common traget info." only
mentioned firefox but doesn't include a testcase.  I tried to construct some
test cases, but failed to cover all the circumstances (I have only implemented
the N:N map so far; please refer to the code in the patch below).  Do you have
cases other than firefox that cover it?  Or how should it be tested as r279373
requires?  As this piece of code is not closely related to this patch, can we
leave it for future refinement?  Thanks :)

Other changes are pasted in the patch below.  Thank you!


From: Xiong Hu Luo 

v7:
 1. Remove GTY for summaries, replace ggc vec with auto_vec, remove
is_empty.  Add comment for speculative_id.
 2. Create and duplicate all speculative direct edge's call summary
in ipa-fnsummary.c.
 3. ipa_merge_profiles update with target_prob.  target_prob is
generated in ipa-profile, so no need stream out and in through LTO.
 4. Bootstrap passed and no SPEC2017 regression on power8le.

This patch aims to fix PR69678, caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull back of
the topN patches; performance got a ~1% GEOMEAN improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch leverages MULTIPLE
indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result, function
specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
  1.  PGO with topn is enabled by default now, but only one indirect
  target edge will be generated in the ipa-profile pass, so add variables to
  enable multiple speculative edges through the passes: speculative_id records
  the direct edge index bound to the indirect edge, the indirect_call_targets
  length records how many direct edges are owned by the indirect edge, and
  gimple_ic is postponed to ipa-profile as by default, since the inline pass
  will decide whether it is beneficial to transform the indirect call.
  2.  Use speculative_id to track and search the reference node matched
  with the direct edge's callee for multiple targets.  Actually, it is the
  caller's responsibility to handle the direct edges mapped to the same
  indirect edge.  speculative_call_info will return one specified direct edge;
  this mostly leverages the current IPA edge processing framework.
  3.  Enable LTO WPA/LTRANS stage multiple indirect call target analysis for
  full profile support in IPA passes and cgraph_edge functions.  speculative_id
  can be set by make_speculative when multiple targets are bound to
  one indirect edge, and is cloned if a new edge is cloned.  speculative_id
  is streamed out and streamed in by LTO like lto_stmt_uid.
  4.  Add 1 in-module testcase and 2 cross-module testcases.
  5.  Bootstrap and regression tests passed on Power8-LE.  No functional
  or performance regression for SPEC2017.

gcc/ChangeLog

2019-12-26  Xiong Hu Luo  

PR ipa/69678
* cgraph.c (symbol_table::create_edge): Init speculative_id and
target_prob.

Re: [PATCH] [RFC] ipa: duplicate ipa_size_summary for cloned nodes

2019-12-18 Thread luoxhu
On 2019/12/18 23:48, Jan Hubicka wrote:
>> The size_info of ipa_size_summary is created by r277424.  It should be
>> duplicated for cloned nodes, otherwise self_size and estimated_self_stack_size
>> would be 0, causing params large-function-insns and large-function-growth
>> to work inaccurately during ipa-inline.
>>
>> gcc/ChangeLog:
>>
>>  2019-12-18  Luo Xiong Hu  
>>
>>  * ipa-fnsummary.c (ipa_fn_summary_t::duplicate): Copy
>>  ipa_size_summary for cloned nodes.
>> ---
>>   gcc/ipa-fnsummary.c | 5 +
>>   1 file changed, 5 insertions(+)
>>
>> diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
>> index a46b1445765..9a01be1708b 100644
>> --- a/gcc/ipa-fnsummary.c
>> +++ b/gcc/ipa-fnsummary.c
>> @@ -868,7 +868,12 @@ ipa_fn_summary_t::duplicate (cgraph_node *src,
>>  }
>>   }
>> if (!dst->inlined_to)
>> +  {
>> +class ipa_size_summary *src_size = ipa_size_summaries->get_create (src);
>> +class ipa_size_summary *dst_size = ipa_size_summaries->get_create (dst);
> 
> This is intended to happen by the default duplicate method of
> ipa_size_summaries via the copy constructor. It seems there is a stupid
> pasto and the copy constructor is unused since the default duplicate
> implementation does nothing (wonder why).
> 
> I am testing the attached patch.  Does this help?

Yes, it works.  Thanks for the refinement.  The default duplicate
implementation is in symbol-summary.h (template class
function_summary_base::duplicate).  I tried to call duplicate in it, but it
causes a lot of errors as many classes don't implement the virtual duplicate
function.  Please commit your patch once testing passes :)

Xiong Hu
> 
> Index: ipa-fnsummary.h
> ===
> --- ipa-fnsummary.h   (revision 279523)
> +++ ipa-fnsummary.h   (working copy)
> @@ -99,11 +99,6 @@ public:
> : estimated_self_stack_size (0), self_size (0), size (0)
> {
> }
> -  /* Copy constructor.  */
> -  ipa_size_summary (const ipa_size_summary )
> -  : estimated_self_stack_size (0), self_size (s.self_size), size (s.size)
> -  {
> -  }
>   };
>   
>   /* Function inlining information.  */
> @@ -226,18 +221,20 @@ extern GTY(()) fast_function_summary  *ipa_fn_summaries;
>   
>   class ipa_size_summary_t:
> -  public fast_function_summary 
> +  public fast_function_summary 
>   {
>   public:
> ipa_size_summary_t (symbol_table *symtab):
> -fast_function_summary  (symtab) {}
> +fast_function_summary  (symtab)
> +  {
> +disable_insertion_hook ();
> +  }
>   
> -  static ipa_size_summary_t *create_ggc (symbol_table *symtab)
> +  virtual void duplicate (cgraph_node *, cgraph_node *,
> +   ipa_size_summary *src_data,
> +   ipa_size_summary *dst_data)
> {
> -class ipa_size_summary_t *summary = new (ggc_alloc  
> ())
> -  ipa_size_summary_t (symtab);
> -summary->disable_insertion_hook ();
> -return summary;
> +*dst_data = *src_data;
> }
>   };
>   extern fast_function_summary 
> Index: ipa-fnsummary.c
> ===
> --- ipa-fnsummary.c   (revision 279523)
> +++ ipa-fnsummary.c   (working copy)
> @@ -672,8 +672,7 @@ static void
>   ipa_fn_summary_alloc (void)
>   {
> gcc_checking_assert (!ipa_fn_summaries);
> -  ipa_size_summaries = new fast_function_summary  va_heap>
> -  (symtab);
> +  ipa_size_summaries = new ipa_size_summary_t (symtab);
> ipa_fn_summaries = ipa_fn_summary_t::create_ggc (symtab);
> ipa_call_summaries = new ipa_call_summary_t (symtab);
>   }
> 



*Ping* Re: [PATCH v6] Missed function specialization + partial devirtualization

2019-12-17 Thread luoxhu

Ping :)

Patch is here:
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00099.html


On 2019/12/3 10:31, luoxhu wrote:

Hi Martin and Honza,


On 2019/11/18 21:02, Martin Liška wrote:

On 11/16/19 10:59 AM, luoxhu wrote:

Sorry that I don't quite understand your meaning here.  I didn't grep the
word "cgraph_edge_summary" in source code, do you mean add new structure


Hello.

He wanted to write call_summary class and so you need something similar to
ipa-sra.c:431. It's a data structure which associates data with a cgraph_edge.
Is it understandable please?

Martin



Updated the patch as below with "git format-patch -U15" for review
convenience; the GC issue is fixed after leveraging the full GC framework in
ipa-profile.c.  Thanks:

v6 Changes:
  1. Define and use speculative_call_targets summary, move
  speculative_call_target from cgraph.h to ipa-profile.c.
  2. Use num_speculative_call_targets in cgraph_indirect_call_info.
  3. Refine with review comments.

This patch aims to fix PR69678, caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull back of
the topN patches; performance got a ~1% GEOMEAN improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch leverages MULTIPLE
indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result, function
specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
   1.  PGO with topn is enabled by default now, but only one indirect
   target edge will be generated in the ipa-profile pass, so add variables to
   enable multiple speculative edges through the passes: speculative_id records
   the direct edge index bound to the indirect edge, the indirect_call_targets
   length records how many direct edges are owned by the indirect edge, and
   gimple_ic is postponed to ipa-profile as by default, since the inline pass
   will decide whether it is beneficial to transform the indirect call.
   2.  Use speculative_id to track and search the reference node matched
   with the direct edge's callee for multiple targets.  Actually, it is the
   caller's responsibility to handle the direct edges mapped to the same
   indirect edge.  speculative_call_info will return one specified direct edge;
   this mostly leverages the current IPA edge processing framework.
   3.  Enable LTO WPA/LTRANS stage multiple indirect call target analysis for
   full profile support in IPA passes and cgraph_edge functions.  speculative_id
   can be set by make_speculative when multiple targets are bound to
   one indirect edge, and is cloned if a new edge is cloned.  speculative_id
   is streamed out and streamed in by LTO like lto_stmt_uid.
   4.  Add 1 in-module testcase and 2 cross-module testcases.
   5.  Bootstrap and regression tests passed on Power8-LE.  No functional
   or performance regression for SPEC2017.

gcc/ChangeLog

2019-12-02  Xiong Hu Luo  

PR ipa/69678
* Makefile.in (GTFILES): Add ipa-profile.c.
* cgraph.c (symbol_table::create_edge): Init speculative_id.
(cgraph_edge::make_speculative): Add param for setting speculative_id.
(cgraph_edge::speculative_call_info): Update comments and find reference
by speculative_id for multiple indirect targets.
(cgraph_edge::resolve_speculation): Decrease the speculations
for indirect edge, drop it's speculative if not direct target
left. Update comments.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_node::dump): Print num_speculative_call_targets.
(cgraph_node::verify_node): Don't report error if speculative
edge not include statement.
(cgraph_edge::num_speculative_call_targets_p): New function.
* cgraph.h (int common_target_id): Remove.
(int common_target_probability): Remove.
(num_speculative_call_targets): New variable.
(make_speculative): Add param for setting speculative_id.
(cgraph_edge::num_speculative_call_targets_p): New declare.
(speculative_id): New variable.
* cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
* ipa-profile.c (struct speculative_call_target): New struct.
(class speculative_call_summary): New class.
(class speculative_call_summaries): New class.
(call_sums): New variable.
(ipa_profile_generate_summary): Generate indirect multiple targets 
summaries.
(ipa_profile_write_edge_summary): New function.
(ipa_profile_write_summary): Stream out indirect multiple targets 
summaries.
(ipa_profile_dump_all_summaries): New function.
(ipa_profile_read_edge_summary): New function.
(ipa_profile_read_summary_section): New function.
(ipa_profile_r

Re: [RFC] ipa-cp: Fix PGO regression caused by r278808

2019-12-13 Thread luoxhu
Thanks Honza,

On 2019/12/10 19:06, Jan Hubicka wrote:
>> Hi,
>>
>> On Tue, Dec 10 2019, Jan Hubicka wrote:
>>> Hi,
>>> I think the updating should treat self recursive edges as loops: that is
>>> calculate the SUM of counts of incoming edges which are not self recursive,
>>> calculate probability of self recursing and then set count as
>>> SUM * (1/(1-probability_of_recursion))
>>> we do the same thing when computing counts within loops.
>>>
>>> Martin, I wonder how updating works for indirectly self recursive edges?
>>
>> It does not special-case them in any way, if that was the question.
> 
> Well, it should, summing only frequencies of edges entering the SCC is
> not going to give a good answer.
> If the SCC has a single entry and forms a loop, the generalization of the
> algorithm above will work, but for general graphs this is a hard problem.
> 
> I suppose we could special case self recursive nodes (since they are
> most common) and apply some fixed scale for other SCCs?
> 
> Also I wonder if the updating is done in RPO so we do not account edges
> not updated yet?

I have several questions about your comments.
1.  Where is the loop count probability calculation code, please?  And I am
not quite sure about the formula SUM * (1/(1-probability_of_recursion)).
At this moment, the self recursive node graph will be transformed as below:

  caller ---> new_node ---> orig_node ---
                                ^        |
                                |--------|

Assuming:   
e1: caller->  new_node:  count1
e2: orig_node -> orig_node:  count2
e3: new_node  -> orig_node:  count2 (same as back edge when cloned)

e3 is not considered to be a self recursive edge now?
So:
SUM = count2
probability_of_recursion should be count2/(count2 + count2)
=>
orig_node.count = SUM * (1/(1-probability_of_recursion)) = count2 * 2

But the count2 of e3 is not correct, as it was copied when cloning, which makes
the calculation inaccurate.  And the important thing is that new_node.count is
still UNCERTAIN.
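
As a sanity check of the formula, a tiny helper with hypothetical counts
(nothing here is real GCC code; sum_in and self_cnt are just illustrative
names):

  static double
  count_with_recursion (double sum_in, double self_cnt)
  {
    /* Probability of taking the self recursive edge.  */
    double p = self_cnt / (self_cnt + sum_in);
    /* The formula SUM * (1 / (1 - p)), which equals sum_in + self_cnt;
       with sum_in = self_cnt = count2 this gives count2 * 2, as above.  */
    return sum_in * (1.0 / (1.0 - p));
  }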

2.  Also, the orig_node_count is already inaccurate due to
ipa-cp.c:4289   orig_node_count = (orig_sum + new_sum).apply_scale (12, 10);
Since the parameter param_ipa_cp_max_recursive_depth defines how many nodes
are cloned, is it reasonable to update self_scc node as below to simplify the
calculation (replacing self_recursive_p with info->node_is_self_scc)?

+  class ipa_node_params *info = IPA_NODE_REF (orig_node);
+  if (info && info->node_is_self_scc)
+new_node->count
+  = (orig_sum + new_sum).apply_scale (1, param_ipa_cp_max_recursive_depth);
+  else
+new_node->count = new_sum;

3.  Recursive ipa-cp will create multiple SCC nodes based on orig_node; the
next new_node2 need not update the previous new_node1, as each new_node takes
1/param_ipa_cp_max_recursive_depth of the execution count from orig_node.

BR
Xiong Hu

> 
> Honza
> 
>>
>> Martin



[PATCH v6] Missed function specialization + partial devirtualization

2019-12-02 Thread luoxhu
Hi Martin and Honza,


On 2019/11/18 21:02, Martin Liška wrote:
> On 11/16/19 10:59 AM, luoxhu wrote:
>> Sorry that I don't quite understand your meaning here.  I didn't grep the
>> word "cgraph_edge_summary" in source code, do you mean add new structure
> 
> Hello.
> 
> He wanted to write call_summary class and so you need something similar to
> ipa-sra.c:431. It's a data structure which associates data with a cgraph_edge.
> Is it understandable please?
> 
> Martin
> 
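
For other readers of the thread, a minimal sketch of the kind of per-edge
summary Martin points at (all names here are illustrative, loosely modeled on
the ipa-sra.c pattern he mentions; this is not the patch itself):

  /* Data we want to attach to every cgraph_edge.  */
  class example_edge_summary
  {
  public:
    example_edge_summary () : target_count (0) {}
    int target_count;
  };

  /* Maps cgraph_edge -> example_edge_summary.  */
  static call_summary <example_edge_summary *> *example_sums;

  /* Typical use:
       example_sums = new call_summary <example_edge_summary *> (symtab);
       example_sums->get_create (edge)->target_count++;  */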

Updated the patch as below with "git format-patch -U15" for review
convenience; the GC issue is fixed after leveraging the full GC framework in
ipa-profile.c.  Thanks:

v6 Changes:
 1. Define and use speculative_call_targets summary, move
 speculative_call_target from cgraph.h to ipa-profile.c.
 2. Use num_speculative_call_targets in cgraph_indirect_call_info.
 3. Refine with review comments.

This patch aims to fix PR69678, caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull back of
the topN patches; performance got a ~1% GEOMEAN improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch leverages MULTIPLE
indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result, function
specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
  1.  PGO with topn is enabled by default now, but only one indirect
  target edge will be generated in the ipa-profile pass, so add variables to
  enable multiple speculative edges through the passes: speculative_id records
  the direct edge index bound to the indirect edge, the indirect_call_targets
  length records how many direct edges are owned by the indirect edge, and
  gimple_ic is postponed to ipa-profile as by default, since the inline pass
  will decide whether it is beneficial to transform the indirect call.
  2.  Use speculative_id to track and search the reference node matched
  with the direct edge's callee for multiple targets.  Actually, it is the
  caller's responsibility to handle the direct edges mapped to the same
  indirect edge.  speculative_call_info will return one specified direct edge;
  this mostly leverages the current IPA edge processing framework.
  3.  Enable LTO WPA/LTRANS stage multiple indirect call target analysis for
  full profile support in IPA passes and cgraph_edge functions.  speculative_id
  can be set by make_speculative when multiple targets are bound to
  one indirect edge, and is cloned if a new edge is cloned.  speculative_id
  is streamed out and streamed in by LTO like lto_stmt_uid.
  4.  Add 1 in-module testcase and 2 cross-module testcases.
  5.  Bootstrap and regression tests passed on Power8-LE.  No functional
  or performance regression for SPEC2017.

gcc/ChangeLog

2019-12-02  Xiong Hu Luo  

PR ipa/69678
* Makefile.in (GTFILES): Add ipa-profile.c.
* cgraph.c (symbol_table::create_edge): Init speculative_id.
(cgraph_edge::make_speculative): Add param for setting speculative_id.
(cgraph_edge::speculative_call_info): Update comments and find reference
by speculative_id for multiple indirect targets.
(cgraph_edge::resolve_speculation): Decrease the speculations
for indirect edge, drop it's speculative if not direct target
left. Update comments.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_node::dump): Print num_speculative_call_targets.
(cgraph_node::verify_node): Don't report error if speculative
edge not include statement.
(cgraph_edge::num_speculative_call_targets_p): New function.
* cgraph.h (int common_target_id): Remove.
(int common_target_probability): Remove.
(num_speculative_call_targets): New variable.
(make_speculative): Add param for setting speculative_id.
(cgraph_edge::num_speculative_call_targets_p): New declare.
(speculative_id): New variable.
* cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
* ipa-profile.c (struct speculative_call_target): New struct.
(class speculative_call_summary): New class.
(class speculative_call_summaries): New class.
(call_sums): New variable.
(ipa_profile_generate_summary): Generate indirect multiple targets 
summaries.
(ipa_profile_write_edge_summary): New function.
(ipa_profile_write_summary): Stream out indirect multiple targets 
summaries.
(ipa_profile_dump_all_summaries): New function.
(ipa_profile_read_edge_summary): New function.
(ipa_profile_read_summary_section): New function.
(ipa_profile_read_summary): Stream in indirect multiple targets 
summaries.
(ipa_profile)

Ping: [PATCH] Add explicit description for -finline

2019-11-27 Thread luoxhu




On 2019/11/4 11:42, luoxhu wrote:

On 2019/11/2 00:23, Joseph Myers wrote:

On Thu, 31 Oct 2019, Xiong Hu Luo wrote:


+@code{-finline} enables inlining of function declared \"inline\".
+@code{-finline} is enabled at levels -O1, -O2, -O3 and -Os, but not -Og.


Use @option{} to mark up option names (both -finline and all the -O
options in this paragraph).  Use @code{} to mark up keyword names, not
\"\".



Thanks.  So shall I commit the tiny patch with the updates below?

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1407d019d14..ea0d407fe11 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -8576,6 +8576,10 @@ optimizing.
  Single functions can be exempted from inlining by marking them
  with the @code{noinline} attribute.

+@option{-finline} enables inlining of function declared @code{inline}.
+@option{-finline} is enabled at levels @option{-O1}, @option{-O2}, @option{-O3}
+and @option{-Os}, but not @option{-Og}.
+
  @item -finline-small-functions
  @opindex finline-small-functions
  Integrate functions into their callers when their body is smaller than 
expected





Re: [PATCH] Fix two potential memory leak

2019-11-26 Thread luoxhu
Thanks,

On 2019/11/26 18:15, Jan Hubicka wrote:
>> Hi,
>>
>> On 2019/11/26 16:04, Jan Hubicka wrote:
 Summary variables should be deleted at the end of write_summary.
 It's first newed in generate_summary, and second newed in read_summary.
 Therefore, delete the first in write_summary, delete the second in
 execute.

 gcc/ChangeLog:

2019-11-26  Luo Xiong Hu  

* ipa-pure-const.c (pure_const_write_summary): Fix memory leak.
* ipa-sra.c (ipa_sra_write_summary): Likewise.
>>>
>>> This is not going to work with -ffat-lto-objects because in this case
>>> IPA pass is executed later.  So you will need
>>> if (!flag_fat_lto_objects).
>>>
>>> I think most IPA passes just waits for compiler to exit with LTO rather
>>> than free the summary.  ipa-fnsummary and ipa-prop allocate optimization
>>> summaries, too. Are those freed?
>>
>> ipa-prop is a bit different, the ipcp_transformation_sum is only created in
>> read_summary, so need only delete once after execute.
> 
> It also has ipa_node_params_sum and ipa_edge_args_sum which live from
> ipa-prop analysis until after ipa-inlining.
>>
>> ipa-fnsummary also forgot to free the ipa summaries at the end of
>> write_summary.  The pass pass_ipa_free_fn_summary will delete all summaries
>> in execute.
>>
>> For -ffat-lto-objects, I suppose there would be no write_summary and
>> read_summary?  so summaries are also newed once and delete once?  Thanks.
>>
>> Anyway, lto1 will run after cc1, which means cc1 is finished, the memory is
>> freed theoretically.
> 
> With -ffat-lto-objects we run the full optimization pipeline at compile
> time. This means that we first write summary, then execute the pass
> (which uses the summary and frees it) and proceed by compilation.
> With -fno-fat-lto-objects the pipeline is terminated after summary
> writing. By quitting the compiler we free all the memory.
> 
> We do have a lot of places where memory leaks this way and is freed at
> exit. I see that for jit we added some explicit free calls, but I think
> jit is not using this path.
> 
> I am not against freeing things explicitly since it makes it easier to
> identify real leaks, but we should do it systematically. Thinking of
> this case probably may even lead to some memory savings because after
> summary streaming the global stream is produced which is typically the
> biggest object streamed.  So freeing it is OK, but please add check for
> !fat lto objects and also free ipa-fnsummary (there is
> ipa_free_fn_summary, ipa_free_size_summary for that) and ipa-prop
> summaries.

Verified that the "if (!flag_fat_lto_objects)" check is required so the
summaries are not deleted early when building with -ffat-lto-objects.
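
For reference, a minimal sketch of the guarded cleanup in
pure_const_write_summary as I understand the request (the exact placement is
an assumption; flag_fat_lto_objects and funct_state_summaries are the names
already used in this thread):

  lto_destroy_simple_output_block (ob);

  /* With -ffat-lto-objects the IPA pass still runs after summary
     streaming, so only free the summary for slim LTO objects.  */
  if (!flag_fat_lto_objects)
    {
      delete funct_state_summaries;
      funct_state_summaries = NULL;
    }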

> 
> I will also look into your changes for ipa-profile tomorrow (I suppose
> you noticed this while looking into summaries for that patch :)

It's true that I am moving the speculative target summaries from cgraph.h
to ipa-profile.c; the code has exploded with hundreds of new lines.  A simple
test case works now with write_summary/read_summary/execute, but most SPEC
test cases ICEd due to ggc_grow() or ggc_collect().  I debugged the code and
found that read_summary correctly streams in the target summaries, but some
edges' summaries were GCed UNEXPECTEDLY by a following ggc_grow() or
ggc_collect() before ipa_profile is called.  Any clue on this, please?
Great that you are taking time on this! :)


ipa-profile.c
...
+/* Structure containing speculative target information from profile.  */
+
+struct GTY (()) speculative_call_target
+{
+  speculative_call_target (unsigned int id, int prob)
+: target_id (id), target_probability (prob)
+  {
+  }
+
+  /* Profile_id of target obtained from profile.  */
+  unsigned int target_id;
+  /* Probability that call will land in function with target_id.  */
+  int target_probability;
+};
+
+class speculative_call_summary
+{
+public:
+  speculative_call_summary () : speculative_call_targets () {}
+
+  vec *speculative_call_targets;
+  ~speculative_call_summary ();
+
+  void dump (FILE *f);
+
+  /* Check whether this is a empty summary.  */
+  bool is_empty ();
+};
...
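
On the GC question, the general pattern I would expect to matter (purely
illustrative; the variable below is hypothetical, and the GTFILES change is
the one the later v6/v7 ChangeLog lists as "Makefile.in (GTFILES): Add
ipa-profile.c"):

  /* ipa-profile.c must be listed in GTFILES in Makefile.in, otherwise
     gengtype never scans this file and the GTY markers have no effect,
     so ggc_collect () may free anything reachable only from plain
     pointers.  */
  static GTY (()) vec<speculative_call_target, va_gc> *example_targets;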

Xiong Hu

> Honza
>>
>> Xiong Hu
>>
>>
>>>
>>> Honza
 ---
gcc/ipa-pure-const.c | 3 +++
gcc/ipa-sra.c| 5 +
2 files changed, 8 insertions(+)

 diff --git a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c
 index a142e0cc8f6..89f334cedac 100644
 --- a/gcc/ipa-pure-const.c
 +++ b/gcc/ipa-pure-const.c
 @@ -1260,6 +1260,9 @@ pure_const_write_summary (void)
}
  lto_destroy_simple_output_block (ob);
 +
 +  delete funct_state_summaries;
 +  funct_state_summaries = NULL;
}
 diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c
 index c6ed0f44b87..bbc5e84edfb 100644
 --- a/gcc/ipa-sra.c
 +++ b/gcc/ipa-sra.c
 @@ -2671,6 +2671,11 @@ ipa_sra_write_summary (void)
  streamer_write_char_stream (ob->main_stream, 0);
  produce_asm (ob, NULL);

Re: [PATCH] Fix two potential memory leak

2019-11-26 Thread luoxhu

Hi,

On 2019/11/26 16:04, Jan Hubicka wrote:

Summary variables should be deleted at the end of write_summary.
It's first newed in generate_summary, and second newed in read_summary.
Therefore, delete the first in write_summary, delete the second in
execute.

gcc/ChangeLog:

2019-11-26  Luo Xiong Hu  

* ipa-pure-const.c (pure_const_write_summary): Fix memory leak.
* ipa-sra.c (ipa_sra_write_summary): Likewise.


This is not going to work with -ffat-lto-objects because in this case
IPA pass is executed later.  So you will need
if (!flag_fat_lto_objects).

I think most IPA passes just waits for compiler to exit with LTO rather
than free the summary.  ipa-fnsummary and ipa-prop allocate optimization
summaries, too. Are those freed?


ipa-prop is a bit different: the ipcp_transformation_sum is only created in
read_summary, so it only needs to be deleted once, after execute.

ipa-fnsummary also forgot to free the ipa summaries at the end of 
write_summary.  The pass pass_ipa_free_fn_summary will delete all summaries 
in execute.


For -ffat-lto-objects, I suppose there would be no write_summary and
read_summary?  So summaries are also allocated once and deleted once?  Thanks.


Anyway, lto1 runs after cc1, which means cc1 has finished and its memory has
theoretically been freed.


Xiong Hu




Honza

---
  gcc/ipa-pure-const.c | 3 +++
  gcc/ipa-sra.c| 5 +
  2 files changed, 8 insertions(+)

diff --git a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c
index a142e0cc8f6..89f334cedac 100644
--- a/gcc/ipa-pure-const.c
+++ b/gcc/ipa-pure-const.c
@@ -1260,6 +1260,9 @@ pure_const_write_summary (void)
  }
  
lto_destroy_simple_output_block (ob);

+
+  delete funct_state_summaries;
+  funct_state_summaries = NULL;
  }
  
  
diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c

index c6ed0f44b87..bbc5e84edfb 100644
--- a/gcc/ipa-sra.c
+++ b/gcc/ipa-sra.c
@@ -2671,6 +2671,11 @@ ipa_sra_write_summary (void)
streamer_write_char_stream (ob->main_stream, 0);
produce_asm (ob, NULL);
destroy_output_block (ob);
+
+  ggc_delete (func_sums);
+  func_sums = NULL;
+  delete call_sums;
+  call_sums = NULL;
  }
  
  /* Read intraprocedural analysis information about E and all of its outgoing

--
2.21.0.777.g83232e3864





[PATCH] Fix two potential memory leak

2019-11-25 Thread luoxhu
Summary variables should be deleted at the end of write_summary.
They are first allocated in generate_summary, and allocated a second time in
read_summary.  Therefore, delete the first in write_summary and the second in
execute.

gcc/ChangeLog:

2019-11-26  Luo Xiong Hu  

* ipa-pure-const.c (pure_const_write_summary): Fix memory leak.
* ipa-sra.c (ipa_sra_write_summary): Likewise.
---
 gcc/ipa-pure-const.c | 3 +++
 gcc/ipa-sra.c| 5 +
 2 files changed, 8 insertions(+)

diff --git a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c
index a142e0cc8f6..89f334cedac 100644
--- a/gcc/ipa-pure-const.c
+++ b/gcc/ipa-pure-const.c
@@ -1260,6 +1260,9 @@ pure_const_write_summary (void)
 }
 
   lto_destroy_simple_output_block (ob);
+
+  delete funct_state_summaries;
+  funct_state_summaries = NULL;
 }
 
 
diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c
index c6ed0f44b87..bbc5e84edfb 100644
--- a/gcc/ipa-sra.c
+++ b/gcc/ipa-sra.c
@@ -2671,6 +2671,11 @@ ipa_sra_write_summary (void)
   streamer_write_char_stream (ob->main_stream, 0);
   produce_asm (ob, NULL);
   destroy_output_block (ob);
+
+  ggc_delete (func_sums);
+  func_sums = NULL;
+  delete call_sums;
+  call_sums = NULL;
 }
 
 /* Read intraprocedural analysis information about E and all of its outgoing
-- 
2.21.0.777.g83232e3864



Re: [PATCH v3] PR92398: Fix testcase failure of pr72804.c

2019-11-24 Thread luoxhu
Hi,

>> +++ b/gcc/testsuite/gcc.target/powerpc/pr72804-1.c
> 
>> +/* store generates difference instructions as below:
>> +   P9: mtvsrdd;xxlnot;stxv.
>> +   P8/P7/P6 LE: not;not;std;std.
>> +   P8 BE: mtvsrd;mtvsrd;xxpermdi;xxlnor;stxvd2x.
>> +   P7/P6 BE: std;std;addi;lxvd2x;xxlnor;stxvd2x.  */
> 
> But it should generate just the first or second.  So just document that,
> and base the tests around that as well?

check_effective_target_p8 is needed, as P7/P6 BE has two unexpected std
instructions?  Also I split the store into a new header file and renamed it to
pr92398.h.

Others are updated in the patch below.  Thanks:


The P9LE generated instructions are not worse than P8LE's:
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
Update the test case to fix the failures.

gcc/testsuite/ChangeLog:

2019-11-25  Luo Xiong Hu  

testsuite/pr92398
* gcc.target/powerpc/pr72804.c: Split the store function to...
* gcc.target/powerpc/pr92398.h: ... this one.  New.
* gcc.target/powerpc/pr92398.p9+.c: New.
* gcc.target/powerpc/pr92398.p9-.c: New.
* lib/target-supports.exp (check_effective_target_p8): New.
(check_effective_target_p9+): New.
---
 gcc/testsuite/gcc.target/powerpc/pr72804.c| 19 --
 gcc/testsuite/gcc.target/powerpc/pr92398.h| 17 
 .../gcc.target/powerpc/pr92398.p9+.c  | 10 ++
 .../gcc.target/powerpc/pr92398.p9-.c  | 10 ++
 gcc/testsuite/lib/target-supports.exp | 20 +++
 5 files changed, 61 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index 10e37caed6b..0b083a44ede 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,7 +1,6 @@
 /* { dg-do compile { target { lp64 } } } */
-/* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O2 -mvsx -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
+/* { dg-options "-O2 -mvsx" } */
 
 __int128_t
 foo (__int128_t *src)
@@ -9,17 +8,7 @@ foo (__int128_t *src)
   return ~*src;
 }
 
-void
-bar (__int128_t *dst, __int128_t src)
-{
-  *dst =  ~src;
-}
 
-/* { dg-final { scan-assembler-times "not " 4 } } */
-/* { dg-final { scan-assembler-times "std " 2 } } */
-/* { dg-final { scan-assembler-times "ld " 2 } } */
-/* { dg-final { scan-assembler-not "lxvd2x" } } */
-/* { dg-final { scan-assembler-not "stxvd2x" } } */
-/* { dg-final { scan-assembler-not "xxpermdi" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mnot\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlxvd2x\M} } }*/
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.h 
b/gcc/testsuite/gcc.target/powerpc/pr92398.h
new file mode 100644
index 000..184d02d3521
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.h
@@ -0,0 +1,17 @@
+/* This test code is included into pr92398.p9-.c and pr92398.p9+.c.
+   The two files have the tests for the number of instructions generated for
+   P9- versus P9+.
+
+   store generates different instructions as below:
+   P9+: mtvsrdd;xxlnot;stxv.
+   P8/P7/P6 LE: not;not;std;std.
+   P8 BE: mtvsrd;mtvsrd;xxpermdi;xxlnor;stxvd2x.
+   P7/P6 BE: std;std;addi;lxvd2x;xxlnor;stxvd2x.
+   P9+ and P9- LE are expected, P6/P7/P8 BE are unexpected.  */
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c 
b/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
new file mode 100644
index 000..2ebe2025cef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.p9+.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { lp64 && p9+ } } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 1 } } */
+
+/* Source code for the test in pr92398.h */
+#include "pr92398.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c 
b/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c
new file mode 100644
index 000..eaf0ddb86eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr92398.p9-.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { lp64 && {! p9+} } } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* { dg-final { scan-assembler-times {\mnot\M} 2 { xfail be } } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 { xfail { {p8} && {be} } } } 
} */
+
+/* 

Re: [PATCH v3] PR92398: Fix testcase failure of pr72804.c

2019-11-21 Thread luoxhu
Hi Segher,
Updated the code as you suggested.  Thanks:

The P9LE generated instructions are not worse than P8LE's:
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
Update the test case to fix the failures.

v4:
Define and use check_effective_target_xxx etc.
power9+: power9, power10 ...
power8: power8 only.

gcc/testsuite/ChangeLog:

2019-11-22  Luo Xiong Hu  

testsuite/pr92398
* gcc.target/powerpc/pr72804.c: Split the store function to...
* gcc.target/powerpc/pr72804-1.c: ... this one.  New.
* lib/target-supports.exp (check_effective_target_power8): New.
(check_effective_target_power9+): New.
---
 gcc/testsuite/gcc.target/powerpc/pr72804-1.c | 23 
 gcc/testsuite/gcc.target/powerpc/pr72804.c   | 19 
 gcc/testsuite/lib/target-supports.exp| 20 +
 3 files changed, 47 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804-1.c

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804-1.c
new file mode 100644
index 000..fce08079bd3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+/* store generates difference instructions as below:
+   P9: mtvsrdd;xxlnot;stxv.
+   P8/P7/P6 LE: not;not;std;std.
+   P8 BE: mtvsrd;mtvsrd;xxpermdi;xxlnor;stxvd2x.
+   P7/P6 BE: std;std;addi;lxvd2x;xxlnor;stxvd2x.  */
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 { target power9+ } } } */
+/* { dg-final { scan-assembler-times {\mxxlnor\M} 1 { target power9+ } } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 1 { target power9+ } } } */
+
+/* { dg-final { scan-assembler-times {\mnot\M} 2 { xfail {! { {! power9+} && 
{le} } } } } } */
+
+/* { dg-final { scan-assembler-times {\mstd\M} 2 { xfail { { {power8} && {be} 
} || {power9+} } } } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index 10e37caed6b..0b083a44ede 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,7 +1,6 @@
 /* { dg-do compile { target { lp64 } } } */
-/* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O2 -mvsx -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
+/* { dg-options "-O2 -mvsx" } */
 
 __int128_t
 foo (__int128_t *src)
@@ -9,17 +8,7 @@ foo (__int128_t *src)
   return ~*src;
 }
 
-void
-bar (__int128_t *dst, __int128_t src)
-{
-  *dst =  ~src;
-}
 
-/* { dg-final { scan-assembler-times "not " 4 } } */
-/* { dg-final { scan-assembler-times "std " 2 } } */
-/* { dg-final { scan-assembler-times "ld " 2 } } */
-/* { dg-final { scan-assembler-not "lxvd2x" } } */
-/* { dg-final { scan-assembler-not "stxvd2x" } } */
-/* { dg-final { scan-assembler-not "xxpermdi" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
-/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mnot\M} 2 } } */
+/* { dg-final { scan-assembler-not {\mlxvd2x\M} } }*/
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 751045d4744..d2e54c57e96 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2585,6 +2585,26 @@ proc check_effective_target_le { } {
 }]
 }
 
+# Return 1 if we're generating code for only power8 platforms.
+
+proc check_effective_target_power8 {  } {
+  return [check_no_compiler_messages_nocache power8 assembly {
+   #if !(!defined(_ARCH_PWR9) && defined(_ARCH_PWR8))
+   #error NO
+   #endif
+  } ""]
+}
+
+# Return 1 if we're generating code for power9 and future platforms.
+
+proc check_effective_target_power9+ {  } {
+  return [check_no_compiler_messages_nocache power9+ assembly {
+   #if !(defined(_ARCH_PWR9))
+   #error NO
+   #endif
+  } ""]
+}
+
 # Return 1 if we're generating 32-bit code using default options, 0
 # otherwise.
 
-- 
2.21.0.777.g83232e3864



Re: [PATCH v3] PR92398: Fix testcase failure of pr72804.c

2019-11-19 Thread luoxhu
The P9LE generated instructions are not worse than P8LE's:
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
Update the test case to fix the failures.

v3:
Define and use check_effective_target_xxx etc.
pre_power8: ... power6, power7.
power8: power8 only.
post_power8: power8, power9 ...
post_power9: power9, power10 ...

gcc/testsuite/ChangeLog:

2019-11-15  Luo Xiong Hu  

testsuite/pr92398
* gcc.target/powerpc/pr72804.c: Update instruction count per
platform.
* lib/target-supports.exp (check_effective_target_pre_power8): New.
(check_effective_target_power8): New.
(check_effective_target_post_power8): New.
(check_effective_target_post_power9): New.
---
 gcc/testsuite/gcc.target/powerpc/pr72804.c | 39 
 gcc/testsuite/lib/target-supports.exp  | 43 ++
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index 10e37caed6b..69da8942ddd 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,7 +1,6 @@
 /* { dg-do compile { target { lp64 } } } */
-/* { dg-skip-if "" { powerpc*-*-darwin* } } */
-/* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O2 -mvsx -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
+/* { dg-require-effective-target powerpc_vsx_ok  } */
+/* { dg-options "-O2 -mvsx" } */
 
 __int128_t
 foo (__int128_t *src)
@@ -15,11 +14,35 @@ bar (__int128_t *dst, __int128_t src)
   *dst =  ~src;
 }
 
-/* { dg-final { scan-assembler-times "not " 4 } } */
-/* { dg-final { scan-assembler-times "std " 2 } } */
+/* { dg-final { scan-assembler-times "not " 2 { target post_power9 } } } */
+/* { dg-final { scan-assembler-not "std " { target post_power9 } } } */
+/* { dg-final { scan-assembler-not "stxvd2x" { target post_power9 } } } */
+/* { dg-final { scan-assembler-not "xxpermdi" { target post_power9 } } } */
+
+/* { dg-final { scan-assembler-times "not " 2 { target { {power8} && {be} } } 
} } */
+/* { dg-final { scan-assembler-not "std " { target { {power8} && {be} } } } } 
*/
+/* { dg-final { scan-assembler-times "stxvd2x" 1 { target { {power8} && {be} } 
} } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 { target { {power8} && {be} 
} } } } */
+
+/* { dg-final { scan-assembler-times "not " 4 { target { {power8} && {le} } } 
} } */
+/* { dg-final { scan-assembler-times "std " 2 { target { {power8} && {le} } } 
} } */
+/* { dg-final { scan-assembler-not "stxvd2x" { target { {power8} && {le} } } } 
} */
+/* { dg-final { scan-assembler-not "xxpermdi" { target { {power8} && {le} } } 
} } */
+
+/* { dg-final { scan-assembler-not "lxvd2x" { target post_power8 } } } */
+
+/* { dg-final { scan-assembler-times "not " 2 { target { {pre_power8} && {be} 
} } } } */
+/* { dg-final { scan-assembler-not "std " { target { {pre_power8} && {be} } } 
} } */
+/* { dg-final { scan-assembler-times "lxvd2x" 1 { target { {pre_power8} && 
{be} } } } } */
+/* { dg-final { scan-assembler-times "stxvd2x" 1 { target { {pre_power8} && 
{be} } } } } */
+
+/* { dg-final { scan-assembler-times "not " 4 { target { {pre_power8} && {le} 
} } } } */
+/* { dg-final { scan-assembler-times "std " 2 { target { {pre_power8} && {le} 
} } } } */
+/* { dg-final { scan-assembler-not "lxvd2x" { target { {pre_power8} && {le} } 
} } } */
+/* { dg-final { scan-assembler-not "stxvd2x" { target { {pre_power8} && {le} } 
} } } */
+
+/* { dg-final { scan-assembler-not "xxpermdi" { target pre_power8 } } } */
+
 /* { dg-final { scan-assembler-times "ld " 2 } } */
-/* { dg-final { scan-assembler-not "lxvd2x" } } */
-/* { dg-final { scan-assembler-not "stxvd2x" } } */
-/* { dg-final { scan-assembler-not "xxpermdi" } } */
 /* { dg-final { scan-assembler-not "mfvsrd" } } */
 /* { dg-final { scan-assembler-not "mfvsrd" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 751045d4744..324ec152544 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2585,6 +2585,49 @@ proc check_effective_target_le { } {
 }]
 }
 
+# Return 1 if we're generating code for pre-power8 platforms.
+# Power8 is NOT included.
+
+proc check_effective_target_pre_power8 {  } {
+  return [check_no_compiler_messages_nocache pre_power8 assembly {
+   #if defined(_ARCH_PWR8)
+   #error NO
+   #endif
+  } ""]
+}
+
+# Return 1 if we're generating code for power8 platforms.
+
+proc check_effective_target_power8 {  } {
+  return [check_no_compiler_messages_nocache power8 assembly {
+   #if !(!defined(_ARCH_PWR9) && defined(_ARCH_PWR8))
+   #error NO
+   #endif
+  } ""]
+}
+
+# Return 1 if we're generating code for post-power8 platforms.
+# Power8 is included.
+
+proc check_effective_target_post_power8 {  } {
+  return [check_no_compiler_messages_nocache post_power8 assembly {
+   #if 

Re: [PATCH v2] PR92398: Fix testcase failure of pr72804.c

2019-11-18 Thread luoxhu
Hi,

On 2019/11/15 18:17, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Nov 14, 2019 at 09:12:32PM -0600, Xiong Hu Luo wrote:
>> P9LE generated instruction is not worse than P8LE.
>> mtvsrdd;xxlnot;stxv vs. not;not;std;std.
>> Update the test case to fix failures.
> 
> So this no longer runs it for p7, and it also doesn't run it for cpus
> after p9 anymore.  Could you change it to be a test for p9 and above,
> and one for before p9?  Does that work?
Since one test file cannot have multiple dg-do compile or dg-options
directives, it does not seem feasible to build the power7, power8 and power9
options in one source file, so I added check_effective_target_power[7,8,9] and
add_options_for_power[7,8,9] in target-supports.exp for each CPU configuration.

> 
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
>> @@ -0,0 +1,16 @@
>> +/* { dg-do compile { target lp64 } } */
>> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> +/* { dg-require-effective-target powerpc_vsx_ok } */
>> +/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power8" } */
> 
> -mvsx is implied by power8 (or power7, already).
> 
> You don't need the test for Darwin: Darwin never has VSX, already (and
> if it somehow would ever get support for it, we'd want this test to run
> for it).
OK.

> 
>> +/* { dg-final { scan-assembler-times "std " 0 } } */
> 
> We usually write this as scan-assembler-not, but this works just fine
> as well.  Whichever you prefer.
Updated.

> 
> 
> Segher
> 

V2 patch as below, Thanks:


The P9LE generated instructions are not worse than P8LE's:
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
Update the test case to fix the failures.

gcc/testsuite/ChangeLog:

2019-11-15  Luo Xiong Hu  

PR testsuite/92398
* gcc.target/powerpc/pr72804.h: New.
* gcc.target/powerpc/pr72804.p7.c: New.
* gcc.target/powerpc/pr72804.p8.c: New.
* gcc.target/powerpc/pr72804.c: Rename to ...
* gcc.target/powerpc/pr72804.p9.c: ... this one.
* lib/target-supports.exp (check_effective_target_power7): New.
(add_options_for_power7): New.
(check_effective_target_power8): New.
(add_options_for_power9): New.
(check_effective_target_power9): New.
(add_options_for_power8): New.
---
 gcc/testsuite/gcc.target/powerpc/pr72804.h| 17 +++
 gcc/testsuite/gcc.target/powerpc/pr72804.p7.c | 16 +++
 gcc/testsuite/gcc.target/powerpc/pr72804.p8.c | 16 +++
 .../powerpc/{pr72804.c => pr72804.p9.c}   | 23 +++---
 gcc/testsuite/lib/target-supports.exp | 45 +++
 5 files changed, 101 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804.p7.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
 rename gcc/testsuite/gcc.target/powerpc/{pr72804.c => pr72804.p9.c} (52%)

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.h 
b/gcc/testsuite/gcc.target/powerpc/pr72804.h
new file mode 100644
index 000..8a5ea93cc17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.h
@@ -0,0 +1,17 @@
+/* This test code is included into pr72804.p8.c and pr72804.p9.c
+   The two files have the tests for the number of instructions generated for
+   P8LE versus P9LE.  */
+
+__int128_t
+foo (__int128_t *src)
+{
+  return ~*src;
+}
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.p7.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.p7.c
new file mode 100644
index 000..f78a8557c9f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.p7.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options power7 } */
+
+/* { dg-final { scan-assembler-times "not " 4 {xfail be} } } */
+/* { dg-final { scan-assembler-times "std " 2  } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" {xfail be} } } */
+/* { dg-final { scan-assembler-not "stxvd2x" {xfail be} } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+
+/* Source code for the test in pr72804.h */
+#include "pr72804.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
new file mode 100644
index 000..1efc0691809
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options power8 } */
+
+/* { dg-final { scan-assembler-times "not " 4 {xfail be} } } */
+/* { dg-final { scan-assembler-times "std " 2 {xfail be} } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { 

Re: Ping*2: [PATCH v5] Missed function specialization + partial devirtualization

2019-11-16 Thread luoxhu
Hi, thanks,

On 2019/11/14 17:04, Jan Hubicka wrote:
>>  PR ipa/69678
>>  * cgraph.c (symbol_table::create_edge): Init speculative_id.
>>  (cgraph_edge::make_speculative): Add param for setting speculative_id.
>>  (cgraph_edge::speculative_call_info): Find reference by
>>  speculative_id for multiple indirect targets.
>>  (cgraph_edge::resolve_speculation): Decrease the speculations
>>  for indirect edge, drop it's speculative if not direct target
>>  left.
>>  (cgraph_edge::redirect_call_stmt_to_callee): Likewise.
>>  (cgraph_node::verify_node): Don't report error if speculative
>>  edge not include statement.
>>  (cgraph_edge::has_multiple_indirect_call_p): New function.
>>  (cgraph_edge::has_indirect_call_p): New function.
>>  * cgraph.h (struct indirect_target_info): New struct.
>>  (indirect_call_targets): New vector variable.
>>  (make_speculative): Add param for setting speculative_id.
>>  (cgraph_edge::has_multiple_indirect_call_p): New declare.
>>  (cgraph_edge::has_indirect_call_p): New declare.
>>  (speculative_id): New variable.
>>  * cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
>>  * cgraphunit.c: Fix comments typo.
>>  * ipa-comdats.c: Fix comments typo.
>>  * ipa-inline.c (inline_small_functions): Fix iterator update.
>>  * ipa-profile.c (ipa_profile_generate_summary): Add indirect
>>  multiple targets logic.
>>  (ipa_profile): Likewise.
>>  * ipa-ref.h (speculative_id): New variable.
>>  * ipa.c (process_references): Fix typo.
>>  * lto-cgraph.c (lto_output_edge): Add indirect multiple targets
>>  logic.  Stream out speculative_id.
>>  (input_edge): Likewise.
>>  * predict.c (dump_prediction): Revome edges count assert to be
>>  precise.
>>  * symtab.c (symtab_node::create_reference): Init speculative_id.
>>  (symtab_node::clone_references): Clone speculative_id.
>>  (symtab_node::clone_referring): Clone speculative_id.
>>  (symtab_node::clone_reference): Clone speculative_id.
>>  (symtab_node::clear_stmts_in_references): Clear speculative_id.
>>  * tree-inline.c (copy_bb): Duplicate all the speculative edges
>>  if indirect call contains multiple speculative targets.
>>  * tree-profile.c (gimple_gen_ic_profiler): Use the new variable
>>  __gcov_indirect_call.counters and __gcov_indirect_call.callee.
>>  (gimple_gen_ic_func_profiler): Likewise.
>>  (pass_ipa_tree_profile::gate): Fix comment typos.
>>  * value-prof.h  (check_ic_target): Remove.
>>  * value-prof.c  (gimple_value_profile_transformations):
>>  Use void function gimple_ic_transform.
>>  * value-prof.c  (gimple_ic_transform): Handle topn case.
>>  Fix comment typos.  Change it to a void function.
>>
>> gcc/testsuite/ChangeLog
>>
>>  2019-11-14  Xiong Hu Luo  
>>
>>  PR ipa/69678
>>  * gcc.dg/tree-prof/indir-call-prof-topn.c: New testcase.
>>  * gcc.dg/tree-prof/crossmodule-indir-call-topn-1.c: New testcase.
>>  * gcc.dg/tree-prof/crossmodule-indir-call-topn-1a.c: New testcase.
>>  * gcc.dg/tree-prof/crossmodule-indir-call-topn-2.c: New testcase.
>>  * lib/scandump.exp: Dump executable file name.
>>  * lib/scanwpaipa.exp: New scan-pgo-wap-ipa-dump.
>> @@ -1089,6 +1093,38 @@ cgraph_edge::make_speculative (cgraph_node *n2, 
>> profile_count direct_count)
>>  call) and if one of them exists, all of them must exist.
>>   
>>  Given speculative call edge, return all three components.
>> +
>> +   For some indirect edge, it may maps to multiple direct edges, i.e.  1:N.
>> +   check the speculative_id to return all the three components for specified
>> +   direct edge or indirect edge.
>> +   If input is indirect, caller of this function will get the direct edge 
>> one by
>> +   one, get_edge will just return one of the direct edge mapped to the 
>> indirect
>> +   edge, the returned direct edge will be resolved or redirected by the 
>> caller,
>> +   then number of indirect calls (speculations) is deceased in each access.
>> +   If input is direct, this function will get the indirect edge and 
>> reference
>> +   with matched speculative_id, the returned edge will also be resolved or
>> +   redirected, decrease the speculations accordingly.
>> +   Speculations of indirect edge will be dropped only if all direct edges
>> +   be handled.
>> +
>> +   e.g.  for indirect edge E statement "call call_dest":
>> +
>> +   Redirect N3 after redirected N2:
>> +
>> +   if (call_dest == N2)
>> + n2 ();
>> +   else if (call_dest == N3)
>> + n3 ();
>> +   else
>> + call call_dest
>> +
>> +   Resolve N3 and only redirect N2:
>> +
>> +   if (call_dest == N2)
>> + n2 ();
>> +   else
>> + call call_dest
>> +
> 
> I find this comment hard to read. Reader probably does not know what
> speculative edges are and we only want to describe speculative_call_info
> function not also the way we 

Re: [PATCH 1/2] Update iterator of next

2019-11-15 Thread luoxhu



On 2019/11/15 17:19, Jan Hubicka wrote:
>> On Fri, Nov 15, 2019 at 9:10 AM Jan Hubicka  wrote:
>>>
 next is initialized only in the loop before, it is never updated
 in it's own loop.

 gcc/ChangeLog

2019-11-15  Xiong Hu Luo  

* ipa-inline.c (inline_small_functions): Update iterator of next.
>>>
>>> OK,
>>> thanks!
>>
>> This breaks bootstrap and the loop before is similarly odd.
>>
>> (gdb) p edge
>> $1 = (cgraph_edge *) 0xa5a5a5a5a5a5a5a5
>>
>> so apparently edge->next_callee is GCed (thus 'edge' itself is freed?)
> Loop before seems OK, it has next variable because the edge it works
> with may be removed.
> 
> Only the next loop seems to omit set of next.
 +   {
 + if (edge->speculative
 + && !speculation_useful_p (edge, edge->aux != NULL))
 +   {
 + edge->resolve_speculation ();
 + update = true;
 +   }
 + next = edge->next_callee;
> The problem seems that next should be set before and not after resolvign
> (which indeed may remove edge).
> 
> I will fix that.

Sorry for breaking bootstrap.  This was an oversight when I split this small
piece of code out of the previous patch; the assignment should be the first
line of the second loop.  Could you please add a comment that the edge may be
freed by resolve_speculation ()?  Thanks.
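
For the record, the shape of the fix is the usual pattern of caching the
successor before anything can free the current element; a self-contained C
illustration (not GCC code, all names are made up):

#include <stdlib.h>

struct edge { struct edge *next_callee; int speculative; };

/* Resolve (here: remove and free) speculative edges from the list rooted at
   *PLINK.  The successor must be read before the current element can be
   freed -- the same rule the inline_small_functions loop needs.  */
void
resolve_speculative_edges (struct edge **plink)
{
  struct edge *e, *next;
  for (e = *plink; e; e = next)
    {
      next = e->next_callee;    /* cache before any removal */
      if (e->speculative)
        {
          *plink = next;        /* unlink E ... */
          free (e);             /* ... then free it; E is dead from here on */
        }
      else
        plink = &e->next_callee;
    }
}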

Xiong Hu

> 
> Honza
 +   }
 if (update)
{
  struct cgraph_node *where = node->inlined_to
 --
 2.21.0.777.g83232e3864




Re: [PATCH] PR92398: Fix testcase failure of pr72804.c

2019-11-14 Thread luoxhu



On 2019/11/15 11:12, Xiong Hu Luo wrote:

P9LE generated instruction is not worse than P8LE.
mtvsrdd;xxlnot;stxv vs. not;not;std;std.
Update the test case to fix failures.

gcc/testsuite/ChangeLog:

2019-11-15  Luo Xiong Hu  

testsuite/pr92398
* gcc.target/powerpc/pr72804.h: New.
* gcc.target/powerpc/pr72804.p8.c: New.
* gcc.target/powerpc/pr72804.c: Rename to ...
* gcc.target/powerpc/pr72804.p9.c: ... this one.
---
  gcc/testsuite/gcc.target/powerpc/pr72804.h| 17 ++
  gcc/testsuite/gcc.target/powerpc/pr72804.p8.c | 16 ++
  .../powerpc/{pr72804.c => pr72804.p9.c}   | 22 ++-
  3 files changed, 40 insertions(+), 15 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804.h
  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
  rename gcc/testsuite/gcc.target/powerpc/{pr72804.c => pr72804.p9.c} (59%)

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.h 
b/gcc/testsuite/gcc.target/powerpc/pr72804.h
new file mode 100644
index 000..8a5ea93cc17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.h
@@ -0,0 +1,17 @@
+/* This test code is included into pr72804.p8.c and pr72804.p9.c
+   The two files have the tests for the number of instructions generated for
+   P8LE versus P9LE.  */
+
+__int128_t
+foo (__int128_t *src)
+{
+  return ~*src;
+}
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
new file mode 100644
index 000..ad968769aae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.p8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power8" } */
+
+/* { dg-final { scan-assembler-times "not " 4 {xfail be} } } */
+/* { dg-final { scan-assembler-times "std " 2 {xfail be} } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */


Updated to this after testing on P8BE:
-/* { dg-final { scan-assembler-not "stxvd2x" } } */
-/* { dg-final { scan-assembler-not "xxpermdi" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" {xfail be} } } */
+/* { dg-final { scan-assembler-not "xxpermdi" {xfail be} } } */



+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+
+/* Source code for the test in pr72804.h */
+#include "pr72804.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.p9.c
similarity index 59%
rename from gcc/testsuite/gcc.target/powerpc/pr72804.c
rename to gcc/testsuite/gcc.target/powerpc/pr72804.p9.c
index 10e37caed6b..2059d7df1a2 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.p9.c
@@ -1,25 +1,17 @@
  /* { dg-do compile { target { lp64 } } } */
  /* { dg-skip-if "" { powerpc*-*-darwin* } } */
  /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O2 -mvsx -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
+/* { dg-options "-O2 -mvsx -mdejagnu-cpu=power9" } */
  
-__int128_t

-foo (__int128_t *src)
-{
-  return ~*src;
-}
-
-void
-bar (__int128_t *dst, __int128_t src)
-{
-  *dst =  ~src;
-}
-
-/* { dg-final { scan-assembler-times "not " 4 } } */
-/* { dg-final { scan-assembler-times "std " 2 } } */
+/* { dg-final { scan-assembler-times "not " 2 } } */
+/* { dg-final { scan-assembler-times "std " 0 } } */
  /* { dg-final { scan-assembler-times "ld " 2 } } */
  /* { dg-final { scan-assembler-not "lxvd2x" } } */
  /* { dg-final { scan-assembler-not "stxvd2x" } } */
  /* { dg-final { scan-assembler-not "xxpermdi" } } */
  /* { dg-final { scan-assembler-not "mfvsrd" } } */
  /* { dg-final { scan-assembler-not "mfvsrd" } } */
+
+/* Source code for the test in pr72804.h */
+#include "pr72804.h"
+





Ping*2: [PATCH v5] Missed function specialization + partial devirtualization

2019-11-13 Thread luoxhu
Rebased to trunk, including making gimple_ic_transform a void function.


This patch aims to fix PR69678, which is caused by PGO indirect call profiling
performance issues.
The bug that the profiling data never took effect was fixed by Martin's pull
back of the topN patches; performance got a GEOMEAN ~1% improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target,
the one called more than 75% of the time.  This patch enables the use of
MULTIPLE indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result,
function specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
  1.  PGO with topn is enabled by default now, but only one indirect
  target edge is generated in the ipa-profile pass, so add variables to enable
  multiple speculative edges through the passes: speculative_id records the
  direct edge index bound to the indirect edge, the length of
  indirect_call_targets records how many direct edges the indirect edge owns,
  and gimple_ic is postponed to ipa-profile as in the default case, since the
  inline pass will decide whether transforming the indirect call is beneficial.
  2.  Use speculative_id to track and search the reference node matching
  the direct edge's callee for multiple targets.  It is the caller's
  responsibility to handle the direct edges mapped to the same indirect
  edge.  speculative_call_info returns the one direct edge specified; this
  mostly reuses the current IPA edge processing framework.
  3.  Enable multiple indirect call target analysis in the LTO WPA/LTRANS
  stages for full profile support in the IPA passes and cgraph_edge functions.
  speculative_id can be set by make_speculative when multiple targets are
  bound to one indirect edge, and is cloned when a new edge is cloned.
  speculative_id is streamed out and streamed in by LTO like lto_stmt_uid.
  4.  Add 1 in-module testcase and 2 cross-module testcases.
  5.  Bootstrap and regression tests passed on Power8-LE.  No functional
  or performance regression for SPEC2017.
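
To make the transformation concrete, the promoted call site ends up with
roughly this shape at the source level (a hand-written sketch, not one of the
new testcases; all names are made up):

typedef int (*fn_t) (int);

int target_a (int x) { return x + 1; }
int target_b (int x) { return x - 1; }

int
call_it (fn_t fn, int x)
{
  /* Before: a plain indirect call, "return fn (x);".  After promotion of
     the two hot targets recorded in the profile, each guarded call can be
     inlined or specialized, and the fallback keeps correctness:  */
  if (fn == target_a)
    return target_a (x);
  else if (fn == target_b)
    return target_b (x);
  else
    return fn (x);
}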

gcc/ChangeLog

2019-11-14  Xiong Hu Luo  

PR ipa/69678
* cgraph.c (symbol_table::create_edge): Init speculative_id.
(cgraph_edge::make_speculative): Add param for setting speculative_id.
(cgraph_edge::speculative_call_info): Find reference by
speculative_id for multiple indirect targets.
(cgraph_edge::resolve_speculation): Decrease the speculations
for indirect edge, drop it's speculative if not direct target
left.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_node::verify_node): Don't report error if speculative
edge not include statement.
(cgraph_edge::has_multiple_indirect_call_p): New function.
(cgraph_edge::has_indirect_call_p): New function.
* cgraph.h (struct indirect_target_info): New struct.
(indirect_call_targets): New vector variable.
(make_speculative): Add param for setting speculative_id.
(cgraph_edge::has_multiple_indirect_call_p): New declare.
(cgraph_edge::has_indirect_call_p): New declare.
(speculative_id): New variable.
* cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
* cgraphunit.c: Fix comments typo.
* ipa-comdats.c: Fix comments typo.
* ipa-inline.c (inline_small_functions): Fix iterator update.
* ipa-profile.c (ipa_profile_generate_summary): Add indirect
multiple targets logic.
(ipa_profile): Likewise.
* ipa-ref.h (speculative_id): New variable.
* ipa.c (process_references): Fix typo.
* lto-cgraph.c (lto_output_edge): Add indirect multiple targets
logic.  Stream out speculative_id.
(input_edge): Likewise.
* predict.c (dump_prediction): Remove edges count assert to be
precise.
* symtab.c (symtab_node::create_reference): Init speculative_id.
(symtab_node::clone_references): Clone speculative_id.
(symtab_node::clone_referring): Clone speculative_id.
(symtab_node::clone_reference): Clone speculative_id.
(symtab_node::clear_stmts_in_references): Clear speculative_id.
* tree-inline.c (copy_bb): Duplicate all the speculative edges
if indirect call contains multiple speculative targets.
* tree-profile.c (gimple_gen_ic_profiler): Use the new variable
__gcov_indirect_call.counters and __gcov_indirect_call.callee.
(gimple_gen_ic_func_profiler): Likewise.
(pass_ipa_tree_profile::gate): Fix comment typos.
* value-prof.h  (check_ic_target): Remove.
* value-prof.c  (gimple_value_profile_transformations):
Use void function gimple_ic_transform.
* value-prof.c  (gimple_ic_transform): Handle topn case.
Fix comment typos.  Change it to a void function.

gcc/testsuite/ChangeLog

2019-11-14  Xiong 

[PATCH] Fix copy-paste typo syntax error by r277872

2019-11-06 Thread luoxhu
Tested and committed as r277904.


gcc/testsuite/ChangeLog:

2019-11-07  Xiong Hu Luo  

* gcc.target/powerpc/pr72804.c: Move inline options from
dg-require-effective-target to dg-options.
---
 gcc/testsuite/gcc.target/powerpc/pr72804.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index 0fc3df1d89b..10e37caed6b 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { lp64 } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
-/* { dg-require-effective-target powerpc_vsx_ok  -fno-inline-functions --param 
max-inline-insns-single-O2=200 } */
-/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
 
 __int128_t
 foo (__int128_t *src)
-- 
2.21.0.777.g83232e3864



Re: [PATCH v2] PR92090: Fix testcase failures by r276469

2019-11-05 Thread luoxhu
On 2019/11/6 02:20, Joseph Myers wrote:
> On Tue, 5 Nov 2019, Kewen.Lin wrote:
> 
>> Very good point!  Since gcc doesn't pursue 100% testsuite pass rate, I 
>> noticed
>> there are a few failures exposed/caused by some PRs all the time.  Could we
>> just leave the test case there without any pre workaround till the PR get 
>> fixed?
> 
> Yes, leaving the test as-is would be typical in GCC (though there is also
> an argument for XFAILing and filing bugs to try to keep the baseline
> testsuite state clean and instead use only Bugzilla to track known
> regressions).
> 

Committed as r277872 to fix part of the failures, except the ICE case
c11-atomic-exec-5.c.
Peter Bergner has a stand-alone test to reproduce it.  It may be fixed soon.


Xiong Hu
Thanks



Ping: [PATCH v5] Missed function specialization + partial devirtualization

2019-11-05 Thread luoxhu

On 2019/10/22 22:07, Martin Liška wrote:

On 9/27/19 9:13 AM, luoxhu wrote:

Thanks for your time of so many round of reviews.


You're welcome. One last request would be please to make
gimple_ic_transform a void function. See attached patch.

I'll remind the patch today to Honza.


Thanks, Martin.  I will add your changes. :)  Ping again.


Xiong Hu
BR



Thanks,
Martin





Re: [PATCH v3] PR92090: Fix testcase failures by r276469

2019-11-04 Thread luoxhu
Hi,

On 2019/11/5 06:57, Joseph Myers wrote:
> On Mon, 4 Nov 2019, luoxhu wrote:
> 
>> -finline-functions is enabled by default for O2 since r276469, update the
>> test cases with -fno-inline-functions.
>>
>> v2: disable inlining for the failed cases.  Add two more failed cases
>> not listed in BZ.  Tested on P8LE, P8BE and P9LE.
> 
> If inlining (or other interprocedural analysis) invalidates a test's
> intent (e.g. all the code gets optimized away), our normal approach is to
> use noinline etc. function attributes to prevent that inlining.
> 
> If you're adding such options to work around an ICE, which certainly
> appears to be the case in the architecture-independent testcases here, you
> should (a) have comments in the tests saying explicitly that the options
> are there temporarily to work around the ICE in a bug whose number is
> given in the comment, and (b) a remark in the open regression bug for the
> ICE saying that those options have been added as a temporary workaround
> and that a patch fixing the ICE should remove them again.  The commit
> message also needs to make very clear that the commit is *not* a fix for
> that bug and so it must *not* be closed as fixed until there is an actual
> fix for the ICE.
> 
> So I don't think this patch is OK without having such comments in the
> tests to explain the issue and a carefully written commit message warning
> that the patch is a workaround, not a fix and the bug in question must not
> be closed simply because of the commit mentioning it.
> 

Thanks.  I updated the comments and the commit message to mark the unresolved
ICE issue as below, and will commit it soon:


-finline-functions is enabled by default at -O2 since r276469; update the
test cases with -fno-inline-functions.
The options "-fno-inline-functions --param max-inline-insns-single-O2=200"
for c11-atomic-exec-5.c are a temporary workaround for the LRA ICE on
BE systems tracked in PR92090.  This commit is NOT a fix for that ICE, so the
PR must NOT be closed.

v2: Disable inlining for the failed cases.  Add two more failed cases
not listed in BZ.  Tested on P8LE, P8BE and P9LE.
v3: Update commit message and comments for atomic ICE.

gcc/testsuite/ChangeLog:

2019-10-30  Xiong Hu Luo  

PR92090
* g++.dg/tree-ssa/pr61034.C: Add -fno-inline-functions --param
max-inline-insns-single-O2=200.
* gcc.dg/atomic/c11-atomic-exec-5.c: Likewise.
* gcc.target/powerpc/pr72804.c: Likewise.
* gcc.target/powerpc/pr79439-1.c: Add -fno-inline-functions.
* gcc.target/powerpc/vsx-builtin-7.c: Likewise.
---
 gcc/testsuite/g++.dg/tree-ssa/pr61034.C  | 2 +-
 gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c  | 2 +-
 gcc/testsuite/gcc.target/powerpc/pr72804.c   | 2 +-
 gcc/testsuite/gcc.target/powerpc/pr79439-1.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
index 2e3dfecacb4..338de1ebb13 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized 
-fdelete-null-pointer-checks --param early-inlining-insns-O2=14" }
+// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized 
-fdelete-null-pointer-checks --param early-inlining-insns-O2=14 
-fno-inline-functions --param max-inline-insns-single-O2=200"  }
 
 #define assume(x) if(!(x))__builtin_unreachable()
 
diff --git a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c 
b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
index 692c64ad207..a3a5a05da30 100644
--- a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
+++ b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
@@ -3,7 +3,7 @@
iterations of the compare-and-exchange loop are needed, exceptions
get properly cleared).  */
 /* { dg-do run } */
-/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE 
-D_POSIX_C_SOURCE=200809L" } */
+/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE 
-D_POSIX_C_SOURCE=200809L -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */  /* Disable inline here as "-Os" will hit 
ICE in LRA on Power8 BE, see PR92090.  */
 /* { dg-add-options ieee } */
 /* { dg-additional-options "-mfp-trap-mode=sui" { target alpha*-*-* } } */
 /* { dg-additional-options "-D_XOPEN_SOURCE=600" { target *-*-solaris2* } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index b83b6350d75..0fc3df1d89b 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { lp64 } } } */
 /* { dg-skip-

Re: [PATCH] Add explicit description for -finline

2019-11-03 Thread luoxhu
On 2019/11/2 00:23, Joseph Myers wrote:
> On Thu, 31 Oct 2019, Xiong Hu Luo wrote:
> 
>> +@code{-finline} enables inlining of function declared \"inline\".
>> +@code{-finline} is enabled at levels -O1, -O2, -O3 and -Os, but not -Og.
> 
> Use @option{} to mark up option names (both -finline and all the -O
> options in this paragraph).  Use @code{} to mark up keyword names, not
> \"\".
> 

Thanks.  So shall I commit the tiny patch with the updates below?

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1407d019d14..ea0d407fe11 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -8576,6 +8576,10 @@ optimizing.
 Single functions can be exempted from inlining by marking them
 with the @code{noinline} attribute.

+@option{-finline} enables inlining of functions declared @code{inline}.
+@option{-finline} is enabled at levels @option{-O1}, @option{-O2}, @option{-O3}
+and @option{-Os}, but not @option{-Og}.
+
 @item -finline-small-functions
 @opindex finline-small-functions
 Integrate functions into their callers when their body is smaller than expected
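
For reference, a tiny example of the kind of function the new paragraph is
about (not part of the patch):

/* add1 is declared "inline"; -finline (implied by -O1 and above) allows the
   call in use () to be inlined, while -fno-inline keeps it as a call.  */
static inline int
add1 (int x)
{
  return x + 1;
}

int
use (int y)
{
  return add1 (y);
}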



[PATCH v2] PR92090: Fix testcase failures by r276469

2019-11-03 Thread luoxhu
-finline-functions is enabled by default at -O2 since r276469; update the
test cases with -fno-inline-functions.

v2: disable inlining for the failed cases.  Add two more failed cases
not listed in BZ.  Tested on P8LE, P8BE and P9LE.


gcc/testsuite/ChangeLog:

2019-10-30  Xiong Hu Luo  

PR92090
* g++.dg/tree-ssa/pr61034.C: Add -fno-inline-functions --param
max-inline-insns-single-O2=200.
* gcc.dg/atomic/c11-atomic-exec-5.c: Likewise.
* gcc.target/powerpc/pr72804.c: Likewise.
* gcc.target/powerpc/pr79439-1.c: Add -fno-inline-functions.
* gcc.target/powerpc/vsx-builtin-7.c: Likewise.
---
 gcc/testsuite/g++.dg/tree-ssa/pr61034.C  | 2 +-
 gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c  | 2 +-
 gcc/testsuite/gcc.target/powerpc/pr72804.c   | 2 +-
 gcc/testsuite/gcc.target/powerpc/pr79439-1.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
index 2e3dfecacb4..338de1ebb13 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C
@@ -1,5 +1,5 @@
 // { dg-do compile }
-// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized 
-fdelete-null-pointer-checks --param early-inlining-insns-O2=14" }
+// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized 
-fdelete-null-pointer-checks --param early-inlining-insns-O2=14 
-fno-inline-functions --param max-inline-insns-single-O2=200"  }
 
 #define assume(x) if(!(x))__builtin_unreachable()
 
diff --git a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c 
b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
index 692c64ad207..0d70a769fdd 100644
--- a/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
+++ b/gcc/testsuite/gcc.dg/atomic/c11-atomic-exec-5.c
@@ -3,7 +3,7 @@
iterations of the compare-and-exchange loop are needed, exceptions
get properly cleared).  */
 /* { dg-do run } */
-/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE 
-D_POSIX_C_SOURCE=200809L" } */
+/* { dg-options "-std=c11 -pedantic-errors -pthread -U_POSIX_C_SOURCE 
-D_POSIX_C_SOURCE=200809L -fno-inline-functions --param 
max-inline-insns-single-O2=200" } */
 /* { dg-add-options ieee } */
 /* { dg-additional-options "-mfp-trap-mode=sui" { target alpha*-*-* } } */
 /* { dg-additional-options "-D_XOPEN_SOURCE=600" { target *-*-solaris2* } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr72804.c 
b/gcc/testsuite/gcc.target/powerpc/pr72804.c
index b83b6350d75..0fc3df1d89b 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr72804.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr72804.c
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { lp64 } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
-/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-require-effective-target powerpc_vsx_ok  -fno-inline-functions --param 
max-inline-insns-single-O2=200 } */
 /* { dg-options "-O2 -mvsx" } */
 
 __int128_t
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79439-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr79439-1.c
index 5732a236c8e..539c96f571e 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr79439-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr79439-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { powerpc*-*-linux* && lp64 } } } */
-/* { dg-options "-O2 -fpic -fno-reorder-blocks" } */
+/* { dg-options "-O2 -fpic -fno-reorder-blocks -fno-inline-functions" } */
 
 /* On the Linux 64-bit ABIs, we eliminate NOP in the 'rec' call even if
-fpic is used.  The recursive call should call the local alias.  The
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c
index 5d31309f272..0780b01ffab 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-7.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O2 -mdejagnu-cpu=power7" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7 -fno-inline-functions" } */
 
 /* Test simple extract/insert/slat operations.  Make sure all types are
supported with various options.  */
-- 
2.21.0.777.g83232e3864




Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

2019-10-23 Thread luoxhu
Hi,


On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>if ( !terminate_recursion (i))
>  {
>...
>recur_fn (i + 1);
>...
>  }
>...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.
> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
> inline bool set_contains_variable ();
> bool add_value (valtype newval, cgraph_edge *cs,
> ipcp_value *src_val = NULL,
> -   int src_idx = 0, HOST_WIDE_INT offset = -1);
> +   int src_idx = 0, HOST_WIDE_INT offset = -1,
> +   ipcp_value **val_pos_p = NULL,
> +   bool unlimited = false);
> void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value 
> (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  
> CS,
>  SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>  meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template 
>   bool
>   ipcp_lattice::add_value (valtype newval, cgraph_edge *cs,
> ipcp_value *src_val,
> -   int src_idx, HOST_WIDE_INT offset)
> +   int src_idx, HOST_WIDE_INT offset,
> +   ipcp_value **val_pos_p,
> +   bool unlimited)
>   {
> ipcp_value *val;
>   
> +  if (val_pos_p)
> +{
> +  for (val = values; val && val != *val_pos_p; val = val->next);
> +  gcc_checking_assert (val);
> +}
> +
> if (bottom)
>   return false;
>   
> for (val = values; val; val = val->next)
>   if (values_equal_for_ipcp_p (val->value, newval))
> {
> + if (val_pos_p)
> +   *val_pos_p = val;
> +
>   if (ipa_edge_within_scc (cs))
> {
>   ipcp_value_source *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
>   return false;
> }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +  && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>   {
> /* We can only free sources, not the values themselves, because 
> sources
>of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
>   }
>   }
>   
> +  if (val_pos_p)
> + *val_pos_p = NULL;
> +
> values = NULL;
> return set_to_bottom ();
>   }
> @@ -1630,8 +1651,54 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
> values_count++;
> val = allocate_and_init_ipcp_value (newval);
> val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +{
> +  val->next = (*val_pos_p)->next;
> +  (*val_pos_p)->next = val;
> +  *val_pos_p = val;
> +}
> +  else
> +{
> +  val->next = values;
> +  values = val;
> +}
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source *src = val->sources; src; src = src->next)
> +{
> +  cgraph_edge *cs = src->cs;
> +
> +  if (!src->val || cs->caller != cs->callee->function_symbol ()
> +   || src->val == val)
> + return false;
> +
> +  if (!info)
> + info = IPA_NODE_REF (cs->caller);
> +
> +  class ipcp_param_lattices *plats = ipa_get_parm_lattices (info,
> + src->index);
> +  ipcp_lattice *src_lat = src->offset == -1 ? >itself
> +   : plats->aggs;

Thanks for the patch.
This function doesn't handle the by-ref case after rebasing this patch on top
of your previous IPA-CP by-ref arithmetic patch, as below (also, some conflicts
need to be fixed):

foo (int * p) { ...  return foo(*() + 1); }

It will cause value explosion.
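
A cleaner stand-in for the mangled snippet above (illustrative only, not the
exact reduced case):

int
foo (int *p)
{
  int next;

  if (*p > 100)
    return 0;

  /* Each recursion level derives a new value through the pointer, so the
     by-ref lattice keeps accumulating arithmetically-generated values.  */
  next = *p + 1;
  return foo (&next) + 1;
}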

Secondly, the self_recursive 

Re: Ping: [PATCH V4] Extend IPA-CP to support arithmetically-computed value-passing on by-ref argument (PR ipa/91682)

2019-10-23 Thread luoxhu


Hi Feng, 
Thanks for the patch.  It works for me as expected.

I am not a reviewer, just a small comment after trying the patch.
This is quite a good case for newbies to walk through the ipa-cp pass.
Would it be worth extending the test case a bit, as attached, to cover more
combinations: the callee's aggregate accessed both by value and by
reference, with the caller's aggregate passed either by value or by reference?
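
Something along these lines (an illustrative sketch only; the actual
attachment is not inlined here):

struct data { int value; };

/* The callee reads the aggregate both by value and by reference.  */
static int use_by_value (struct data d)  { return d.value + 1; }
static int use_by_ref (struct data *d)   { return d->value + 2; }

/* The caller passes its aggregate by value in one call and by reference in
   the other, so both jump-function shapes are exercised.  */
int
caller (void)
{
  struct data d = { 40 };
  return use_by_value (d) + use_by_ref (&d);
}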


Xiong Hu
Thanks


On 2019/9/30 16:53, Feng Xue OS wrote:

Hi Honza & Martin,

And also hope your comments on this patch. Thanks.

Feng


From: Feng Xue OS 
Sent: Thursday, September 19, 2019 10:30 PM
To: Martin Jambor; Jan Hubicka; gcc-patches@gcc.gnu.org
Subject: [PATCH V4] Extend IPA-CP to support arithmetically-computed 
value-passing on by-ref argument (PR ipa/91682)

Fix a bug on unary/binary operation check.

Feng
---
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 33d52fe5537..f218f1093b8 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -1244,23 +1244,23 @@ initialize_node_lattices (struct cgraph_node *node)
}
  }

-/* Return the result of a (possibly arithmetic) pass through jump function
-   JFUNC on the constant value INPUT.  RES_TYPE is the type of the parameter
-   to which the result is passed.  Return NULL_TREE if that cannot be
-   determined or be considered an interprocedural invariant.  */
+/* Return the result of a (possibly arithmetic) operation on the constant
+   value INPUT.  OPERAND is 2nd operand for binary operation.  RES_TYPE is
+   the type of the parameter to which the result is passed.  Return
+   NULL_TREE if that cannot be determined or be considered an
+   interprocedural invariant.  */

  static tree
-ipa_get_jf_pass_through_result (struct ipa_jump_func *jfunc, tree input,
-   tree res_type)
+ipa_get_jf_arith_result (enum tree_code opcode, tree input, tree operand,
+tree res_type)
  {
tree res;

-  if (ipa_get_jf_pass_through_operation (jfunc) == NOP_EXPR)
+  if (opcode == NOP_EXPR)
  return input;
if (!is_gimple_ip_invariant (input))
  return NULL_TREE;

-  tree_code opcode = ipa_get_jf_pass_through_operation (jfunc);
if (!res_type)
  {
if (TREE_CODE_CLASS (opcode) == tcc_comparison)
@@ -1274,8 +1274,7 @@ ipa_get_jf_pass_through_result (struct ipa_jump_func 
*jfunc, tree input,
if (TREE_CODE_CLASS (opcode) == tcc_unary)
  res = fold_unary (opcode, res_type, input);
else
-res = fold_binary (opcode, res_type, input,
-  ipa_get_jf_pass_through_operand (jfunc));
+res = fold_binary (opcode, res_type, input, operand);

if (res && !is_gimple_ip_invariant (res))
  return NULL_TREE;
@@ -1283,6 +1282,21 @@ ipa_get_jf_pass_through_result (struct ipa_jump_func 
*jfunc, tree input,
return res;
  }

+/* Return the result of a (possibly arithmetic) pass through jump function
+   JFUNC on the constant value INPUT.  RES_TYPE is the type of the parameter
+   to which the result is passed.  Return NULL_TREE if that cannot be
+   determined or be considered an interprocedural invariant.  */
+
+static tree
+ipa_get_jf_pass_through_result (struct ipa_jump_func *jfunc, tree input,
+   tree res_type)
+{
+  return ipa_get_jf_arith_result (ipa_get_jf_pass_through_operation (jfunc),
+ input,
+ ipa_get_jf_pass_through_operand (jfunc),
+ res_type);
+}
+
  /* Return the result of an ancestor jump function JFUNC on the constant value
 INPUT.  Return NULL_TREE if that cannot be determined.  */

@@ -1416,6 +1430,146 @@ ipa_context_from_jfunc (ipa_node_params *info, 
cgraph_edge *cs, int csidx,
return ctx;
  }

+/* See if NODE is a clone with a known aggregate value at a given OFFSET of a
+   parameter with the given INDEX.  */
+
+static tree
+get_clone_agg_value (struct cgraph_node *node, HOST_WIDE_INT offset,
+int index)
+{
+  struct ipa_agg_replacement_value *aggval;
+
+  aggval = ipa_get_agg_replacements_for_node (node);
+  while (aggval)
+{
+  if (aggval->offset == offset
+ && aggval->index == index)
+   return aggval->value;
+  aggval = aggval->next;
+}
+  return NULL_TREE;
+}
+
+/* Determine whether ITEM, jump function for an aggregate part, evaluates to a
+   single known constant value and if so, return it.  Otherwise return NULL.
+   NODE and INFO describes the caller node or the one it is inlined to, and
+   its related info.  */
+
+static tree
+ipa_agg_value_from_node (class ipa_node_params *info,
+struct cgraph_node *node,
+struct ipa_agg_jf_item *item)
+{
+  tree value = NULL_TREE;
+  int src_idx;
+
+  if (item->offset < 0 || item->jftype == IPA_JF_UNKNOWN)
+return NULL_TREE;
+
+  if (item->jftype == IPA_JF_CONST)
+return item->value.constant;
+
+  gcc_checking_assert 

Re: [PATCH] Support multi-versioning on self-recursive function (ipa/92133)

2019-10-17 Thread luoxhu
Hi Feng,

On 2019/10/17 16:23, Feng Xue OS wrote:
> IPA does not allow constant propagation on parameter that is used to control
> function recursion.
> 
> recur_fn (i)
> {
>if ( !terminate_recursion (i))
>  {
>...
>recur_fn (i + 1);
>...
>  }
>...
> }
> 
> This patch is composed to enable multi-versioning for self-recursive function,
> and versioning copies is limited by a specified option.

I noticed a similar issue when analyzing SPEC: a self-recursive function is
not versioned.  I posted my observations in
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92074.

Generally, your patch could implement this well, but I am wondering whether it
would be OK to convert the recursive function to a non-recursive one in an
independent pass after ipa-cp and ipa-sra, instead of reusing the ipa-cp
framework?
The reason is that the argument is sometimes passed by reference, and
ipa-sra runs after ipa-cp, so this kind of optimization may not be doable in
WPA.  What is your opinion on this?  Thanks.


Thanks
Xiong Hu

> 
> Feng
> ---
> diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
> index 045072e02ec..6255a766e4d 100644
> --- a/gcc/ipa-cp.c
> +++ b/gcc/ipa-cp.c
> @@ -229,7 +229,9 @@ public:
> inline bool set_contains_variable ();
> bool add_value (valtype newval, cgraph_edge *cs,
> ipcp_value *src_val = NULL,
> -   int src_idx = 0, HOST_WIDE_INT offset = -1);
> +   int src_idx = 0, HOST_WIDE_INT offset = -1,
> +   ipcp_value **val_pos_p = NULL,
> +   bool unlimited = false);
> void print (FILE * f, bool dump_sources, bool dump_benefits);
>   };
>   
> @@ -1579,22 +1581,37 @@ allocate_and_init_ipcp_value 
> (ipa_polymorphic_call_context source)
>   /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it.  
> CS,
>  SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same
>  meaning.  OFFSET -1 means the source is scalar and not a part of an
> -   aggregate.  */
> +   aggregate.  If non-NULL, VAL_POS_P specifies position in value list,
> +   after which newly created ipcp_value will be inserted, and it is also
> +   used to record address of the added ipcp_value before function returns.
> +   UNLIMITED means whether value count should not exceed the limit given
> +   by PARAM_IPA_CP_VALUE_LIST_SIZE.  */
>   
>   template 
>   bool
>   ipcp_lattice::add_value (valtype newval, cgraph_edge *cs,
> ipcp_value *src_val,
> -   int src_idx, HOST_WIDE_INT offset)
> +   int src_idx, HOST_WIDE_INT offset,
> +   ipcp_value **val_pos_p,
> +   bool unlimited)
>   {
> ipcp_value *val;
>   
> +  if (val_pos_p)
> +{
> +  for (val = values; val && val != *val_pos_p; val = val->next);
> +  gcc_checking_assert (val);
> +}
> +
> if (bottom)
>   return false;
>   
> for (val = values; val; val = val->next)
>   if (values_equal_for_ipcp_p (val->value, newval))
> {
> + if (val_pos_p)
> +   *val_pos_p = val;
> +
>   if (ipa_edge_within_scc (cs))
> {
>   ipcp_value_source *s;
> @@ -1609,7 +1626,8 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
>   return false;
> }
>   
> -  if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
> +  if (!unlimited
> +  && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE))
>   {
> /* We can only free sources, not the values themselves, because 
> sources
>of other values in this SCC might point to them.   */
> @@ -1623,6 +1641,9 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
>   }
>   }
>   
> +  if (val_pos_p)
> + *val_pos_p = NULL;
> +
> values = NULL;
> return set_to_bottom ();
>   }
> @@ -1630,8 +1651,54 @@ ipcp_lattice::add_value (valtype newval, 
> cgraph_edge *cs,
> values_count++;
> val = allocate_and_init_ipcp_value (newval);
> val->add_source (cs, src_val, src_idx, offset);
> -  val->next = values;
> -  values = val;
> +  if (val_pos_p)
> +{
> +  val->next = (*val_pos_p)->next;
> +  (*val_pos_p)->next = val;
> +  *val_pos_p = val;
> +}
> +  else
> +{
> +  val->next = values;
> +  values = val;
> +}
> +
> +  return true;
> +}
> +
> +/* Return true, if a ipcp_value VAL is orginated from parameter value of
> +   self-feeding recursive function by applying non-passthrough arithmetic
> +   transformation.  */
> +
> +static bool
> +self_recursively_generated_p (ipcp_value *val)
> +{
> +  class ipa_node_params *info = NULL;
> +
> +  for (ipcp_value_source *src = val->sources; src; src = src->next)
> +{
> +  cgraph_edge *cs = src->cs;
> +
> +  if (!src->val || cs->caller != cs->callee->function_symbol ()
> +   || src->val == val)
> + return false;
> +
> + 

Ping: [PATCH v5] Missed function specialization + partial devirtualization

2019-10-15 Thread luoxhu

Ping:

Attachment: v5-0001-Missed-function-specialization-partial-devirtuali.patch:

https://gcc.gnu.org/ml/gcc-patches/2019-09/txtuTT17jV7n5.txt


Thanks,
Xiong Hu


On 2019/9/27 15:13, luoxhu wrote:

Hi Martin,

Thanks for your time of so many round of reviews.
It really helped me a lot.
Updated with your comments and attached for Honza's review and approve.  :)


Xiong Hu
BR

On 2019/9/26 16:36, Martin Liška wrote:

On 9/26/19 7:23 AM, luoxhu wrote:

Thanks Martin,


On 2019/9/25 18:57, Martin Liška wrote:

On 9/25/19 5:45 AM, luoxhu wrote:

Hi,

Sorry for replying so late due to cauldron conference and other LTO 
issues

I was working on.


Hello.

That's fine, we still have plenty of time for patch review.

Not fixed issues which I reported in v3 (and still valid in v4):
- please come up with indirect_target_info::indirect_target_info and 
use it

Sorry, I missed that.


Hello.

Sure, please use a contructor initialization (see my patch).




- do you need to stream out indirect_call_targets when common_target_id 
== 0?


No need to stream out items with common_target_id == 0, removed the if 
condition in lto-cgraph.c.


Fine. Do we have a guarantee that item->common_target_id is always != 0? 
Please put there an assert.






Then I'm suggesting to use vec::is_empty (please see my patch).
OK.  But has_multiple_indirect_call_p should return something different from
has_indirect_call_p, as it checks for more than one target?


Sure, that was mistake in my patch from previous reply.



gcc/cgraph.c
/* Return true if this edge has multiple indirect call targets.  */
  bool
  cgraph_edge::has_multiple_indirect_call_p (void)
  {
-  return indirect_info && indirect_info->indirect_call_targets
-    && indirect_info->indirect_call_targets->length () > 1;
+  return (indirect_info && indirect_info->indirect_call_targets
+ && indirect_info->indirect_call_targets->length () > 1);
  }

  /* Return true if this edge has at least one indirect call target.  */
  bool
  cgraph_edge::has_indirect_call_p (void)
  {
-  return indirect_info && indirect_info->indirect_call_targets
-    && indirect_info->indirect_call_targets->length ();
+  return (indirect_info && indirect_info->indirect_call_targets
+ && !indirect_info->indirect_call_targets->is_empty ());
  }



I see following failures for the tests provided:
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-1.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-2.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/indir-call-prof-topn.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE


Sorry that I forgot to remove the deprecated build option in the 3 cases
(also updated the scan exp check):
-/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate 
--param indir-call-topn-profile=1" } */

+/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate" } */


The new patch is attached.  Thanks.


Hm, looking at the gimple_ic_transform function. I think the function 
should always

return false as it never does a GIMPLE transformation.

Apart from that, I'm fine with the patch. Note that I'm not the 
maintainer, but I bet we simplified

the patch review to Honza significantly.

Last missing piece is probably the update ChangeLog.

Thank you for working on that,
Martin




Xiong Hu



Next comments follow directly in the email body:



v4 Changes:
   1. Rebase to trunk.
   2. Remove num_of_ics and use vector's length to avoid redundancy.
   3. Update the code in ipa-profile.c to improve review feasibility.
   4. Add function has_indirect_call_p and has_multiple_indirect_call_p.
   5. For parameter control, I will leave it to next patch as it is a
  relative independent function.  Currently, maximum number of
  promotions is GCOV_TOPN_VALUES as only 4 profiling value limited
  from profile-generate, therefore minimum probability is adjusted to
  25% in value-prof.c, it was 75% also by hard code for single
  indirect target.  No control to minimal number of edge
  executions yet.  What's more, this patch is a bit large now.

This patch aims to fix PR69678 caused by PGO indirect call profiling
performance issues.
The bug that profiling data is never working was fixed by Martin's pull
back of topN patches, performance got GEOMEAN ~1% improvement(+24% for
511.povray_r specifically).
Still, currently the default profile only generates SINGLE indirect 
target

that called more than 75%.  This patch leverages MULTIPLE indirect
targets use in LTO-WPA and LTO-LTRANS stage, as a result, function
specialization, profiling, partial devirtualization, inlining and
cloning could be done successfully based on it.
Performance can get improved from 0.70 sec to 0.38 sec on simple tests.
Details are:
    1.  PGO with topn is

Re: [PATCH] Fix dump message issue

2019-10-13 Thread luoxhu
On 2019/10/14 00:32, Jeff Law wrote:
> On 10/8/19 4:45 AM, Martin Jambor wrote:
>> Hi,
>>
>> On Tue, Oct 08 2019, luoxhu wrote:
>>> '}' is missed at the end.
>>
>> heh, yeah, I wonder for how long.
>>
>> If it irritates you, I'd say the patch is obvious (though note that I
>> cannot approve a patch in this area).
> Looks obvious to me.  And while you may not be an official reviewer
> Martin, if you say someone's code looks good to you, I'm just going to
> rubber stamp it.
> 
> Which in turn means you ought to be a reviewer.

Thanks, Martin and Jeff:

It has been there since the initial commit in 2009:

8d53b873fdce (jamborm   2009-05-29 16:47:31 +  277) fprintf (f, ", 
write = %d, grp_partial_lhs = %d\n", access->write,

and:

c79d6ecf5563 (jamborm   2009-09-02 17:52:18 +  271) "grp_to_be_replaced = 
%d\n", 

Committed as r276948.


Xiong Hu
BR

> 
> Jeff
> 



[PATCH] Fix dump message issue

2019-10-08 Thread luoxhu
'}' is missing at the end.

gcc/ChangeLog:
* tree-sra.c (dump_access): Add missing closing braces.
---
 gcc/tree-sra.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index 48589323a1e..cb59b91f20e 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -403,7 +403,7 @@ dump_access (FILE *f, struct access *access, bool grp)
 "grp_hint = %d, grp_covered = %d, "
 "grp_unscalarizable_region = %d, grp_unscalarized_data = %d, "
 "grp_same_access_path = %d, grp_partial_lhs = %d, "
-"grp_to_be_replaced = %d, grp_to_be_debug_replaced = %d\n",
+"grp_to_be_replaced = %d, grp_to_be_debug_replaced = %d}\n",
 access->grp_read, access->grp_write, access->grp_assignment_read,
 access->grp_assignment_write, access->grp_scalar_read,
 access->grp_scalar_write, access->grp_total_scalarization,
@@ -413,7 +413,7 @@ dump_access (FILE *f, struct access *access, bool grp)
 access->grp_to_be_replaced, access->grp_to_be_debug_replaced);
   else
 fprintf (f, ", write = %d, grp_total_scalarization = %d, "
-"grp_partial_lhs = %d\n",
+"grp_partial_lhs = %d}\n",
 access->write, access->grp_total_scalarization,
 access->grp_partial_lhs);
 }
-- 
2.21.0.777.g83232e3864



Re: [PATCH] Come up with ipa passes introduction in gccint documentation

2019-10-08 Thread luoxhu
Hi,

This is the formal documentation patch for IPA passes.  Thanks.


None of the IPA passes are documented in passes.texi.  This patch adds
an "IPA passes" section just before the GIMPLE passes and RTL passes sections in
Chapter 9 "Passes and Files of the Compiler".  Also, a short description
for each IPA pass is provided.
gccint.pdf can be produced without errors.

ChangeLog:
PR middle-end/26241
* doc/lto.texi (IPA): Reference to the IPA passes.
* doc/passes.texi (Pass manager): Add node IPA passes and
  description for each IPA pass.
---
 gcc/doc/lto.texi|   7 +-
 gcc/doc/passes.texi | 241 
 2 files changed, 245 insertions(+), 3 deletions(-)

diff --git a/gcc/doc/lto.texi b/gcc/doc/lto.texi
index 771e8278e50..1afb6eb4a98 100644
--- a/gcc/doc/lto.texi
+++ b/gcc/doc/lto.texi
@@ -350,8 +350,10 @@ while the @emph{Read summary}, @emph{Execute}, and
 @end itemize
 
 To simplify development, the GCC pass manager differentiates
-between normal inter-procedural passes and small inter-procedural
-passes.  A @emph{small inter-procedural pass}
+between normal inter-procedural passes (@pxref{All regular IPA passes}),
+small inter-procedural passes (@pxref{All small IPA passes})
+and late inter-procedural passes (@pxref{All late IPA passes}).
+A @emph{small inter-procedural pass}
 (@code{SIMPLE_IPA_PASS}) is a pass that does
 everything at once and thus it cannot be executed during WPA in
 WHOPR mode.  It defines only the @emph{Execute} stage and during
@@ -362,7 +364,6 @@ object files.  The simple inter-procedural passes can also 
be used
 for easier prototyping and development of a new inter-procedural
 pass.
 
-
 @subsection Virtual clones
 
 One of the main challenges of introducing the WHOPR compilation
diff --git a/gcc/doc/passes.texi b/gcc/doc/passes.texi
index 6edb9a0bfb7..3f9106d60c3 100644
--- a/gcc/doc/passes.texi
+++ b/gcc/doc/passes.texi
@@ -20,6 +20,7 @@ where near complete.
 * Parsing pass:: The language front end turns text into bits.
 * Gimplification pass::  The bits are turned into something we can optimize.
 * Pass manager:: Sequencing the optimization passes.
+* IPA passes::   Inter-procedural optimizations.
 * Tree SSA passes::  Optimizations on a high-level representation.
 * RTL passes::   Optimizations on a low-level representation.
 * Optimization info::Dumping optimization information from passes.
@@ -178,6 +179,246 @@ TODO: describe the global variables set up by the pass 
manager,
 and a brief description of how a new pass should use it.
 I need to look at what info RTL passes use first@enddots{}
 
+@node IPA passes
+@section IPA passes
+@cindex IPA passes
+The following briefly describes the IPA optimization passes including
+all small IPA passes, all regular IPA passes and all late IPA passes.
+
+@node All small IPA passes
+@subsection All small IPA passes
+@cindex all small ipa passes
+A small IPA pass is a pass derived from @code{simple_ipa_opt_pass}
+that does everything at once and thus it cannot be executed during WPA
+in WHOPR mode.  It defines only the @emph{Execute} stage and during this
+stage it accesses and modifies the function bodies.  No @code{generate_summary},
+@code{read_summary} or @code{write_summary} hooks are defined.
+
+@itemize @bullet
+@item IPA free lang data
+
+This pass frees resources that are used by the front end but are not needed
+once it is done.  It is located in @file{tree.c} and is described by
+@code{pass_ipa_free_lang_data}.
+
+@item IPA function and variable visibility
+
+This is a local function pass handling visibilities of all symbols.  This
+happens before LTO streaming so in particular -fwhole-program should be ignored
+at this level.  It is located in @file{ipa-visibility.c} and is described by
+@code{pass_ipa_function_and_variable_visibility}.
+
+@item IPA remove symbols
+
+This pass performs reachability analysis and reclaims all unreachable nodes.
+It is located in @file{passes.c} and is described by
+@code{pass_ipa_remove_symbols}.
+
+@item IPA oacc
+
+It is located in @file{tree-ssa-loop.c} and is described by
+@code{pass_ipa_oacc}.
+
+@item IPA pta
+
+This is a tree-based points-to analysis pass.  The idea behind this analyzer
+is to generate set constraints from the program, then solve the resulting
+constraints in order to generate the points-to sets.  It is located in 
+@file{tree-ssa-structalias.c} and is described by @code{pass_ipa_pta}.
+
+@item IPA oacc kernels
+
+This pass is for processing oacc kernels.  It is located in 
@file{tree-ssa-loop.c}
+and is described by @code{pass_ipa_oacc_kernels}.
+
+@item Target clone
+
+This is a pass for parsing functions with multiple target attributes.
+It is located in @file{multiple_target.c} and is described by
+@code{pass_target_clone}.
+
+@item IPA auto profile
+
+This pass uses AutoFDO profile to annotate the control flow graph.
+It is located in @file{auto-profile.c} and is described by

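To make the "defines only the @emph{Execute} stage" sentence above concrete,
here is a minimal sketch of what such a pass skeleton usually looks like inside
GCC.  It is loosely modeled on existing small IPA passes such as
pass_ipa_free_lang_data; the names pass_data_ipa_example, pass_ipa_example and
make_pass_ipa_example are made up for illustration and are not part of any
patch here:

/* Hypothetical sketch only.  A small IPA pass derives from
   simple_ipa_opt_pass and overrides execute (); it defines no
   generate_summary / read_summary / write_summary hooks.  */

namespace {

const pass_data pass_data_ipa_example =
{
  SIMPLE_IPA_PASS, /* type */
  "ipa-example",   /* name */
  OPTGROUP_NONE,   /* optinfo_flags */
  TV_NONE,         /* tv_id */
  0,               /* properties_required */
  0,               /* properties_provided */
  0,               /* properties_destroyed */
  0,               /* todo_flags_start */
  0,               /* todo_flags_finish */
};

class pass_ipa_example : public simple_ipa_opt_pass
{
public:
  pass_ipa_example (gcc::context *ctxt)
    : simple_ipa_opt_pass (pass_data_ipa_example, ctxt)
  {}

  /* The whole job happens here, all at once, on every function body.  */
  virtual unsigned int execute (function *) { return 0; }
}; // class pass_ipa_example

} // anon namespace

simple_ipa_opt_pass *
make_pass_ipa_example (gcc::context *ctxt)
{
  return new pass_ipa_example (ctxt);
}
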
Re: [RFC] Come up with ipa passes introduction in gccint documentation

2019-09-29 Thread luoxhu
Hi Segher,

On 2019/9/30 00:17, Segher Boessenkool wrote:
> Hi!
> 
> Just some editorial comments...  The idea of the patch is fine IMHO.
> (I am not maintainer of this, take all my comments for what they are).
> 
> On Sun, Sep 29, 2019 at 02:56:37AM -0500, Xiong Hu Luo wrote:
>>   To simplify development, the GCC pass manager differentiates
>> -between normal inter-procedural passes and small inter-procedural
>> -passes.  A @emph{small inter-procedural pass}
>> +between normal inter-procedural passes @pxref{All regular ipa passes}
>> +and small inter-procedural passes@pxref{All small ipa passes}
>> +& @pxref{All late ipa passes}. A @emph{small inter-procedural pass}
>>   (@code{SIMPLE_IPA_PASS}) is a pass that does
> 
> To simplify development, the GCC pass manager differentiates
> between normal inter-procedural passes @pxref{All regular IPA passes},
> simple inter-procedural passes@pxref{All simple IPA passes},
> and late inter-procedural passes @pxref{All late IPA passes}.
> A @emph{simple inter-procedural pass}
> (@code{SIMPLE_IPA_PASS}) is a pass that does
> 
> Don't use "&", write out the word.  IPA is an abbreviation, not a word,
> so it should be written in all caps.  Enumerations are written like this,
> that, and something else.  Is there a reason you used "small" instead of
> "simple"?

The updated output in gccint.pdf will be as below, and the references are valid
jump targets (I suppose it should be the same in info?):


To simplify development, the GCC pass manager differentiates between normal
interprocedural passes see Section 9.4.2 [All regular IPA passes], page 127,
small inter-procedural passes see Section 9.4.1 [All small IPA passes],
page 127 and late inter-procedural passes see Section 9.4.3 [All late IPA
passes], page 128. A small inter-procedural pass (SIMPLE_IPA_PASS) is a pass
that does everything at once and thus it cannot be executed during WPA in
WHOPR mode. It defines only the Execute stage and during this stage it
accesses and modifies the function bodies. Such passes are useful for
optimization at LGEN or LTRANS time and are used, for example, to implement
early optimization before writing object files. The simple inter-procedural
passes can also be used for easier prototyping and development of a new
inter-procedural pass.

And:

9.4 IPA passes
The following briefly describes the IPA optimization passes including all
small IPA passes, all regular IPA passes and all late IPA passes.
9.4.1 All small IPA passes
 ipa free lang data
 ipa function and variable visibility
 ipa remove symbols



The reason for "small" is that in passes.def it is used as
INSERT_PASSES_AFTER (all_small_ipa_passes), and in cgraphunit.c:
ipa_passes (void)
{
... 
  execute_ipa_pass_list (passes->all_small_ipa_passes);
...
}

So is it OK to use small here?

Thanks and welcome for other inputs.
One more thing to worry about is my poor English; also, I am not familiar with
all 30+ IPA passes, so it is not easy to extract exact explanations for them.

Xiong Hu
BR

> 
>> diff --git a/gcc/doc/passes.texi b/gcc/doc/passes.texi
>> index 6edb9a0bfb7..0b6cf73469c 100644
>> --- a/gcc/doc/passes.texi
>> +++ b/gcc/doc/passes.texi
>> @@ -21,6 +21,7 @@ where near complete.
>>   * Gimplification pass::  The bits are turned into something we can 
>> optimize.
>>   * Pass manager:: Sequencing the optimization passes.
>>   * Tree SSA passes::  Optimizations on a high-level representation.
>> +* IPA passes::   Optimizations on scope of intra-procedual.
>>   * RTL passes::   Optimizations on a low-level representation.
> 
> I'd just say "Intra-procedural optimizations."  And maybe order this
> differently?  IPA, SSA, RTL, the same order as they run, and this also
> keeps the "high/low-level representation" together, which reads a bit
> better.
> 
> The rest should be worked out a bit more, but looks promising.
> 
> Did you test this with both "make info" and "make pdf" (and checked the
> result of those of course :-) )?
> 
> Thanks,
> 
> 
> Segher
> 



Re: [PATCH v5] Missed function specialization + partial devirtualization

2019-09-27 Thread luoxhu

Hi Martin,

Thanks for your time over so many rounds of review.
It really helped me a lot.
Updated with your comments and attached for Honza's review and approval.  :)


Xiong Hu
BR

On 2019/9/26 16:36, Martin Liška wrote:

On 9/26/19 7:23 AM, luoxhu wrote:

Thanks Martin,


On 2019/9/25 18:57, Martin Liška wrote:

On 9/25/19 5:45 AM, luoxhu wrote:

Hi,

Sorry for replying so late due to cauldron conference and other LTO issues
I was working on.


Hello.

That's fine, we still have plenty of time for patch review.

Not fixed issues which I reported in v3 (and still valid in v4):
- please come up with indirect_target_info::indirect_target_info and use it

Sorry for missing that.


Hello.

Sure, please use a constructor initialization (see my patch).





- do you need to stream out indirect_call_targets when common_target_id == 0?


No need to stream out items with common_target_id == 0, removed the if 
condition in lto-cgraph.c.


Fine.  Do we have a guarantee that item->common_target_id is always != 0?
Please put an assert there.





Then I'm suggesting to use vec::is_empty (please see my patch).

OK.  But has_multiple_indirect_call_p should return something different from
has_indirect_call_p, as it checks for more than one target?


Sure, that was mistake in my patch from previous reply.



gcc/cgraph.c
/* Return true if this edge has multiple indirect call targets.  */
  bool
  cgraph_edge::has_multiple_indirect_call_p (void)
  {
-  return indirect_info && indirect_info->indirect_call_targets
-    && indirect_info->indirect_call_targets->length () > 1;
+  return (indirect_info && indirect_info->indirect_call_targets
+ && indirect_info->indirect_call_targets->length () > 1);
  }

  /* Return true if this edge has at least one indirect call target.  */
  bool
  cgraph_edge::has_indirect_call_p (void)
  {
-  return indirect_info && indirect_info->indirect_call_targets
-    && indirect_info->indirect_call_targets->length ();
+  return (indirect_info && indirect_info->indirect_call_targets
+ && !indirect_info->indirect_call_targets->is_empty ());
  }



I see the following failures for the tests provided:
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-1.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-2.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/indir-call-prof-topn.c compilation,  -fprofile-generate 
-D_PROFILE_GENERATE


Sorry that I forgot to remove the deprecated build option in the 3 cases
(also updated the scan exp check):
-/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate --param 
indir-call-topn-profile=1" } */
+/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate" } */


The new patch is attached.  Thanks.


Hm, looking at the gimple_ic_transform function, I think it should always
return false, as it never does a GIMPLE transformation.

Apart from that, I'm fine with the patch.  Note that I'm not the maintainer,
but I bet we have simplified the patch review for Honza significantly.

The last missing piece is probably the updated ChangeLog.

Thank you for working on that,
Martin




Xiong Hu



Next comments follow directly in the email body:



v4 Changes:
   1. Rebase to trunk.
   2. Remove num_of_ics and use vector's length to avoid redundancy.
   3. Update the code in ipa-profile.c to improve review feasibility.
   4. Add function has_indirect_call_p and has_multiple_indirect_call_p.
   5. For parameter control, I will leave it to a follow-up patch as it is a
  relatively independent feature.  Currently, the maximum number of
  promotions is GCOV_TOPN_VALUES, as profile generation records only 4
  profiled values, so the minimum probability is adjusted to 25%
  (100% / GCOV_TOPN_VALUES) in value-prof.c; it was hard-coded to 75% for
  the single indirect target case.  There is no control over the minimal
  number of edge executions yet.  What's more, this patch is already a bit
  large.

This patch aims to fix PR69678, which is caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull-back of
the topN patches; performance got a GEOMEAN ~1% improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch enables the use of
MULTIPLE indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result,
function specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
    1.  PGO with topn is enabled by default now, but only one indirect
    target edge will be generated in ipa-profile pass, so add variables to 
enable
    multiple speculative edges through passes, speculative_id will record the
    direc

Re: [PATCH v4] Missed function specialization + partial devirtualization

2019-09-25 Thread luoxhu

Thanks Martin,


On 2019/9/25 18:57, Martin Liška wrote:

On 9/25/19 5:45 AM, luoxhu wrote:

Hi,

Sorry for replying so late due to cauldron conference and other LTO issues
I was working on.


Hello.

That's fine, we still have plenty of time for patch review.

Not fixed issues which I reported in v3 (and still valid in v4):
- please come up with indirect_target_info::indirect_target_info and use it

Sorry for missing that.



- do you need to stream out indirect_call_targets when common_target_id == 0?


No need to stream out items with common_target_id == 0, removed the if 
condition in lto-cgraph.c.




Then I'm suggesting to use vec::is_empty (please see my patch).
OK.  But has_multiple_indirect_call_p should return something different from
has_indirect_call_p, as it checks for more than one target?


gcc/cgraph.c
/* Return true if this edge has multiple indirect call targets.  */
 bool
 cgraph_edge::has_multiple_indirect_call_p (void)
 {
-  return indirect_info && indirect_info->indirect_call_targets
-&& indirect_info->indirect_call_targets->length () > 1;
+  return (indirect_info && indirect_info->indirect_call_targets
+ && indirect_info->indirect_call_targets->length () > 1);
 }

 /* Return true if this edge has at least one indirect call target.  */
 bool
 cgraph_edge::has_indirect_call_p (void)
 {
-  return indirect_info && indirect_info->indirect_call_targets
-&& indirect_info->indirect_call_targets->length ();
+  return (indirect_info && indirect_info->indirect_call_targets
+ && !indirect_info->indirect_call_targets->is_empty ());
 }



I see the following failures for the tests provided:
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-1.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/crossmodule-indir-call-topn-2.c compilation,  
-fprofile-generate -D_PROFILE_GENERATE
FAIL: gcc.dg/tree-prof/indir-call-prof-topn.c compilation,  -fprofile-generate 
-D_PROFILE_GENERATE


Sorry that I forgot to remove the deprecated build option in the 3 cases
(also updated the scan exp check):
-/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate --param 
indir-call-topn-profile=1" } */

+/* { dg-options "-O2 -flto -DDOJOB=1 -fdump-ipa-profile_estimate" } */


The new patch is attached.  Thanks.


Xiong Hu



Next comments follow directly in the email body:



v4 Changes:
  1. Rebase to trunk.
  2. Remove num_of_ics and use vector's length to avoid redundancy.
  3. Update the code in ipa-profile.c to improve review feasibility.
  4. Add function has_indirect_call_p and has_multiple_indirect_call_p.
  5. For parameter control, I will leave it to a follow-up patch as it is a
 relatively independent feature.  Currently, the maximum number of
 promotions is GCOV_TOPN_VALUES, as profile generation records only 4
 profiled values, so the minimum probability is adjusted to 25%
 (100% / GCOV_TOPN_VALUES) in value-prof.c; it was hard-coded to 75% for
 the single indirect target case.  There is no control over the minimal
 number of edge executions yet.  What's more, this patch is already a bit
 large.

This patch aims to fix PR69678, which is caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull-back of
the topN patches; performance got a GEOMEAN ~1% improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch enables the use of
MULTIPLE indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result,
function specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
   1.  PGO with topn is enabled by default now, but only one indirect
   target edge will be generated in ipa-profile pass, so add variables to enable
   multiple speculative edges through passes, speculative_id will record the
   direct edge index bind to the indirect edge, indirect_call_targets length
   records how many direct edges owned by the indirect edge, postpone gimple_ic
   to ipa-profile like default as inline pass will decide whether it is benefit
   to transform indirect call.
   2.  Use speculative_id to track and search the reference node matched
   with the direct edge's callee for multiple targets.  Actually, it is the
   caller's responsibility to handle the direct edges mapped to same indirect
   edge.  speculative_call_info will return one of the direct edge specified,
   this will leverage current IPA edge process framework mostly.
   3.  Enable LTO WPA/LTRANS stage multiple indirect call targets analysis for
   profile full support in ipa passes and cgraph_edge functions.  speculative_id
   can be set by make_speculative id when multiple targets are binded to
   one indirect edge, and cloned if n

[PATCH v4] Missed function specialization + partial devirtualization

2019-09-24 Thread luoxhu
Hi,

Sorry for replying so late due to cauldron conference and other LTO issues
I was working on.

v4 Changes:
 1. Rebase to trunk.
 2. Remove num_of_ics and use vector's length to avoid redundancy.
 3. Update the code in ipa-profile.c to improve review feasibility.
 4. Add function has_indirect_call_p and has_multiple_indirect_call_p.
 5. For parameter control, I will leave it to a follow-up patch as it is a
    relatively independent feature.  Currently, the maximum number of
    promotions is GCOV_TOPN_VALUES, as profile generation records only 4
    profiled values, so the minimum probability is adjusted to 25%
    (100% / GCOV_TOPN_VALUES) in value-prof.c; it was hard-coded to 75% for
    the single indirect target case.  There is no control over the minimal
    number of edge executions yet.  What's more, this patch is already a bit
    large.

This patch aims to fix PR69678, which is caused by PGO indirect call profiling
performance issues.
The bug that profiling data never worked was fixed by Martin's pull-back of
the topN patches; performance got a GEOMEAN ~1% improvement (+24% for
511.povray_r specifically).
Still, the default profile currently generates only a SINGLE indirect target
that is called more than 75% of the time.  This patch enables the use of
MULTIPLE indirect targets in the LTO-WPA and LTO-LTRANS stages; as a result,
function specialization, profiling, partial devirtualization, inlining and
cloning can be done successfully based on it.
Performance improves from 0.70 sec to 0.38 sec on simple tests.
Details are:
  1.  PGO with topN is enabled by default now, but only one indirect
  target edge will be generated in the ipa-profile pass, so add variables to
  enable multiple speculative edges through the passes: speculative_id records
  the direct edge index bound to the indirect edge, the length of
  indirect_call_targets records how many direct edges are owned by the
  indirect edge, and gimple_ic is postponed to ipa-profile as in the default
  flow, since the inline pass will decide whether it is beneficial to
  transform the indirect call.
  2.  Use speculative_id to track and search the reference node matched
  with the direct edge's callee for multiple targets.  Actually, it is the
  caller's responsibility to handle the direct edges mapped to the same
  indirect edge.  speculative_call_info will return one specified direct
  edge; this mostly leverages the current IPA edge processing framework.
  3.  Enable multiple indirect call target analysis at the LTO WPA/LTRANS
  stage for full profile support in the IPA passes and cgraph_edge functions.
  speculative_id can be set by make_speculative when multiple targets are
  bound to one indirect edge, and is cloned if a new edge is cloned.
  speculative_id is streamed out and streamed in by LTO like lto_stmt_uid.
  4.  Add 1 in-module testcase and 2 cross-module testcases.
  5.  Bootstrap and regression test passed on Power8-LE.  No function
  or performance regressions for SPEC2017.
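
Not part of the patch, but to illustrate the bookkeeping described in points
1-3 above, here is a small self-contained toy model (plain C++, all names made
up; this is not GCC code) of one indirect call site carrying up to
GCOV_TOPN_VALUES profiled targets, each tagged with a speculative_id, with
promotion gated by the 25% minimum probability mentioned in change 5:

#include <cstdio>
#include <vector>

/* Toy stand-ins for the patch's concepts; not GCC internals.  */
constexpr unsigned GCOV_TOPN_VALUES = 4;                 /* at most 4 profiled targets */
constexpr int MIN_PROB_PERCENT = 100 / GCOV_TOPN_VALUES; /* 25% threshold */

struct indirect_target_info
{
  unsigned common_target_id;   /* profile id of the candidate callee */
  int probability_percent;     /* share of the call site's executions */
};

struct indirect_edge
{
  std::vector<indirect_target_info> targets;   /* like indirect_call_targets */
};

int
main ()
{
  indirect_edge e;
  e.targets = { { 1001, 40 }, { 1002, 30 }, { 1003, 20 }, { 1004, 10 } };

  /* Promote every target that clears the threshold; the index of the
     promoted target plays the role of speculative_id, so several direct
     edges can later be mapped back to this one indirect edge.  */
  for (unsigned spec_id = 0; spec_id < e.targets.size (); spec_id++)
    if (e.targets[spec_id].probability_percent >= MIN_PROB_PERCENT)
      printf ("promote target %u as speculative_id %u (%d%%)\n",
              e.targets[spec_id].common_target_id, spec_id,
              e.targets[spec_id].probability_percent);
  return 0;
}

With the example profile above only the 40% and 30% targets get promoted,
which matches the intent of lowering the old single-target 75% threshold to
25% once up to four targets can be recorded per call site.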

gcc/ChangeLog

2019-09-25  Xiong Hu Luo  

PR ipa/69678
* cgraph.c (symbol_table::create_edge): Init speculative_id.
(cgraph_edge::make_speculative): Add param for setting speculative_id.
(cgraph_edge::speculative_call_info): Find reference by
speculative_id for multiple indirect targets.
(cgraph_edge::resolve_speculation): Decrease the speculations
for the indirect edge; drop its speculative flag if no direct target is
left.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_node::verify_node): Don't report an error if a speculative
edge does not include a statement.
(cgraph_edge::has_multiple_indirect_call_p): New function.
(cgraph_edge::has_indirect_call_p): New function.
* cgraph.h (struct indirect_target_info): New struct.
(indirect_call_targets): New vector variable.
(make_speculative): Add param for setting speculative_id.
(cgraph_edge::has_multiple_indirect_call_p): New declare.
(cgraph_edge::has_indirect_call_p): New declare.
(speculative_id): New variable.
* cgraphclones.c (cgraph_node::create_clone): Clone speculative_id.
* ipa-inline.c (inline_small_functions): Fix iterator update.
* ipa-profile.c (ipa_profile_generate_summary): Add indirect
multiple targets logic.
(ipa_profile): Likewise.
* ipa-ref.h (speculative_id): New variable.
* ipa.c (process_references): Fix typo.
* lto-cgraph.c (lto_output_edge): Add indirect multiple targets
logic.  Stream out speculative_id.
(input_edge): Likewise.
* predict.c (dump_prediction): Remove edges count assert to be
precise.
* symtab.c (symtab_node::create_reference): Init speculative_id.
(symtab_node::clone_references): Clone speculative_id.
(symtab_node::clone_referring): Clone speculative_id.
(symtab_node::clone_reference): Clone speculative_id.
(symtab_node::clear_stmts_in_references): Clear speculative_id.
* tree-inline.c (copy_bb): Duplicate all the 

[PATCH] Backport r274411 from trunk to gcc-9-branch

2019-08-25 Thread luoxhu
This is the backport patch for gcc-9-branch; please ignore the previous
mail.


Backport r274411 of "Enable math functions linking with static library
for LTO" from mainline to gcc-9-branch.

Bootstrapped/Regression-tested on Linux POWER8 LE.

gcc/ChangeLog
2019-08-26  Xiong Hu Luo  

Backport r274411 from trunk to gcc-9-branch.
2019-08-14  Xiong Hu Luo  

PR lto/91287
* builtins.c (builtin_with_linkage_p): New function.
* builtins.h (builtin_with_linkage_p): New function.
* symtab.c (write_symbol): Remove redundant assert.
* lto-streamer-out.c (symtab_node::output_to_lto_symbol_table_p):
Remove FIXME and use builtin_with_linkage_p.
---
 gcc/builtins.c | 87 ++
 gcc/builtins.h |  1 +
 gcc/lto-streamer-out.c |  6 ---
 gcc/symtab.c   | 16 ++--
 4 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 9bcb310c015..ed11f79ff0b 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -11236,3 +11236,90 @@ target_char_cst_p (tree t, char *p)
   *p = (char)tree_to_uhwi (t);
   return true;
 }
+
+/* Return true if the builtin DECL is implemented in a standard library.
+   Otherwise returns false which doesn't guarantee it is not (thus the list of
+   handled builtins below may be incomplete).  */
+
+bool
+builtin_with_linkage_p (tree decl)
+{
+  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
+switch (DECL_FUNCTION_CODE (decl))
+{
+  CASE_FLT_FN (BUILT_IN_ACOS):
+  CASE_FLT_FN (BUILT_IN_ACOSH):
+  CASE_FLT_FN (BUILT_IN_ASIN):
+  CASE_FLT_FN (BUILT_IN_ASINH):
+  CASE_FLT_FN (BUILT_IN_ATAN):
+  CASE_FLT_FN (BUILT_IN_ATANH):
+  CASE_FLT_FN (BUILT_IN_ATAN2):
+  CASE_FLT_FN (BUILT_IN_CBRT):
+  CASE_FLT_FN (BUILT_IN_CEIL):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_CEIL):
+  CASE_FLT_FN (BUILT_IN_COPYSIGN):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
+  CASE_FLT_FN (BUILT_IN_COS):
+  CASE_FLT_FN (BUILT_IN_COSH):
+  CASE_FLT_FN (BUILT_IN_ERF):
+  CASE_FLT_FN (BUILT_IN_ERFC):
+  CASE_FLT_FN (BUILT_IN_EXP):
+  CASE_FLT_FN (BUILT_IN_EXP2):
+  CASE_FLT_FN (BUILT_IN_EXPM1):
+  CASE_FLT_FN (BUILT_IN_FABS):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS):
+  CASE_FLT_FN (BUILT_IN_FDIM):
+  CASE_FLT_FN (BUILT_IN_FLOOR):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_FLOOR):
+  CASE_FLT_FN (BUILT_IN_FMA):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMA):
+  CASE_FLT_FN (BUILT_IN_FMAX):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMAX):
+  CASE_FLT_FN (BUILT_IN_FMIN):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMIN):
+  CASE_FLT_FN (BUILT_IN_FMOD):
+  CASE_FLT_FN (BUILT_IN_FREXP):
+  CASE_FLT_FN (BUILT_IN_HYPOT):
+  CASE_FLT_FN (BUILT_IN_ILOGB):
+  CASE_FLT_FN (BUILT_IN_LDEXP):
+  CASE_FLT_FN (BUILT_IN_LGAMMA):
+  CASE_FLT_FN (BUILT_IN_LLRINT):
+  CASE_FLT_FN (BUILT_IN_LLROUND):
+  CASE_FLT_FN (BUILT_IN_LOG):
+  CASE_FLT_FN (BUILT_IN_LOG10):
+  CASE_FLT_FN (BUILT_IN_LOG1P):
+  CASE_FLT_FN (BUILT_IN_LOG2):
+  CASE_FLT_FN (BUILT_IN_LOGB):
+  CASE_FLT_FN (BUILT_IN_LRINT):
+  CASE_FLT_FN (BUILT_IN_LROUND):
+  CASE_FLT_FN (BUILT_IN_MODF):
+  CASE_FLT_FN (BUILT_IN_NAN):
+  CASE_FLT_FN (BUILT_IN_NEARBYINT):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_NEARBYINT):
+  CASE_FLT_FN (BUILT_IN_NEXTAFTER):
+  CASE_FLT_FN (BUILT_IN_NEXTTOWARD):
+  CASE_FLT_FN (BUILT_IN_POW):
+  CASE_FLT_FN (BUILT_IN_REMAINDER):
+  CASE_FLT_FN (BUILT_IN_REMQUO):
+  CASE_FLT_FN (BUILT_IN_RINT):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_RINT):
+  CASE_FLT_FN (BUILT_IN_ROUND):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_ROUND):
+  CASE_FLT_FN (BUILT_IN_SCALBLN):
+  CASE_FLT_FN (BUILT_IN_SCALBN):
+  CASE_FLT_FN (BUILT_IN_SIN):
+  CASE_FLT_FN (BUILT_IN_SINH):
+  CASE_FLT_FN (BUILT_IN_SINCOS):
+  CASE_FLT_FN (BUILT_IN_SQRT):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_SQRT):
+  CASE_FLT_FN (BUILT_IN_TAN):
+  CASE_FLT_FN (BUILT_IN_TANH):
+  CASE_FLT_FN (BUILT_IN_TGAMMA):
+  CASE_FLT_FN (BUILT_IN_TRUNC):
+  CASE_FLT_FN_FLOATN_NX (BUILT_IN_TRUNC):
+   return true;
+  default:
+   break;
+}
+  return false;
+}
diff --git a/gcc/builtins.h b/gcc/builtins.h
index 1ffb491d785..66c9295ff4a 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -150,5 +150,6 @@ extern internal_fn replacement_internal_fn (gcall *);
 
 extern void warn_string_no_nul (location_t, const char *, tree, tree);
 extern tree unterminated_array (tree, tree * = NULL, bool * = NULL);
+extern bool builtin_with_linkage_p (tree);
 
 #endif /* GCC_BUILTINS_H */
diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c
index b6e395b53fd..e07085770a5 100644
--- a/gcc/lto-streamer-out.c
+++ b/gcc/lto-streamer-out.c
@@ -2617,12 +2617,6 @@ write_symbol (struct streamer_tree_cache_d *cache,
   const char *comdat;
   unsigned char 

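One note on reading the builtins.c hunk above: the CASE_FLT_FN and
CASE_FLT_FN_FLOATN_NX macros each expand one enumerator into the case labels
for the related floating-point variants, which is why a single line covers
sin, sinf and sinl at once.  Roughly (a simplified sketch of the idea, with a
made-up toy enum; see builtins.h for the real definitions):

/* Simplified sketch, not the exact GCC definition.  */
#define CASE_FLT_FN(FN) case FN: case FN##F: case FN##L

/* Toy illustration of how one macro line covers three enumerators.  */
enum toy_builtin { BUILT_IN_SIN, BUILT_IN_SINF, BUILT_IN_SINL, BUILT_IN_PRINTF };

static int
toy_with_linkage_p (enum toy_builtin code)
{
  switch (code)
    {
    CASE_FLT_FN (BUILT_IN_SIN):   /* matches sin, sinf and sinl */
      return 1;
    default:
      break;
    }
  return 0;
}

int
main (void)
{
  /* sinf is recognized via the single CASE_FLT_FN (BUILT_IN_SIN) label.  */
  return !toy_with_linkage_p (BUILT_IN_SINF);
}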