Re: [Bug libstdc++/109442] Dead local copy of std::vector not removed from function

2024-05-14 Thread Jan Hubicka via Gcc-bugs
This patch attempts to add __builtin_operator_new/delete. So far they
are not optimized, which will need to be done by an extra flag of the BUILT_IN_
code.  Also the decl.cc code can be refactored to be less of a cut-and-paste,
and I guess the has_builtin hack to return the proper value needs to be moved
to the C++ FE.

However, the immediate problem I ran into is that the libstdc++ testsuite
fails due to the lack of std::nothrow overrides.  I wonder how to get that
working?
diff --git a/gcc/c-family/c-lex.cc b/gcc/c-family/c-lex.cc
index ff5ce2bf729..602b097059c 100644
--- a/gcc/c-family/c-lex.cc
+++ b/gcc/c-family/c-lex.cc
@@ -533,6 +533,10 @@ c_common_has_builtin (cpp_reader *pfile)
   if (!name)
 return 0;
 
+  if (!strcmp (name, "__builtin_operator_new")
+  || !strcmp (name, "__builtin_operator_delete"))
+return 201802L;
+
   return names_builtin_p (name);
 }
 
diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 3fc8835154d..90b100ca3dc 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -59,6 +59,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "omp-general.h"
 #include "omp-offload.h"  /* For offload_vars.  */
 #include "opts.h"
+#include "print-tree.h"
 #include "langhooks-def.h"  /* For lhd_simulate_record_decl  */
 
 /* Possible cases of bad specifiers type used by bad_specifiers. */
@@ -5048,13 +5049,27 @@ cxx_init_decl_processing (void)
 DECL_IS_MALLOC (opnew) = 1;
 DECL_SET_IS_OPERATOR_NEW (opnew, true);
 DECL_IS_REPLACEABLE_OPERATOR (opnew) = 1;
+tree builtin_opnew = build_cp_library_fn 
(get_identifier("__builtin_operator_new"),
+ NEW_EXPR, newtype, 0);
+DECL_IS_MALLOC (builtin_opnew) = 1;
+DECL_SET_IS_OPERATOR_NEW (builtin_opnew, true);
+DECL_IS_REPLACEABLE_OPERATOR (builtin_opnew) = 1;
+SET_DECL_ASSEMBLER_NAME (builtin_opnew, DECL_ASSEMBLER_NAME (opnew));
+pushdecl (builtin_opnew);
 opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0);
 DECL_IS_MALLOC (opnew) = 1;
 DECL_SET_IS_OPERATOR_NEW (opnew, true);
 DECL_IS_REPLACEABLE_OPERATOR (opnew) = 1;
+
 tree opdel = push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW);
 DECL_SET_IS_OPERATOR_DELETE (opdel, true);
 DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
+tree builtin_opdel = build_cp_library_fn 
(get_identifier("__builtin_operator_delete"),
+ DELETE_EXPR, deltype, 
ECF_NOTHROW);
+DECL_SET_IS_OPERATOR_DELETE (builtin_opdel, true);
+DECL_IS_REPLACEABLE_OPERATOR (builtin_opdel) = 1;
+SET_DECL_ASSEMBLER_NAME (builtin_opdel, DECL_ASSEMBLER_NAME (opdel));
+pushdecl (builtin_opdel);
 opdel = push_cp_library_fn (VEC_DELETE_EXPR, deltype, ECF_NOTHROW);
 DECL_SET_IS_OPERATOR_DELETE (opdel, true);
 DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
@@ -5072,6 +5087,12 @@ cxx_init_decl_processing (void)
opdel = push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW);
DECL_SET_IS_OPERATOR_DELETE (opdel, true);
DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
+   builtin_opdel = build_cp_library_fn 
(get_identifier("__builtin_operator_delete"),
+DELETE_EXPR, deltype, ECF_NOTHROW);
+   DECL_SET_IS_OPERATOR_DELETE (builtin_opdel, true);
+   DECL_IS_REPLACEABLE_OPERATOR (builtin_opdel) = 1;
+   SET_DECL_ASSEMBLER_NAME (builtin_opdel, DECL_ASSEMBLER_NAME (opdel));
+   pushdecl (builtin_opdel);
opdel = push_cp_library_fn (VEC_DELETE_EXPR, deltype, ECF_NOTHROW);
DECL_SET_IS_OPERATOR_DELETE (opdel, true);
DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
@@ -5094,6 +5115,13 @@ cxx_init_decl_processing (void)
DECL_IS_MALLOC (opnew) = 1;
DECL_SET_IS_OPERATOR_NEW (opnew, true);
DECL_IS_REPLACEABLE_OPERATOR (opnew) = 1;
+   builtin_opnew = build_cp_library_fn 
(get_identifier("__builtin_operator_new"),
+VEC_NEW_EXPR, newtype, 0);
+   DECL_IS_MALLOC (builtin_opnew) = 1;
+   DECL_SET_IS_OPERATOR_NEW (builtin_opnew, true);
+   DECL_IS_REPLACEABLE_OPERATOR (builtin_opnew) = 1;
+   SET_DECL_ASSEMBLER_NAME (builtin_opnew, DECL_ASSEMBLER_NAME (opnew));
+   pushdecl (builtin_opnew);
opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0);
DECL_IS_MALLOC (opnew) = 1;
DECL_SET_IS_OPERATOR_NEW (opnew, true);
@@ -5107,6 +5135,12 @@ cxx_init_decl_processing (void)
opdel = push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW);
DECL_SET_IS_OPERATOR_DELETE (opdel, true);
DECL_IS_REPLACEABLE_OPERATOR (opdel) = 1;
+   builtin_opdel = build_cp_library_fn 
(get_identifier("__builtin_operator_delete"),
+DELETE_EXPR, deltype, ECF_NOTHROW);
+   DECL_SET_IS_OPERATOR_DELETE (builtin_opdel, true);
+   DECL_IS_REPLACEABLE_OPERATOR (builtin_opdel) = 1;
+   SET_DECL_ASSEMBLER_NAME (builtin_opdel, DECL_ASSEMBLER_NAME (opdel));
+   pushdecl 

Re: [Bug ipa/113907] [11/12/13/14 regression] ICU miscompiled on x86 since r14-5109-ga291237b628f41

2024-04-09 Thread Jan Hubicka via Gcc-bugs
There is still a problem with loop bounds.  I am testing a patch for that and
then we should be (finally) finally safe.


Re: [Bug ipa/114262] Over-inlining when optimizing for size with gnu_inline function

2024-03-07 Thread Jan Hubicka via Gcc-bugs
> Note GCC has not retuned its -Os heurstics for a long time because it has been
> decent enough for most folks and corner cases like this is almost never come
> up.
There were quite a few changes to -Os heuristics :)
One of the bigger challenges is that we do see more and more C++ code built
with -Os which relies on certain functions being inlined and optimized
in context, so we had to get more optimistic in the hope that inlined code
will optimize well.

COMDAT functions are more likely to be inlined because statistics show that
many of them are not really shared between translation units
(see the --param=comdat-sharing-probability parameter). This was necessary to
get reasonable code for Firefox approx 15 years ago.


Re: [Bug target/114232] [14 regression] ICE when building rr-5.7.0 with LTO on x86

2024-03-05 Thread Jan Hubicka via Gcc-bugs
Looking at the prototype patch, why do we also need to change the splitters?

My original goal was to use splitters to expand to faster code sequences
while having patterns necessary for both variants.  This makes it
possible to use optimize_insn_for_size/speed and make decisions using BB
profile, since we will not ICE if the hotness of BB changes later.


Re: [Bug tree-optimization/113787] [12/13/14 Regression] Wrong code at -O with ipa-modref on aarch64

2024-02-14 Thread Jan Hubicka via Gcc-bugs
> > I guess PTA gets around by tracking points-to set also for non-pointer
> > types and consequently it also gives up on any such addition.
> 
> It does.  But note it does _not_ for POINTER_PLUS where it treats
> the offset operand as non-pointer.
> 
> > I think it is ipa-prop.c::unadjusted_ptr_and_unit_offset. It accepts
> > pointer_plus expression, but does not look through POINTER_PLUS.
> > We can restrict it further, but tracking base pointer is quite useful,
> > so it would be nice to not give up completely.
> 
> It looks like that function might treat that
> 
>  ADDR_EXPR >
> 
> as integer_zerop base.  It does
> 
>   if (TREE_CODE (op) == ADDR_EXPR) 
> {
>   poly_int64 extra_offset = 0; 
>   tree base = get_addr_base_and_unit_offset (TREE_OPERAND (op, 0),
>  );
>   if (!base)
> {
>   base = get_base_address (TREE_OPERAND (op, 0));
>   if (TREE_CODE (base) != MEM_REF)
> break;
>   offset_known = false;
> }
>   else
> {
>   if (TREE_CODE (base) != MEM_REF)
> break;
> 
> with a variable offset we fall to the TREE_CODE (base) != MEM_REF
> and will have offset_known == true.  Not sure what it does with
> the result though (it's not the address of a decl).
> 
> This function seems to oddly special-case != MEM_REF ... (maybe
> it wants to hande DECL_P () as finishing?

Hmm the function was definitely not written with TARGET_MEM_REF in mind,
since it was originally used for IPA passes only.
We basically want to handle stuff like
 >foo
or
 &(ptr->foo)
In the second case we want to continue the SSA walk to hopefully work
out the origin of PTR.
ipa-modref then looks if the base pointer is derived from function
parameter or points to local or readonly memory to produce its summary.
> 
> Note get_addr_base_and_unit_offset will return NULL for
> a TARGET_MEM_REF <, ..., offset> but TARGET_MEM_REF
> itself if the base isn't an ADDR_EXPR, irrespective of whether
> the offset within it is constant or not.

Hmm, interesting.  I would expect it to interpret the semantics of TMR
and return the base.
> 
> Not sure if the above is a problem, but it seems the only caller
> will just call points_to_local_or_readonly_memory_p on the
> ADDR_EXPR where refs_local_or_readonly_memory_p via
> points_to_local_or_readonly_memory_p will eventually do
> 
>   /* See if memory location is clearly invalid.  */
>   if (integer_zerop (t))
> return flag_delete_null_pointer_checks;
> 
> and that might be a problem.  As said, we rely on
> ADDR_EXPR  > to be an address computation
> that's not subject to strict interpretation to allow IVOPTs
> doing this kind of optimization w/o introducing some kind of
> INTEGER_LEA <...>.  I know that's a bit awkward but we should
> make sure this is honored by IPA as well.
> 
> I'd say
> 
> diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
> index 74c9b4e1d1e..45a770cf940 100644
> --- a/gcc/ipa-fnsummary.cc
> +++ b/gcc/ipa-fnsummary.cc
> @@ -2642,7 +2642,8 @@ points_to_local_or_readonly_memory_p (tree t)
> return true;
>return !ptr_deref_may_alias_global_p (t, false);
>  }
> -  if (TREE_CODE (t) == ADDR_EXPR)
> +  if (TREE_CODE (t) == ADDR_EXPR
> +  && TREE_CODE (TREE_OPERAND (t, 0)) != TARGET_MEM_REF)
>  return refs_local_or_readonly_memory_p (TREE_OPERAND (t, 0));
>return false;
>  }
> 
> might eventually work?  Alternatively a bit less aggressive like
> the following.
> 
> diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
> index 74c9b4e1d1e..7c79adf6440 100644
> --- a/gcc/ipa-fnsummary.cc
> +++ b/gcc/ipa-fnsummary.cc
> @@ -2642,7 +2642,9 @@ points_to_local_or_readonly_memory_p (tree t)
> return true;
>return !ptr_deref_may_alias_global_p (t, false);
>  }
> -  if (TREE_CODE (t) == ADDR_EXPR)
> +  if (TREE_CODE (t) == ADDR_EXPR
> +  && (TREE_CODE (TREE_OPERAND (t, 0)) != TARGET_MEM_REF
> + || TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0)) != 
> INTEGER_CST))
>  return refs_local_or_readonly_memory_p (TREE_OPERAND (t, 0));
>return false;
>  }

Yes, those both look reasonable to me; perhaps the less aggressive one would be
better.
> 
> A "nicer" solution might be to add a informational operand
> to TARGET_MEM_REF, representing the base pointer to be used for
> alias/points-to purposes.  But if that's not invariant it might
> keep some otherwise unnecessary definition stmts live.

Yep, I see that forcing extra IV to track original semantics would be
trouble here.  I think that after iv-opts we should be done with more
fancy propagation across loops.

However, to avoid ipa-modref summary degradation, perhaps scheduling the
pass before ivopts would make sense...

Thanks,
Honza


Re: [Bug target/113233] LoongArch: target options from LTO objects not respected during linking

2024-01-04 Thread Jan Hubicka via Gcc-bugs
> Confirm.  But option save/restore has been always implemented:
> 
> .section.gnu.lto_.opts,"",@progbits
> .ascii  "'-fno-openmp' '-fno-openacc' '-fno-pie' '-fcf-protection"
> .ascii  "=none' '-mabi=lp64d' '-march=loongarch64' '-mfpu=64' '-m"
> .ascii  "simd=lasx' '-mcmodel=normal' '-mtune=loongarch64' '-flto"
> .ascii  "'\000"
> 
> So -msimd=lasx is correctly recorded.  Not sure why it does not work.

With LTO we need to mix code compiled with different sets of options.
For this reason we imply, for every function definition, an optimization
and a target attribute which record the flags.  So it seems the target
attribute is likely broken for this flag.


Re: [Bug middle-end/111088] useless 'xor eax,eax' inserted when a value is not returned and icf

2023-08-21 Thread Jan Hubicka via Gcc-bugs
> But adds a return with a value. And then the inliner inlines foo into foo2 but
> we still have the return with a value around ...
I guess ICF can special case unused return value, but why this is not
taken care of by ipa-sra?


Re: [Predicated Ins vs Branches] O3 and PGO result in 2x performance drop relative to O2

2023-08-01 Thread Jan Hubicka via Gcc-bugs
> > If I comment it out as above patch, then O3/PGO can get 16% and 12% 
> > performance
> > improvement compared to O2 on x86.
> >
> > O2  O3  PGO
> > cycles  2,497,674,824   2,104,993,224   2,199,753,593
> > instructions10,457,508,646  9,723,056,131   10,457,216,225
> > branches2,303,029,380   2,250,522,323   2,302,994,942
> > branch-misses   0.00%   0.01%   0.01%
> >
> > The main difference in the compilation output about code around the 
> > miss-prediction
> > branch is:
> >   o In O2: predicated instruction (cmov here) is selected to eliminate above
> > branch. cmov is true better than branch here.
> >   o In O3/PGO: bitout() is inlined into encode_file(), and branch 
> > instruction
> > is selected. But this branch is obviously *unpredictable* and the 
> > compiler
> > doesn't know it. This why O3/PGO are are so bad for this program.
> >
> > Gcc doesn't support __builtin_unpredictable() which has been introduced by 
> > llvm.
> > Then I tried to see if __builtin_expect_with_probability(e,x, 0.5) can 
> > serve the
> > same purpose. The result is negative.
> 
> But does it appear to be predictable with your profiling data?

Also, one thing is that __builtin_expect and
__builtin_expect_with_probability only affect the static branch
prediction algorithm, so with profile feedback they are ignored on every
branch executed at least once during the train run.

Setting probability 0.5 is really not exactly the same as a hint that the
branch will be mispredicted, since modern CPUs handle regularly
behaving branches well (such as a branch firing every even iteration of a loop).

So I think having the builtin is not a bad idea.  I was thinking about whether
it makes sense to represent it within the profile_probability type and I am
not convinced, since "unpredictable probability" sounds conceptually
odd and we would need to keep the flag intact over all probability
updates we do.  For things like loop exits we recompute probabilities
from frequencies after unrolling/vectorization and other things and we
would need to invent a new API to propagate the flag from the previous
probability (which is not even part of the computation right now).

So I guess the challenge is how to pass this info down through the
optimization pipeline, since we would need to annotate gimple
conds/switches and manage it to RTL level.  On gimple we have flags and
on rtl level notes so there is space for it, but we would need to
maintain the info through CFG changes.

Auto-FDO may be interesting way to detect such branches.

Honza
> 
> > I think we could come to a conclusion that there must be something can 
> > improve in
> > Gcc's heuristic strategy about Predicated Instructions and branches, at 
> > least
> > for O3 and PGO.
> >
> > And can we add __builtin_unpredictable() support for Gcc? As usually it's 
> > hard
> > for the compiler to detect unpredictable branches.
> >
> > --
> > Cheers,
> > Changbin Du


Re: [Bug tree-optimization/106293] [13/14 Regression] 456.hmmer at -Ofast -march=native regressed by 19% on zen2 and zen3 in July 2022

2023-07-28 Thread Jan Hubicka via Gcc-bugs
> This heuristic wants to catch
> 
>   
>   if (foo) abort ();
>   
> 
> and avoid sinking "too far" across a path with "similar enough"
> execution count (I think the original motivation was to fix some
> spilling / register pressure issue).  The loop depth test
> should be !(bb_loop_depth (best_bb) < bb_loop_depth (early_bb))

I am still concerned that loop_depth (bb1) < loop_depth (bb2)
does not really imply that bb1 is not in a different loop nest with
a loop with a significantly higher iteration count than bb2...
> so we shouldn't limit sinking to a more outer nest.  As we rule
> out > before this becomes ==.
> 
> It looks tempting to sink to the earliest place with the same
> execution count rather than the latest but the above doesn't
> really achive that (it doesn't look "upwards" but simply fails).
> With a guessed profile it's also going to be hard.

A statistically guessed profile works quite well for things like placement
of spills in IRA (not perfectly of course) and this looks like a kind of
similar thing.  So perhaps it could work reasonably well...
> 
> And it in no way implements register pressure / spilling sensitivity
> (see also Ajits attempts at producing a patch that avoids sinking
> across a call).  All these are ultimatively doomed unless we at least
> consider a group of stmts together.

hmm, life is hard :)
Honza


Re: [Bug target/110758] [14 Regression] 8% hmmer regression on zen1/3 with -Ofast -march=native -flto between g:8377cf1bf41a0a9d (2023-07-05 01:46) and g:3a61ca1b9256535e (2023-07-06 16:56); g:d76d19c

2023-07-21 Thread Jan Hubicka via Gcc-bugs
> I suspect this is most likely the profile updates changes ...
Quite possibly. The goal of this exercise is to figure out if there are
some bugs in the profile estimate or whether passes somehow prefer a broken
profile or if it is just bad luck.

Looking at sphinx and fatigue it seems that LRA really may prefer
increased profile counts in a peeled vectorized loop since it does not
understand the fact that putting a spill on the critical path through the
dependency graph of the code is not good for out-of-order execution.


Re: [Bug ipa/110334] [13/14 Regresssion] unused functions not eliminated before LTO streaming

2023-06-28 Thread Jan Hubicka via Gcc-bugs
> 
> why disallow caller->indirect_calls?
See testcase in comment #9
> 
> > +   return false;
> > +  for (cgraph_edge *e2 = callee->callees; e2; e2 = e2->next_callee)
> 
> I don't think this flys - it looks quadratic.  Can we compute this
> in the inline summary once instead?

I guess I can place a cache there.  I think this check will become more
global over time, so IMO it fits better here.
> 
> As for indirect calls, can we maybe mark initial direct GIMPLE call
> stmts as "always-inline" and only look at that marking, thus an
> indirect call will never become "always-inline"?  Iff cgraph edges
> prevail during all early inlining we could mark call edges for
> this purpose?

I also think we need call site specific info.
Tagging gimple call statements and copying the info to gimple edges will
probably be needed here.  We want to keep the info from early inlining
to late inlining since we output errors late.
We already have plenty of GF_CALL_ flags, so adding one should be easy?

Honza


Re: [Bug ipa/110334] [13/14 Regresssion] unused functions not eliminated before LTO streaming

2023-06-23 Thread Jan Hubicka via Gcc-bugs
Just so it is somewhere, here is a testcase that we can't inline leaf
functions to always_inlines unless we do some tracking of what calls
were formerly indirect calls.

We really overloaded always_inline from the original semantics "drop
inlining heuristics" into "be sure that the result is inlined", while for
the second it does not make sense to take its address.
Clang apparently simply does not error on failed always_inlines, which
makes its life easier.

/* Global counter: test () only makes its indirect call while n < 10 and
   increments n on every invocation.  */
int n;
typedef void (*fnptr)();
fnptr get_me();
/* always_inline function whose body contains an indirect call through the
   pointer returned by get_me ().  */
__attribute__ ((always_inline))
inline void test(void)
{
if (n < 10)
  (get_me())();
n++;
return;
}
/* Returns the address of test, so the indirect call made inside test ()
   targets test itself.  */
fnptr get_me()
{
return test;
}
/* Makes a direct call to the always_inline function.  */
void
foo()
{
test();
}



Re: [Bug libstdc++/110287] _M_check_len is expensive

2023-06-19 Thread Jan Hubicka via Gcc-bugs
> 
> There is no guarantee that std::vector::max_size() is PTRDIFF_MAX. It
> depends on the Allocator type, A. A user-defined allocator could have
> max_size() == 100.

If the inliner sees a path to the throw functions, it will not determine
_M_check_len as early inlinable.
Perhaps we can __builtin_constant_p it as well and check that
max_size () * sizeof ()
is close to ptrdiff_max.

Thanks for the comments on the patches.  I will try to update the patch.

I was wondering about the allocators. As shown in the mail, optimizing
_M_check_len still leaves two independent throws for insanely large
ops.  Since the allocator is user replaceable, I guess we can not add a new
member function for safe_allocate or so.

We can use __builtin_unreachable to set the value range on the return
value.  For that to work during early optimizations we need 

 1) extend early VRP to retrofit the value determined by
__builtin_unreachable to the SSA name defined earlier, based on the fact
that the execution can not legally terminate in between
 2) teach the inliner to ignore conditionals guarding __builtin_unreachable
 3) add support for return functions to propagate the value range from
_M_check_len to _M_reallocate_insert.
so it is correctly propagated to the allocator call.

This is not very easy, but can be generally useful elsewhere.


Re: [Bug c++/106943] GCC building clang/llvm with LTO flags causes ICE in clang

2023-05-12 Thread Jan Hubicka via Gcc-bugs
> > Indeed it is quite long time problem with clang not building with lifetime
> > DSE and strict aliasing.  I wonder why this is not fixed on clang side?
> 
> Because the problems were not communicated? I knew that Firefox needed
> -flifetime-dse=1, but it's the first time I hear that any such problems in
> Clang/LLVM were identified.
> 
> I could not find any workaround for lifetime-dse in SUSE spec file for llvm16.
> Are you saying it was known and worked around somehow? Or it is not 
> manifesting
> because LLVM is built without LTO?

I think the openSUSE package opts out of LTO probably for this reason.  I am
sometimes using LLVM as a benchmark of LTO and PGO, so it would be great
to have this enabled in the packages, but I had no time to do that so
far.  LLVM built with LTO and PGO builds quite a lot faster.  I was
filing a bug report for that some time ago and it seems that the bug report
linked above has quite a good analysis of what breaks.


Re: [Bug target/87832] AMD pipeline models are very costly size-wise

2022-11-16 Thread Jan Hubicka via Gcc-bugs
> 
> Do you mean we should fix modeling of divisions there as well? I don't have
> latency/throughput measurements for those CPUs, nor access so I can run
> experiments myself, unfortunately.
> 
> I guess you mean just making a patch to model division units separately,
> leaving latency/throughput as in current incorrect models, and leave it to
> manufacturers to correct it? Alternatively, for AMD Bobcat and Bulldozer we
> might be able to crowd-source it eventually.
Actually, for older cores I think the manufacturers do not care much.  I
still have a working Bulldozer machine and I can do some testing.
I think in the Bulldozer case I was basing the latency/throughput on data in
Agner Fog's manuals.  How do you test it?
Honza


Re: [Bug middle-end/106078] Invalid loop invariant motion with non-call-exceptions

2022-06-25 Thread Jan Hubicka via Gcc-bugs
> > For this one it's PRE hoisting *b across the endless loop (PRE handles
> > calls as possibly not returning but not loops as possibly not 
> > terminating...)
> > So it's a different bug.
> 
> Btw, C++ requiring forward progress makes the testcase undefined.
In my understanding, an access to a volatile variable is forward progress:
In a valid C++ program, every thread eventually does one of the
following:

   -terminate
   -makes a call to an I/O library function
   -performs an access through a volatile glvalue
   -performs an atomic operation or a synchronization operation 

I think one can also replace volatile access by atomics: we only need to
know the side effects of that operation.
Honza


Re: [Bug lto/105727] __builtin_constant_p expansion in LTO

2022-05-25 Thread Jan Hubicka via Gcc-bugs
> > My guess is that the
> > BUILD_BUG();
> > line is the sole thing that is wrong, it should be just break;
> > as the memory_is_poisoned_n(addr, size); will handle all the sizes,
> > regardless if they are constants or not.
> 
> Sure, I'm going to suggest such a change.
To me it looked like a protection that size is not going to be large
(or perhaps author wants to add extra special cases as they are needed)

Honza


Re: [Bug c/105728] New: dead store to static var not optimized out

2022-05-25 Thread Jan Hubicka via Gcc-bugs
> To me, all of these do the same thing and should generate the same code.
> As nobody else can see removeme, and we aren't leaking its address, shouldn't
> the compiler be able to deduce that all accesses to removeme are
> inconsequential and can be removed?
> 
> My gcc 11.3 generates a condidion and a store and a return 0 for dummy1, the
> same thing for dummy2, but for dummy3 it understands that it only needs to 
> emit
> a return 0.

GCC detects "write only" variables and that is what matches for dummy3.
I am not 100% sure it is valid to do the optimization in the other two cases
when multiple threads are considered. In any case we lack tracking
of constants stored to global variables, which is something ipa-cp can be
extended to do.

Honza


Re: [Bug rtl-optimization/102178] [12 Regression] SPECFP 2006 470.lbm regressions on AMD Zen CPUs after r12-897-gde56f95afaaa22

2022-01-27 Thread Jan Hubicka via Gcc-bugs
> I would say so.  It saves code size and also uop space unless the two
> can magically fuse to a immediate to %xmm move (I doubt that).
I made simple benchmark

double a=10;
int
main()
{
long int i;
double sum,val1,val2,val3,val4;
 for (i=0;i<10;i++)
 {
#if 1
#if 1
asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   
%%r8, %0": "=x"(val1): :"r8","xmm11");
asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   
%%r8, %0": "=x"(val2): :"r8","xmm11");
asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   
%%r8, %0": "=x"(val3): :"r8","xmm11");
asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq   
%%r8, %0": "=x"(val4): :"r8","xmm11");
#else
asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": 
"=x"(val1):"m"(a) :"r8","xmm11");
asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": 
"=x"(val2):"m"(a) :"r8","xmm11");
asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": 
"=x"(val3):"m"(a) :"r8","xmm11");
asm __volatile__("movq %1, %%r8;vmovq   %%r8, %0": 
"=x"(val4):"m"(a) :"r8","xmm11");
#endif
#else
asm __volatile__("vmovq   %1, %0": "=x"(val1):"m"(a) 
:"r8","xmm11");
asm __volatile__("vmovq   %1, %0": "=x"(val2):"m"(a) 
:"r8","xmm11");
asm __volatile__("vmovq   %1, %0": "=x"(val3):"m"(a) 
:"r8","xmm11");
asm __volatile__("vmovq   %1, %0": "=x"(val4):"m"(a) 
:"r8","xmm11");
#endif
sum+=val1+val2+val3+val4;
 }
 return sum;

and indeed the third variant runs 1.2s while the first two takes equal
time 2.4s on my zen2 laptop.


Re: [Bug rtl-optimization/102178] [12 Regression] SPECFP 2006 470.lbm regressions on AMD Zen CPUs after r12-897-gde56f95afaaa22

2022-01-27 Thread Jan Hubicka via Gcc-bugs
> > According to znver2_cost
> > 
> > Cost of sse_to_integer is a little bit less than fp_store, maybe increase
> > sse_to_integer cost(more than fp_store) can helps RA to choose memory
> > instead of GPR.
> 
> That sounds reasonable - GPR<->xmm is cheaper than GPR -> stack -> xmm
> but GPR<->xmm should be more expensive than GPR/xmm<->stack.  As said above
> Zen2 can do reg -> mem, mem -> reg via renaming if 'mem' is somewhat special,
> but modeling that doesn't seem to be necessary.
> 
> We seem to have store costs of 8 and load costs of 6, I'll try bumping the
> gpr<->xmm move cost to 8.

I was simply following latencies here, so indeed reg<->mem bypass is not
really modelled.  I recall doing few experiments which was kind of
inconclusive.


Re: [Bug tree-optimization/104203] [12 Regressions] huge compile-time regression since r12-6606-g9d6a0f388eb048f8

2022-01-24 Thread Jan Hubicka via Gcc-bugs
> > bool
> Since the pass issues a bunch other warnings (e.g., -Wstringop-overflow,
> -Wuse-after-free, etc.) the gate doesn't seem right.  But since #pragma GCC
> diagnostic can re-enable warnings disabled by -w (or turn them into errors) 
> any
> gate that considers the global option setting will also interfere with that.

When the gate is executed the flags are set according to cfun, so you
can just combine all warning options for warnings issued by the pass
into the gate.


Re: [Bug ipa/104203] [12 Regressions] huge IPA compile-time regression since r12-6606-g9d6a0f388eb048f8

2022-01-24 Thread Jan Hubicka via Gcc-bugs
So I assume that this is due to new pass_waccess which was added into
early optimizations.  I think this is not really ipa component but
tree-optimize.


Re: [Bug tree-optimization/103195] [12 Regression] tfft2 text grows by 70% with -Ofast since r12-5113-gd70ef65692fced7a

2022-01-18 Thread Jan Hubicka via Gcc-bugs
> So nothing to see?  I guess our unit growth limit doesn't trigger because it's
> a small (benchmark) unit?
Yep, unit growth limits do not apply to very small units.  The ipa-cp
heuristics IMO still need work and should be based on relative speedups rather
than absolute ones for the cutoffs.


Re: [Bug tree-optimization/103989] [12 regression] std::optional and bogus -Wmaybe-unitialized at -Og since r12-1992-g6feb628a706e86eb

2022-01-13 Thread Jan Hubicka via Gcc-bugs
> 
> Sure - I just remember (falsely?) that we finally decided to do it :)

I do not recall this, but I may have forgotten :))

> If we don't run IPA inline we don't figure we failed to inline the
> always_inline either ;)  And IPA inline can expose more indirect
> alywas-inlines we only discover after even more optimization so the
> issue is really moot unless we sorry () (or link-fail).

The problem with the kernel was that it relied on quite complicated indirect
inlining of always_inlines and did not work without it.  At the beginning I
think we should have introduced two attributes - always_inline and
disregard_inline_limits, just like we have internally. Always_inline
should have never allowed public linkage or taking its address, but
it is probably too late to fix that :(

Honza


Re: [Bug tree-optimization/103989] [12 regression] std::optional and bogus -Wmaybe-unitialized at -Og since r12-1992-g6feb628a706e86eb

2022-01-13 Thread Jan Hubicka via Gcc-bugs
> You can not disable an IPA pass becasuse then we will mishandle
> optimize attributes.  I think you simply want to set
> 
> flag_inline_small_functions = 0
> flag_inline_functions_called_once = 0 

Actually I forgot, we have flag_no_inline which makes
tree_inlinable_function_p to return false for everything except for
ALWAYS_INLINE and so we only want to set this one for Og.



Re: [Bug tree-optimization/103989] [12 regression] std::optional and bogus -Wmaybe-unitialized at -Og since r12-1992-g6feb628a706e86eb

2022-01-13 Thread Jan Hubicka via Gcc-bugs
> --- Comment #6 from Richard Biener  ---
> Honza, -Og was supposed to not do so much work, I intended to disable IPA
> inlining but there's no knob for that.  I wonder where to best put such
> guard?  I set flag_inline_small_functions to zero for -Og but we still
> run inline_small_functions ().  Basically -Og was supposed to only do
> early opts and then what is necessary for correct RTL expansion.  Doing
> IPA inlining defeats this :/
> 
> Can you help?  Is it safe to simply gate the inline_small_functions ()
> call?  Do we want an extra -f[no-]ipa-inline like we have -fearly-inlining?
> 
> Using -fdisable-ipa-inline gets rid of the diagnostic

You can not disable an IPA pass because then we will mishandle
optimize attributes.  I think you simply want to set

flag_inline_small_functions = 0
flag_inline_functions_called_once = 0 

and we should only inline always_inlines. inline_small_functions will
still loop and check inlinability of functions but if everything is
compiled with -Og it will not find anything inlinable and exit.

Perhaps we may also extend initialize_inline_failed to add
CIF_DEBUG_OPTIMIZE so -Winline does say something more useful than
"function not considered"

Honza


Re: [Bug rtl-optimization/98782] [11/12 Regression] Bad interaction between IPA frequences and IRA resulting in spills due to changes in BB frequencies

2022-01-11 Thread Jan Hubicka via Gcc-bugs
on zen2 and 3 with -flto the speedup seems to be cca 12% for both -O2
and -Ofast -march=native which is both very nice!
Zen1 for some reason sees less improvement, about 6%.
With PGO it is 3.8%

Overall it seems a win, but there are few noteworthy issues.

I also see a 6.69% regression on x64 with -Ofast -march=native -flto
https://lnt.opensuse.org/db_default/v4/SPEC/graph?plot.0=475.377.0
and perhaps 3-5% on sphinx
https://lnt.opensuse.org/db_default/v4/SPEC/graph?plot.0=476.280.0
https://lnt.opensuse.org/db_default/v4/SPEC/graph?plot.0=227.280.0

For non-SPEC benchmarks there is a regression on nbench
https://lnt.opensuse.org/db_default/v4/CPP/graph?plot.0=26.645.1
There are also large changes in tsvc
https://lnt.opensuse.org/db_default/v4/CPP/latest_runs_report
it may be noise since kernels are tiny, but for example x293 reproduces
both on kabylake and zen by about 80-90% regression that may be easy to
track (the kernel is included in the testsuite). Same regression is not
seen on zen3, so may be an ISA specific or so.

Finally, there seem to be relatively large code size savings on polyhedron
benchmarks today (8% on capacita, 

Thanks a lot!


Re: [Bug gcov-profile/103652] Producing profile with -O2 -flto and trying to consume it with -O3 -flto leads to ICEs on indirect call profiling

2021-12-13 Thread Jan Hubicka via Gcc-bugs
> 
> Well, I'm specifically speaking about:
> error: the control flow of function ‘BZ2_compressBlock’ does not match its
> profile data (counter ‘arcs’) 
> 
> this type of errors should not happen even in a multi-threaded programs.

There are some cases where I see even those on clang build - I am not
sure how that happens (if it is configury difference or generated code
or gcc bug) It is on my TODO to analyse...

In any case we should never ICE on malformed gcda files. Especially not
by buffer overflow :)
> 
> > I think you can produce testcase easily by making a function with one
> > indirect call for train run and many indirect calls in profile-use run.
> > 
> > I have patch to avoid the buffer overflow - can send it after getting to
> > office.
> 
> Sure, please send it.
Attached.

Honza
diff --git a/gcc/coverage.c b/gcc/coverage.c
index 7f8b532cb52..49c370cb8c8 100644
--- a/gcc/coverage.c
+++ b/gcc/coverage.c
@@ -296,7 +296,7 @@ read_counts_file (void)
 
 gcov_type *
 get_coverage_counts (unsigned counter, unsigned cfg_checksum,
-unsigned lineno_checksum, unsigned int n_counts)
+unsigned lineno_checksum, unsigned int *n_counts)
 {
   counts_entry *entry, elt;
 
@@ -348,12 +348,12 @@ get_coverage_counts (unsigned counter, unsigned 
cfg_checksum,
   if (entry->cfg_checksum != cfg_checksum
   || (counter != GCOV_COUNTER_V_INDIR
  && counter != GCOV_COUNTER_V_TOPN
- && entry->n_counts != n_counts))
+ && entry->n_counts != *n_counts))
 {
   static int warned = 0;
   bool warning_printed = false;
 
-  if (entry->n_counts != n_counts)
+  if (entry->n_counts != *n_counts)
warning_printed =
  warning_at (DECL_SOURCE_LOCATION (current_function_decl),
  OPT_Wcoverage_mismatch,
@@ -361,7 +361,7 @@ get_coverage_counts (unsigned counter, unsigned 
cfg_checksum,
  "does not match "
  "its profile data (counter %qs, expected %i and have %i)",
  current_function_decl,
- ctr_names[counter], entry->n_counts, n_counts);
+ ctr_names[counter], entry->n_counts, *n_counts);
   else
warning_printed =
  warning_at (DECL_SOURCE_LOCATION (current_function_decl),
@@ -404,9 +404,25 @@ get_coverage_counts (unsigned counter, unsigned 
cfg_checksum,
  current_function_decl);
 }
 
+  *n_counts = entry->n_counts;
   return entry->counts;
 }
 
+/* Returns the counters for a particular tag and verifies that counts matches
+   the expectation.  */
+
+gcov_type *
+get_coverage_counts (unsigned counter, unsigned cfg_checksum,
+unsigned lineno_checksum, unsigned int n_counts)
+{
+  unsigned int n_counts2 = n_counts;
+  gcov_type *ret
+ = get_coverage_counts (counter, cfg_checksum,
+lineno_checksum, &n_counts2);
+  gcc_assert (!ret || n_counts2 == n_counts);
+  return ret;
+}
+
 /* Allocate NUM counters of type COUNTER. Returns nonzero if the
allocation succeeded.  */
 
diff --git a/gcc/coverage.h b/gcc/coverage.h
index 22646d439fc..7f488811a4e 100644
--- a/gcc/coverage.h
+++ b/gcc/coverage.h
@@ -54,6 +54,10 @@ extern gcov_type *get_coverage_counts (unsigned /*counter*/,
   unsigned /*cfg_checksum*/,
   unsigned /*lineno_checksum*/,
   unsigned /*n_counts*/);
+extern gcov_type *get_coverage_counts (unsigned /*counter*/,
+  unsigned /*cfg_checksum*/,
+  unsigned /*lineno_checksum*/,
+  unsigned */*n_counts*/);
 
 extern tree get_gcov_type (void);
 extern bool coverage_node_map_initialized_p (void);
diff --git a/gcc/profile.c b/gcc/profile.c
index d4103058fcd..0fe0910c296 100644
--- a/gcc/profile.c
+++ b/gcc/profile.c
@@ -898,7 +898,7 @@ compute_value_histograms (histogram_values values, unsigned 
cfg_checksum,
   histogram_counts[t] = get_coverage_counts (COUNTER_FOR_HIST_TYPE (t),
 cfg_checksum,
 lineno_checksum,
-n_histogram_counters[t]);
+&n_histogram_counters[t]);
   if (histogram_counts[t])
any = 1;
   act_count[t] = histogram_counts[t];
@@ -918,20 +918,47 @@ compute_value_histograms (histogram_values values, 
unsigned cfg_checksum,
   /* TOP N counter uses variable number of counters.  */
   if (topn_p)
{
- unsigned total_size;
+ gcov_type total_size;
+ bool ignore = false;
  if (act_count[t])
-   total_size = 2 + 2 * act_count[t][1];
+   {
+ total_size = 2 + 2 * act_count[t][1];
+ /* Watch for counter corruption

Re: [Bug tree-optimization/103168] Value numbering for PRE of pure functions can be improved

2021-11-22 Thread Jan Hubicka via Gcc-bugs
The patch passed testing on x86_64-linux.


Re: [Bug tree-optimization/103168] Value numbering for PRE of pure functions can be improved

2021-11-22 Thread Jan Hubicka via Gcc-bugs
This is a slightly modified patch I am testing.  I added pre-computation of the
number of accesses, enabled the path for const functions (in case they
have memory operand), initialized alias sets and clarified the logic
around every_* and global_memory_accesses

PR tree-optimization/103168
(modref_summary::finalize): Initialize load_accesses.
* ipa-modref.h (struct modref_summary): Add load_accesses.
* tree-ssa-sccvn.c (visit_reference_op_call): Use modref
info to walk the virtual use->def chain to CSE pure
function calls.

* g++.dg/tree-ssa/pr103168.C: New testcase.

diff --git a/gcc/ipa-modref.c b/gcc/ipa-modref.c
index 4f9323165ea..595eb6e0d8f 100644
--- a/gcc/ipa-modref.c
+++ b/gcc/ipa-modref.c
@@ -725,6 +727,23 @@ modref_summary::finalize (tree fun)
break;
}
 }
+  if (loads->every_base)
+load_accesses = 1;
+  else
+{
+  load_accesses = 0;
+  for (auto base_node : loads->bases)
+   {
+ if (base_node->every_ref)
+   load_accesses++;
+ else
+   for (auto ref_node : base_node->refs)
+ if (ref_node->every_access)
+   load_accesses++;
+ else
+   load_accesses += ref_node->accesses->length ();
+   }
+}
 }
 
 /* Get function summary for FUNC if it exists, return NULL otherwise.  */
diff --git a/gcc/ipa-modref.h b/gcc/ipa-modref.h
index f868eb6de07..a7937d74945 100644
--- a/gcc/ipa-modref.h
+++ b/gcc/ipa-modref.h
@@ -53,6 +53,8 @@ struct GTY(()) modref_summary
 
   /* Flags coputed by finalize method.  */
 
+  /* Total number of accesses in loads tree.  */
+  unsigned int load_accesses;
   /* global_memory_read is not set for functions calling functions
  with !binds_to_current_def which, after interposition, may read global
  memory but do nothing useful with it (except for crashing if some
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr103168.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr103168.C
new file mode 100644
index 000..82924a3e3ce
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr103168.C
@@ -0,0 +1,24 @@
+// { dg-do compile }
+// { dg-options "-O2 -fdump-tree-fre1-details" }
+
+struct a
+{
+  int a;
+  static __attribute__ ((noinline))
+  int ret (int v) {return v;}
+
+  __attribute__ ((noinline))
+  int inca () {return a++;}
+};
+
+int
+test()
+{
+  struct a av;
+  av.a=1;
+  int val = av.ret (0) + av.inca();
+  av.a=2;
+  return val + av.ret(0) + av.inca();
+}
+
+/* { dg-final { scan-tree-dump-times "Replaced a::ret" 1 "fre1" } } */
diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c
index 149674e6a16..719f5184654 100644
--- a/gcc/tree-ssa-sccvn.c
+++ b/gcc/tree-ssa-sccvn.c
@@ -71,6 +71,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-loop-niter.h"
 #include "builtins.h"
 #include "fold-const-call.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
 #include "tree-ssa-sccvn.h"
 
 /* This algorithm is based on the SCC algorithm presented by Keith
@@ -5084,12 +5086,118 @@ visit_reference_op_call (tree lhs, gcall *stmt)
   struct vn_reference_s vr1;
   vn_reference_t vnresult = NULL;
   tree vdef = gimple_vdef (stmt);
+  modref_summary *summary;
 
   /* Non-ssa lhs is handled in copy_reference_ops_from_call.  */
   if (lhs && TREE_CODE (lhs) != SSA_NAME)
 lhs = NULL_TREE;
 
   vn_reference_lookup_call (stmt, &vnresult, &vr1);
+
+  /* If the lookup did not succeed for pure functions try to use
+ modref info to find a candidate to CSE to.  */
+  const int accesses_limit = 8;
+  if (!vnresult
+  && !vdef
+  && lhs
+  && gimple_vuse (stmt)
+  && (((summary = get_modref_function_summary (stmt, NULL))
+  && !summary->global_memory_read
+  && summary->load_accesses < accesses_limit)
+ || gimple_call_flags (stmt) & ECF_CONST))
+{
+  /* First search if we can do someting useful and build a
+vector of all loads we have to check.  */
+  bool unknown_memory_access = false;
+  auto_vec<ao_ref> accesses;
+
+  if (summary)
+   {
+ for (auto base_node : summary->loads->bases)
+   if (unknown_memory_access)
+ break;
+   else for (auto ref_node : base_node->refs)
+ if (unknown_memory_access)
+   break;
+ else for (auto access_node : ref_node->accesses)
+   {
+ accesses.quick_grow (accesses.length () + 1);
+ if (!access_node.get_ao_ref (stmt, &accesses.last ()))
+   {
+ /* We could use get_call_arg (...) and initialize
+a ref based on the argument and unknown offset in
+some cases, but we have to get a ao_ref to
+disambiguate against other stmts.  */
+ unknown_memory_access = true;
+ break;
+   }
+ else
+   {
+ 

Re: [Bug driver/100937] configure: Add --enable-default-semantic-interposition

2021-11-22 Thread Jan Hubicka via Gcc-bugs
> (The -fno-semantic-interposition thing is probably the biggest performance gap
> between gcc -fpic and clang -fpic.)
Yep, it is often confusing to users (who do not understand what ELF
interposition is) that clang and gcc disagree on default flags here.
Recently -Ofast was extended to imply -fno-semantic-interposition that
will hopefully make more people notice this.

While doing that I have added per-symbol flag about interposition to the
symbol table, so we can also support 

__attribute__ ((semantic_interposition))

and

__attribute__((no_semantic_interposition))

if that would be useful for something.


Re: [Bug tree-optimization/103300] New: wrong code at -O3 on x86_64-linux-gnu

2021-11-17 Thread Jan Hubicka via Gcc-bugs
Needs -O2  -floop-unroll-and-jam   --param early-inlining-insns=14
to fail, so I guess it may be an issue with unroll-and-jam.


Re: [Bug ipa/103267] Wrong code with ipa-sra

2021-11-16 Thread Jan Hubicka via Gcc-bugs
> @@ -1,4 +1,3 @@
> -static int
>  __attribute__ ((noinline,const))
>  infinite (int p)
>  {
Just for the record, it crashes with or without static int here for me :)

I run across it because the code tracking must access in ipa-sra is IMO
conceptually wrong.  I noticed that because ipa-modref solves similar
problem for kills (both need to verify that given access will always
happen).  The post-dominance check is not enough to verify that because
earlier function calls can do things like EH.  I failed to construct an
actual testcase because on interesting stuff like EH we punt for other
reasons (missed fnspec annotations on EH builtins).  I will play with it
more today.


Re: [Bug ipa/103267] Wrong code with ipa-sra

2021-11-16 Thread Jan Hubicka via Gcc-bugs
Aha, but here is better example (reproduces same way).
In the former one I forgot const attribute which makes it invalid.
The testcase tests that ipa-sra is missing ECF_LOOPING_CONST_OR_PURE
check

static int
__attribute__ ((noinline))
infinite (int p)
{
  if (p)
while (1);
  return p;
}
__attribute__ ((noinline))
static void
test(int p, int *a)
{
  int v = infinite (p);
  if (*a && v)
__builtin_abort ();
}
test2(int *a)
{
  test(0,a);
}
main()
{
  test (1,0);
}


Re: [Bug ipa/103267] Wrong code with ipa-sra

2021-11-16 Thread Jan Hubicka via Gcc-bugs
Works for me even with the 3 warnings.

hubicka@lomikamen:/aux/hubicka/trunk/build-lto2/gcc$ cat >tt.c
__attribute__ ((noinline,const))
infinite (int p)
{
  if (p)
while (1);
  return p;
}
__attribute__ ((noinline))
static void
test(int p, int *a)
{
  int v = infinite (p);
  if (*a && v)
__builtin_abort ();
}
test2(int *a)
{
  test(0,a);
}
main()
{
  test (1,0);
}
hubicka@lomikamen:/aux/hubicka/trunk/build-lto2/gcc$ ./xgcc --version
xgcc (GCC) 12.0.0 2024 (experimental)
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

hubicka@lomikamen:/aux/hubicka/trunk/build-lto2/gcc$ ./xgcc -B ./ -O2 tt.c
tt.c:2:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
2 | infinite (int p)
  | ^~~~
tt.c:16:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
   16 | test2(int *a)
  | ^
tt.c:20:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
   20 | main()
  | ^~~~
hubicka@lomikamen:/aux/hubicka/trunk/build-lto2/gcc$ ./a.out
Segmentation fault



Re: [Bug tree-optimization/103231] New: ICE (nondeterministic) on valid code at -O1 on x86_64-linux-gnu: Segmentation fault

2021-11-14 Thread Jan Hubicka via Gcc-bugs
> [659] % 
> [659] % gcctk -O0 -w small.c
> [660] % 
> [660] % gcctk -O1 -w small.c
> [661] % gcctk -O1 -w small.c
> [662] % gcctk -O1 -w small.c
> gcctk: internal compiler error: Segmentation fault signal terminated program
> cc1
> Please submit a full bug report,
> with preprocessed source if appropriate.
> See <https://gcc.gnu.org/bugs/> for instructions.
Backtrace here would be useful.  It is bit strange that you did not get
it from error message.  One can use -S -wrapper gdb,--args to make the
cc1 executed within gdb.
> [663] %


Re: [Bug ipa/103230] ipa-modref-tree.h:550:33: runtime error: load of value 255, which is not a valid value for type 'bool'

2021-11-14 Thread Jan Hubicka via Gcc-bugs
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103230
> 
> --- Comment #2 from Martin Liška  ---
> > How do you build ubsan compiler?
> 
> F="-O0 -g -fsanitize=undefined" ; make -j16 all-host -k CFLAGS="$F"
> CXXFLAGS="$F"  LDFLAGS="$F"
> 
> is the fastest approach.
Thanks, it is similar to what I tried.  I guess there should be no ";"
but yet it leads to misconfigured libiberty for me on kunlun.  I will
look into that.


Re: [Bug ipa/103230] New: ipa-modref-tree.h:550:33: runtime error: load of value 255, which is not a valid value for type 'bool'

2021-11-14 Thread Jan Hubicka via Gcc-bugs
> Happens with UBSAN compiler for:
> 
> $ gcc gcc/testsuite/gcc.c-torture/execute/pr71494.c -O1  -flto
> ...
> /home/marxin/Programming/gcc/gcc/ipa-modref-tree.h:550:33: runtime error: load
> of value 255, which is not a valid value for type 'bool'
> #0 0x18acc38 in modref_tree::merge(modref_tree*,
> vec*, modref_parm_map*, bool)
> /home/marxin/Programming/gcc/gcc/ipa-modref-tree.h:550
> #1 0x188452c in modref_propagate_in_scc

At 4385 I have:
   changed |= cur_summary_lto->stores->merge
(callee_summary_lto->stores, &parm_map, &chain_map, !first);
 

parm-map is the vector, however there is no read of it.
There is bool which is relevant only when parm_index is not unknown, so
I suspect it may be a full copy with uninitialized bool which would be
harmless. We had similar issues with asan before.

How do you build ubsan compiler?
Honza


Re: [Bug ipa/103211] [12 Regression] 416.gamess crashes after r12-5177-g494bdadf28d0fb35

2021-11-12 Thread Jan Hubicka via Gcc-bugs
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103211
> 
> --- Comment #2 from Martin Liška  ---
> Optimized dump differs for couple of functions in the same way:
> 
> diff -u good bad
> --- good2021-11-12 17:42:36.995947103 +0100
> +++ bad 2021-11-12 17:41:56.728194961 +0100
> @@ -38,7 +38,6 @@
> 
>  ;; Function abrt (abrt_, funcdef_no=10, decl_uid=4338, cgraph_uid=11,
> symbol_order=10) (executed once)
> 
> -Removing basic block 5
>  __attribute__((fn spec (". ")))
>  void abrt ()
>  {
> @@ -350,7 +349,6 @@
>  void setfm (integer(kind=4) * ipar)
>  {
> [local count: 1073741824]:
> -  master.0.setfm (0, ipar_2(D)); [tail call]
>return;
> 
>  }
> 
> maybe the fnspec for master.0.setfm is bad?
> 
> __attribute__((fn spec (". R w ")))
> void master.0.setfm (integer(kind=8) __entry, integer(kind=4) * ipar)
> {
It looks more like pure/const discovery. You should be able to use
-fdump-ipa-all -fdump-tree-all and grep "function found to be" 
either pure or const.

What is the body of master.0.setfm? Does it look like it does nothing?

"R" in fnspec means that arg 0 is only read directly and not dereferenced.
"w" means that arg 1 is not escaping.

Honza


Re: [Bug tree-optimization/103175] [12 Regression] internal compiler error: in handle_call_arg, at tree-ssa-structalias.c:4139

2021-11-11 Thread Jan Hubicka via Gcc-bugs
The sanity check verifies that functions accessing a parameter indirectly
also read the parameter (otherwise the indirect reference can not
happen).  This patch moves the check earlier and removes some overactive
flag cleaning on function call boundary which introduces the nonsensical
situation.  I got a bit paranoid here on how return value relates to
escaping solution.

As discussed on ML, the matmul failure is simply the fact that the testcase
verifies that a missed optimization is still missed.

diff --git a/gcc/ipa-modref.c b/gcc/ipa-modref.c
index 72006251f29..a97021c6c60 100644
--- a/gcc/ipa-modref.c
+++ b/gcc/ipa-modref.c
@@ -1681,6 +1681,13 @@ modref_lattice::merge (int f)
 {
   if (f & EAF_UNUSED)
 return false;
+  /* Check that flags seems sane: if function does not read the parameter
+ it can not access it indirectly.  */
+  gcc_checking_assert (!(f & EAF_NO_DIRECT_READ)
+  || ((f & EAF_NO_INDIRECT_READ)
+  && (f & EAF_NO_INDIRECT_CLOBBER)
+  && (f & EAF_NO_INDIRECT_ESCAPE)
+  && (f & EAF_NOT_RETURNED_INDIRECTLY)));
   if ((flags & f) != flags)
 {
   flags &= f;
@@ -1874,27 +1881,13 @@ modref_eaf_analysis::merge_call_lhs_flags (gcall *call, 
int arg,
argument if needed.  */
 
 static int
-callee_to_caller_flags (int call_flags, bool ignore_stores,
-   modref_lattice )
+callee_to_caller_flags (int call_flags, bool ignore_stores)
 {
   /* call_flags is about callee returning a value
  that is not the same as caller returning it.  */
   call_flags |= EAF_NOT_RETURNED_DIRECTLY
| EAF_NOT_RETURNED_INDIRECTLY;
-  /* TODO: We miss return value propagation.
- Be conservative and if value escapes to memory
- also mark it as escaping.  */
-  if (!ignore_stores && !(call_flags & EAF_UNUSED))
-{
-  if (!(call_flags & EAF_NO_DIRECT_ESCAPE))
-   lattice.merge (~(EAF_NOT_RETURNED_DIRECTLY
-| EAF_NOT_RETURNED_INDIRECTLY
-| EAF_UNUSED));
-  if (!(call_flags & EAF_NO_INDIRECT_ESCAPE))
-   lattice.merge (~(EAF_NOT_RETURNED_INDIRECTLY
-| EAF_UNUSED));
-}
-  else
+  if (ignore_stores)
 call_flags |= ignore_stores_eaf_flags;
   return call_flags;
 }
@@ -2033,15 +2026,9 @@ modref_eaf_analysis::analyze_ssa_name (tree name)
  if (!(call_flags & (EAF_NOT_RETURNED_DIRECTLY
  | EAF_UNUSED)))
m_lattice[index].merge (~(EAF_NO_DIRECT_ESCAPE
- | EAF_NO_INDIRECT_ESCAPE
- | EAF_UNUSED));
- if (!(call_flags & (EAF_NOT_RETURNED_INDIRECTLY
- | EAF_UNUSED)))
-   m_lattice[index].merge (~(EAF_NO_INDIRECT_ESCAPE
  | EAF_UNUSED));
  call_flags = callee_to_caller_flags
-  (call_flags, false,
-   m_lattice[index]);
+  (call_flags, false);
}
  m_lattice[index].merge (call_flags);
}
@@ -2057,8 +2044,7 @@ modref_eaf_analysis::analyze_ssa_name (tree name)
  !(call_flags & EAF_NOT_RETURNED_DIRECTLY),
  !(call_flags & EAF_NOT_RETURNED_INDIRECTLY));
  call_flags = callee_to_caller_flags
-  (call_flags, ignore_stores,
-   m_lattice[index]);
+  (call_flags, ignore_stores);
  if (!(ecf_flags & (ECF_CONST | ECF_NOVOPS)))
m_lattice[index].merge (call_flags);
}
@@ -2082,8 +2068,7 @@ modref_eaf_analysis::analyze_ssa_name (tree name)
if (!(ecf_flags & (ECF_CONST | ECF_NOVOPS)))
  {
call_flags = callee_to_caller_flags
-(call_flags, ignore_stores,
- m_lattice[index]);
+(call_flags, ignore_stores);
if (!record_ipa)
  m_lattice[index].merge (call_flags);
else
@@ -2105,8 +2090,7 @@ modref_eaf_analysis::analyze_ssa_name (tree name)
else
  {
call_flags = callee_to_caller_flags
-(call_flags, ignore_stores,
- m_lattice[index]);
+(call_flags, ignore_stores);
if (!record_ipa)
  

Re: [Bug middle-end/102997] [12 Regression] 45% 454.calculix regression with LTO+PGO -march=native -Ofast on Zen since r12-4526-gd8edfadfc7a9795b65177a50ce44fd348858e844

2021-11-08 Thread Jan Hubicka via Gcc-bugs
Note that it still seems to me that the crossed_loop_header handling is
overly conservative.  We have:

@ -2771,6 +2771,7 @@ jt_path_registry::cancel_invalid_paths 
(vec )
   bool seen_latch = false;
   int loops_crossed = 0;
   bool crossed_latch = false;
+  bool crossed_loop_header = false;
   // Use ->dest here instead of ->src to ignore the first block.  The
   // first block is allowed to be in a different loop, since it'll be
   // redirected.  See similar comment in profitable_path_p: "we don't
@@ -2804,6 +2805,14 @@ jt_path_registry::cancel_invalid_paths 
(vec )
  ++loops_crossed;
}
 
+  // ?? Avoid threading through loop headers that remain in the
+  // loop, as such threadings tend to create sub-loops which
+  // _might_ be OK ??.
+  if (e->dest->loop_father->header == e->dest
+ && !flow_loop_nested_p (exit->dest->loop_father,
+ e->dest->loop_father))
+   crossed_loop_header = true;
+
   if (flag_checking && !m_backedge_threads)
gcc_assert ((path[i]->e->flags & EDGE_DFS_BACK) == 0);
 }
@@ -2829,6 +2838,21 @@ jt_path_registry::cancel_invalid_paths 
(vec )
   cancel_thread (&path, "Path crosses loops");
   return true;
 }
+  // The path should either start and end in the same loop or exit the
+  // loop it starts in but never enter a loop.  This also catches
+  // creating irreducible loops, not only rotation.
+  if (entry->src->loop_father != exit->dest->loop_father
+  && !flow_loop_nested_p (exit->src->loop_father,
+ entry->dest->loop_father))
+{
+  cancel_thread (&path, "Path rotates loop");
+  return true;
+}
+  if (crossed_loop_header)
+{
+  cancel_thread (&path, "Path crosses loop header but does not exit it");
+  return true;
+}
   return false;
 }
 
If I read it correctly, for a path that enters the loop and later leaves
it (where threading is desirable since we skip the whole loop) the logic
above will still return true (after finishing the whole walk which seems
like a waste).

This may trigger more often at -Os since we limit loop header copying.

And indeed, fixing profile updating would be nice.  Why the updating
code is not reused across different threaders?  (I wrote several thread
updating functions for various threaders introduced & removed in the
past and I wonder why we need to keep reinventing it)


Re: [Bug tree-optimization/102943] [12 Regression] Jump threader compile-time hog with 521.wrf_r

2021-11-07 Thread Jan Hubicka via Gcc-bugs
> 
> This PR is still open, at least for slowdown in the threader with LTO.  The
> issue is ranger wide, so it may also cause slowdowns  on non-LTO builds for
> WRF, though I haven't checked.
I just wanted to record the fact somewhere since I was looking up the
revision range mostly to figure out if there was modref change that may
cause this.

Non-LTO builds seem fine.  I suppose LTO is needed to make big enough
CFGs.  Thanks for looking into it.

Honza


Re: [Bug tree-optimization/102943] [12 Regression] Jump threader compile-time hog with 521.wrf_r

2021-11-04 Thread Jan Hubicka via Gcc-bugs
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102943
> 
> Aldy Hernandez  changed:
> 
>What|Removed |Added
> 
>  Depends on||103058
> 
> --- Comment #18 from Aldy Hernandez  ---
> 251.wrf_r is no longer building.  Seems to be the same issue in PR103058.
> 
> during GIMPLE pass: alias
> module_fr_fire_phys.fppized.f90: In function 'init_fuel_cats':
> module_fr_fire_phys.fppized.f90:136:25: internal compiler error: in
> gimple_call_static_chain_flags, at gimple.c:1669
>   136 | subroutine init_fuel_cats
>   | ^
> 0x6957b5 gimple_call_static_chain_flags(gcall const*)
> /home/aldyh/src/clean/gcc/gimple.c:1669

I have committed a workaround for this.
However here it looks like a frontend issue - I do not think Fortran
should produce nested functions with external linkage. At least there
seems to be no good reason for doing so since they can not be called
cross-module.

Honza


Re: [Bug d/103040] [12 Regression] gdc.dg/torture/pr101273.d FAILs

2021-11-02 Thread Jan Hubicka via Gcc-bugs
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103040
> 
> --- Comment #15 from Iain Buclaw  ---
> Got it. The difference between D and C++ is a matter of early inlining.
> 
> The C++ example Jakub posted fails in the same way that D does if you compile
> with: -O1 -fno-inline
Great, I will take a look now (I was travelling, that is why I did not
start earlier)

Honza


Re: [Bug d/103040] [12 Regression] gdc.dg/torture/pr101273.d FAILs

2021-11-02 Thread Jan Hubicka via Gcc-bugs
> See above comments from Iain, even if that pre-initialization is removed it is
> still miscompiled.  And, the testcase fails not because of the padding bits 
> not
> being zero, but because the address of self stored into one of the fields 
> isn't
> there or modref thinks it can't be changed or set to that.  But for
> corresponding C++ it handles it ok.
Perhaps TREE_ADDRESSABLE on the type which is being used to test whether
return slot pointer may escape.


Re: [Bug middle-end/102997] [12 Regression] 45% 454.calculix regression with LTO+PGO -march=native -Ofast between ce4d1f632ff3f680550d3b186b60176022f41190 and 6fca1761a16c68740f875fc487b98b6bde8e9be7

2021-10-29 Thread Jan Hubicka via Gcc-bugs
> Not seen on Haswell (but w/o PGO).  Is this PGO specific?  There's another
> large jump visible end of 2019.
It is between 2019-11-15 and 18 but the revisions do not exist in git
- perhaps they refer to the old git mirror. Martin will know better.

In that range there are many of Richard's vectorizer changes and my
patch fixing calculation of ref time in inliner which may be culprits.

Honza


Re: [Bug ipa/102982] [12 Regression] Dead Code Elimination Regression at -O3 (trunk vs 11.2.0)

2021-10-28 Thread Jan Hubicka via Gcc-bugs
> 
> fixup_cfg already removes write-only stores so that seems fit for that
> purpose.
> 
> Btw,
> 
> static int x = 1;
> 
> int main()
> {
>   x = 1;
> }
> 
> should ideally be handled as well as maybe the more common(?)
> 
> static int x[128];
> 
> int main()
> {
>   memset (x, 0, 128*4);
> }
> 
> so we'd like to store a (constant) RHS for the stores in the summaries?
> (I suppose we cannot selectively stream in a single stmt ;))

Yep, what we want is to have way to attach summaries to particular
IPA_REF_load/store/addr just like we annotate call sites...
It would be nice to extend symbol-summary.h for that.  Martin, would you
be interested to look into it?

Honza


Re: [Bug tree-optimization/102446] [9/10/11/12 Regression] wrong code at -O3 on x86_64-linux-gnu

2021-09-22 Thread Jan Hubicka
> Started with r5-6477-g3620b606822f80863488ca4883542d848d41f9f9
This only affects early inlining decisions, so it may be useful to
bisect this with --param early-inlining-insns=14

Honza


Re: [Bug lto/99898] Possible LTO object incompatibility on gcc-10 branch

2021-04-06 Thread Jan Hubicka
> Any *.opt changes can break the streaming of optimization or target option
> nodes.
> And from experience with gcc plugins we have such changes ~ each month even on
> release branches.
It may make sense to add a simple test to our regular testers that
either the new revision can consume old object files or the version was
updated :)

Honza


Re: [Bug ipa/99835] missed optimization for dead code elimination at -O3 (vs. -O1)

2021-03-31 Thread Jan Hubicka
> At -O3 the unused 'c' remains.  Likely different (recursive?) inlining makes 
> us
> process a cgraph cycle in different order and thus fail to elide the output
> of 'c' (it's output first at -O3).
> 
> Fixing that would need processing cgraph SCCs with an extra IPA phase in main
> optimization so we get a chance to do extra node removal (maybe order
> the cycles so that functions we can elide - aka static ones - are processed
> last).
That would tamper with optimizations that propagate from callee to
caller during late optimization, like IPA register allocation, stack
alignment propagation or late pure/const discovery.

Honza


Re: [Bug bootstrap/98338] [10/11 Regression] profiledbootstrap failure on x86_64-linux

2021-02-26 Thread Jan Hubicka
> FYI, I have today bootstrapped it as well in rpm build on
> {x86_64,i686,powerpc64le}-linux, both your patch and just trunk without the
> workaround I've been using before.  The latter failed to bootstrap on i686
> and passed it on x86_64 and powerpc64le, the former passed bootstrap on all
> arches.  make check is still ongoing...
Good, so it fixes i686 bootstrap, right?

Honza


Re: [Bug gcov-profile/99105] profile streaming scales poorly to projects with many source files

2021-02-15 Thread Jan Hubicka
> Ah, yeah, that will make a big difference.
> So clang is using 'make check', running a test-suite for a PGO build, right?
It uses 
make check-llvm
make check-clang
and then it rebuilds whole llvm with the instrumented compiler.

Honza


Re: [Bug gcov-profile/99105] profile streaming scales poorly to projects with many source files

2021-02-15 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99105
> 
> --- Comment #8 from Martin Liška  ---
> This is what I see for GCC PGO in train stage. It's from perf top:
> 
>4.33%  cc1plus  [.] 
> __gcov_indirect_call_profiler_v4
>  ◆
>2.28%  cc1plus  [.] __gcov_topn_values_profiler
>  
>  ▒
>0.85%  cc1plus  [.] ggc_internal_alloc 
>  

Yep, this is usual profile I see.  Perhaps you want to try profile "make check"
> 
> In the case of GCC, we emit 500 .gcda files.
> 
> @Honza: Can you please test my patch that uses glibc buffered I/O if it helps?

I can give it a try later this week (I would like to collect some data on 
performance first)

Honza


Re: [Bug gcov-profile/99105] profile streaming scales poorly to projects with many source files

2021-02-15 Thread Jan Hubicka
> A small improvement can be achieved by the removal of libgcov I/O buffering:
> https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=5a17015c096012b9e43a8dd45768a8d5fb3a3aee

So it effectively replaces gcov's own buffered I/O by stdio.  First I am
not sure how safe it is (as we had a lot of fun about using malloc) and
also it adds dependency on stdio that is not necessarily good idea for
embedded targets. Not sure how often it is used there.

But why glibc stdio is more effective? Is it because our buffer size of
1k is way too small (as it seems juding from the profile that is
dominated by fread calls rather than open/lock/close)?
> 
> But the key thing is likely the ability to omit profile modifications
> (read/modify/write) for parts of a binary that are not trained.
Problem there are the per-program summaries that needs to be updated
even for files never visited.

It seems that producing one file with tar-like format that can be
expanded to gcda files by gcov-tool would be good idea. Even if we need
to lock whole file it is probably faster than a lot of small I/Os.
To avoid waiting for lock one can simply allow multiple profile files to
be created and teach libgcov to acquire unlocked file in pseudorandom
order.

Honza


Re: [Bug middle-end/99097] profiledbootstrap fails with LTO and disabled plugin

2021-02-15 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99097
> 
> --- Comment #5 from Martin Liška  ---
> (In reply to Jan Hubicka from comment #3)
> > > I've just tried to reproduce it:
> > > ../configure --with-build-config=bootstrap-lto --enable-checking=release
> > > --disable-plugin
> > > 
> > > But the build is fine for me.
> 
> Ah, my bad, I used BFD.
> 
> > On our dhcp230 (zen III machine) it works if you make system linker ld,
> > if system linker is gold (from tumbleweed) it fails
> > 
> > GNU gold (GNU Binutils 2.36.50.20210211) 1.16
> 
> Note that this isn't ld.gold from openSUSE Tumbleweed. We still have 2.35.1
> release in the distribution. This must be a built gold from source files?

Ah, you are right. It is binutils trunk (unpatched).   Will figure out
what is going on here.

Honza


Re: [Bug middle-end/99097] profiledbootstrap fails with LTO and disabled plugin

2021-02-15 Thread Jan Hubicka
> I've just tried to reproduce it:
> ../configure --with-build-config=bootstrap-lto --enable-checking=release
> --disable-plugin
> 
> But the build is fine for me.
On our dhcp230 (zen III machine) it works if you make system linker ld,
if system linker is gold (from tumbleweed) it fails

GNU gold (GNU Binutils 2.36.50.20210211) 1.16
Copyright (C) 2021 Free Software Foundation, Inc.
This program is free software; you may redistribute it under the terms of
the GNU General Public License version 3 or (at your option) a later version.
This program has absolutely no warranty.

I will rebuild and produce resolution files, looks like linker bug to
me.

Honza


Re: [Bug c++/98330] [9/10/11 Regression] ICE in compute_parm_map, at ipa-modref.c:2900 since r9-2640-g3d78e00879b42574

2021-01-19 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98330
> 
> --- Comment #4 from Richard Biener  ---
> So modref allocates a fnspec_summary for an unknown indirect call (NULL 
> callee)
> but then in compute_parm_map calls function_or_virtual_thunk_symbol on
> that NULL callee unconditionally.  We have a meaningful fnspec for the
> call because the call type has a 'fn spec' attribute attached.
> 
> So I'm proposing the following which avoids the ICE (and undefined behavior
> calling a member fn on a NULL object)

That looks OK.  We do not expect fnspecs on types of indirect calls (we
probably should, I just did not expected them to exist) but then we can
always recover them from type.  I suppose we do not need per-call-stmt
sensitive fnspec attributes on indirect calls, right?

Honza


Re: [Bug c++/91241] [8/9/10/11 Regression] internal compiler error: symtab_node::verify failed

2020-12-07 Thread Jan Hubicka
> @Marek: The callgraph checking error is correct.
> If you disable it, you will likely see duplicate assembler names in GAS. And
> that's the error that 2 symbol names clash.
Indeed, there are two lambdas, but I think C++ FE should assign them
different symbol names.

Honza


Re: [Bug c/97172] [11 Regression] ICE: tree code ‘ssa_name’ is not supported in LTO streams since r11-3303-g6450f07388f9fe57

2020-12-01 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97172
> 
> --- Comment #18 from Martin Sebor  ---
> Let me explain how this works.  The VLA bounds in function parameters are used
> in two ways:
> 1) in the front end, to check function redeclarations involving arrays and 
> VLAs
> for equivalence,
> 2) in the middle end, to check function calls for out of bounds accesses.
> 
> As an example of (1) consider the following declarations of function f():
> 
>   void f (int X, int, int A[X], int B[foo()]);
> and
>   void f (int, int J, int A[J], int B[foo() + 1]);
> 
> The bounds in the parameters A and B are different and we'd like them
> diagnosed.  The bound X is the first parameter in the first declaration of f
> but J is the second parameter in the second f().  Because the bounds are also
> parameters, we use their positions in the argument list to determine that they
> don't match.
> 
> Likewise, the bound foo() in B is different from foo() + 1, but because 
> neither
> is a parameter the only way to determine whether they match is by comparing
> them for equivalence.  The code uses operand_equal_p(..., OEP_LEXICOGRAPHIC).
> 
> (2) is done only for bounds that are parameters.  Other bounds are not used 
> for
> anything, but they're still stored in the attributes so they can be compared 
> in
> the redeclarations.
> 
> Since the "complex" bounds aren't used after the front end is done with them,
> unless there's a way to remove them at some point after the front end is done
> (or set them to NULL or something), the LTO streaming code could ignore them
> instead of asserting on them.  Alternatively, instead of storing them in their

free_lang_data should be a good place to free them. In general we should
avoid storing things to IL that are not useful to middle end and keep
them there till LTO streaming.  Even if it does not make LTO streaming
to ICE it still consumes memory, disk space and compile time.

Honza
> tree form they could be stored as strings instead.  I list these in the order
> of my preference for GCC 11.


Re: [Bug tree-optimization/97915] New: ICE in get_odr_type, at ipa-devirt.c:1930 in pre

2020-11-19 Thread Jan Hubicka
Hi,
this ought to be fixed by g:0862d007b564eca8c9a48fca0e689dd3f90db828
sorry for the breakage.  OBJ_TYPE_REF in obj-C frontend is odd.


Re: [Bug bootstrap/97857] [11 Regression] profiledbootstrap broken freeing speculative call summary since r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97

2020-11-16 Thread Jan Hubicka
This patch fixes the issue by making the conflict with C type sticky via
clearing the CXX bit.  I checked that it recovers profiledbootstrap,
however I want to look into the code tomorrow a bit more to be sure that
it does not disable more than it should.

Honza

diff --git a/gcc/ipa-utils.h b/gcc/ipa-utils.h
index 880e527c590..91571d8e82a 100644
--- a/gcc/ipa-utils.h
+++ b/gcc/ipa-utils.h
@@ -211,8 +211,6 @@ type_with_linkage_p (const_tree t)
   if (!TYPE_CONTEXT (t))
 return false;
 
-  gcc_checking_assert (TREE_CODE (t) == ENUMERAL_TYPE || TYPE_CXX_ODR_P (t));
-
   return true;
 }
 
diff --git a/gcc/lto/lto-common.c b/gcc/lto/lto-common.c
index 6944c469f89..0a3033c3695 100644
--- a/gcc/lto/lto-common.c
+++ b/gcc/lto/lto-common.c
@@ -415,8 +415,8 @@ gimple_register_canonical_type_1 (tree t, hashval_t hash)
  that we can use to lookup structurally equivalent non-ODR type.
  In case we decide to treat type as unique ODR type we recompute hash based
  on name and let TBAA machinery know about our decision.  */
-  if (RECORD_OR_UNION_TYPE_P (t)
-  && odr_type_p (t) && !odr_type_violation_reported_p (t))
+  if (RECORD_OR_UNION_TYPE_P (t) && odr_type_p (t)
+  && TYPE_CXX_ODR_P (t) && !odr_type_violation_reported_p (t))
 {
   /* Anonymous namespace types never conflict with non-C++ types.  */
   if (type_with_linkage_p (t) && type_in_anonymous_namespace_p (t))
@@ -434,6 +434,7 @@ gimple_register_canonical_type_1 (tree t, hashval_t hash)
   if (slot && !TYPE_CXX_ODR_P (*(tree *)slot))
{
  tree nonodr = *(tree *)slot;
+ gcc_checking_assert (!flag_ltrans);
  if (symtab->dump_file)
{
  fprintf (symtab->dump_file,
diff --git a/gcc/tree-streamer-out.c b/gcc/tree-streamer-out.c
index d7a451cfef4..237c87fbf62 100644
--- a/gcc/tree-streamer-out.c
+++ b/gcc/tree-streamer-out.c
@@ -343,7 +343,15 @@ pack_ts_type_common_value_fields (struct bitpack_d *bp, 
tree expr)
 {
   bp_pack_value (bp, TYPE_TRANSPARENT_AGGR (expr), 1);
   bp_pack_value (bp, TYPE_FINAL_P (expr), 1);
-  bp_pack_value (bp, TYPE_CXX_ODR_P (expr), 1);
+  /* alias_ptr_types_compatible_p relies on fact that during LTO
+ types do not get refined from WPA time to ltrans.  */
+  gcc_checking_assert (!in_lto_p
+  || !TYPE_CANONICAL (expr)
+  || TYPE_CXX_ODR_P (TYPE_CANONICAL (expr))
+ == TYPE_CXX_ODR_P (expr));
+  bp_pack_value (bp, flag_wpa && TYPE_CANONICAL (expr)
+? TYPE_CXX_ODR_P (TYPE_CANONICAL (expr))
+: TYPE_CXX_ODR_P (expr), 1);
 }
   else if (TREE_CODE (expr) == ARRAY_TYPE)
 bp_pack_value (bp, TYPE_NONALIASED_COMPONENT (expr), 1);


Re: [Bug middle-end/97840] [11 regression] Bogus -Wmaybe-uninitialized

2020-11-16 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97840
> 
> --- Comment #14 from Martin Sebor  ---
> Created attachment 49572
>   --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=49572=edit
> Patch under test.
> 
> The attached patch avoids the warning on aarch64.  Let me finish testing it 
> and
> submit it later today.
Great, I was testing an identical patch (and it seems to work for me), so I
will leave it up to you. Thanks!



Re: [Bug bootstrap/97857] [11 Regression] profiledbootstrap broken freeing speculative call summary since r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97

2020-11-16 Thread Jan Hubicka
The checking enabled build ICEs for me at same spot as for you

   0x01475505 <+165>:   punpcklqdq %xmm2,%xmm3
   0x01475509 <+169>:   movaps %xmm3,0x30(%rsp)
   0x0147550e <+174>:   callq  0x10949d0 
::iterator::slide()>
   0x01475513 <+179>:   mov%r12,0x20(%rsp)
   0x01475518 <+184>:   mov%rbx,0x28(%rsp)
   0x0147551d <+189>:   jmp0x1475548 

=> 0x0147551f <+191>:   mov(%r14),%rdi
   0x01475522 <+194>:   cmp$0x1,%rdi
   0x01475526 <+198>:   jbe0x1475535 

   0x01475528 <+200>:   callq  0x1094a30 
   0x0147552d <+205>:   test   %eax,%eax
   0x0147552f <+207>:   je 0x14757c2 

   0x01475535 <+213>:   add$0x8,%r14
   0x01475539 <+217>:   lea0x20(%rsp),%rdi
   0x0147553e <+222>:   mov%r14,0x20(%rsp)
   0x01475543 <+227>:   callq  0x10949d0 
::iterator::slide()>
   0x01475548 <+232>:   mov0x20(%rsp),%r14

So same loop merging slide, also vectorized code around.


Re: [Bug middle-end/97840] [11 regression] Bogus -Wmaybe-uninitialized

2020-11-16 Thread Jan Hubicka
> I agree we should just rename default_is_empty_type to is_empty_type, export
> it, declare in tree.h and use it instead that complicated test.  TYPE_EMPTY_P
> isn't something tree-ssa-uninit.c should care about, that is just whether the
> backend decided it will not be passed at all.
OK, perhaps I can realize this plan and commit tonight so we do not keep
bootstrap blocked since it is a free day tomorrow (I will be hacking, but
perhaps Martin is not?).

Honza


Re: [Bug middle-end/97840] [11 regression] Bogus -Wmaybe-uninitialized

2020-11-16 Thread Jan Hubicka
> Note i686-linux bootstrap is still broken in r11-5062 - the PR97853 error.
Yes, as discussed earlier (but perhaps lost in other comments) we need
a fix for the targetm.calls.empty_record_p (type) divergence. It is not
clear to me if simply calling the default implementation instead of the
rather complicated conditional

  if (TYPE_EMPTY_P (rhstype)
  || (RECORD_OR_UNION_TYPE_P (rhstype)
  && (!first_field (rhstype)
  || default_is_empty_record (rhstype
return NULL_TREE;

is desired here, so hope Martin Sebor will know.  Perhaps simply
exporting default_is_empty_type and doing 

  if (default_is_empty_type (rhstype))
return NULL_TREE;



Re: [Bug bootstrap/97857] [11 Regression] profiledbootstrap broken freeing speculative call summary since r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97

2020-11-16 Thread Jan Hubicka
It seems to crash on quite a few locations but always related to indirect
calls.  So perhaps there is some sort of weird relation to indirect call
profiling or devirtualization...

I am going to move my build to faster machine.
Honza


Re: [Bug bootstrap/97857] [11 Regression] profiledbootstrap broken freeing speculative call summary since r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97

2020-11-16 Thread Jan Hubicka
> > Yep, I already worked out it is ipa-icf...
> > Do you have easy way to bisect what merge is causing the failure?
> 
> Working on that will send details soon.
Great, thanks.  In meantime I will check if I can isolate one of the paths
(constant access merging, variable access merging on the two other
changes).

Honza


Re: [Bug bootstrap/97857] [11 Regression] profiledbootstrap broken freeing speculative call summary since r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97

2020-11-16 Thread Jan Hubicka
> I see a similar bootstrap failure that's with:
> 
> ../configure --enable-languages=c,c++,lto --prefix=/home/marxin/bin/gcc
> --disable-multilib --without-isl --disable-libsanitizer
> --with-build-config=bootstrap-lto-lean && make profiledbootstrap
> 'STAGE1_CFLAGS=-g -O2'
> 
> started with r11-4987-g602c6cfc79ce4ae61e277107e0a60079c1a93a97.

Yep, I already worked out it is ipa-icf...
Do you have easy way to bisect what merge is causing the failure?
It is odd it reproduces only with profile feedback and not without since
ICF does not care about it very much.

Honza


Re: [Bug ipa/97695] [11 Regression] wrong code at -O3 on x86_64-pc-linux-gnu since r11-4587-gae7a23a3fab74.

2020-11-03 Thread Jan Hubicka
I see you have patch, too :)
However we do not want to copy clone info to every inline clone (since
the body is materialized just once).  The problem is that in case the
offline copy is removed we move clone info to first inline clone and
reshape the tree.  This is quite old code to save little memory for the
extra symbol that I may get rid of because it also triggers quadratic
time issues with tree-inline, but for now we should copy the info
correctly.  Code is not very well structured for this since it first
calls removal hooks and only then looks for replacement so patch is bit
ugly.  I will clean it up incrementally.

gcc/ChangeLog:

2020-11-03  Jan Hubicka  

* cgraph.c (cgraph_edge::redirect_call_stmt_to_callee): Fix ICE with
in dumping code.
(cgraph_node::remove): Save clone info before releasing it and pass it
to unregister.
* cgraph.h (symtab_node::unregister): Add clone_info parameter.
(cgraph_clone::unregister): Likewise.
* cgraphclones.c (cgraph_node::find_replacement): Copy clone info
* symtab-clones.cc (clone_infos_t::duplicate): Remove.
(clone_info::get_create): Simplify.
* symtab.c (symtab_node::unregister): Pass around clone info.
* varpool.c (varpool_node::remove): Update.

diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index 36bdb009bf8..19dfe2be23b 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -1503,14 +1503,13 @@ cgraph_edge::redirect_call_stmt_to_callee (cgraph_edge 
*e)
 
   if (symtab->dump_file)
 {
-
   fprintf (symtab->dump_file, "updating call of %s -> %s: ",
   e->caller->dump_name (), e->callee->dump_name ());
   print_gimple_stmt (symtab->dump_file, e->call_stmt, 0, dump_flags);
   if (callee_info && callee_info->param_adjustments)
callee_info->param_adjustments->dump (symtab->dump_file);
   unsigned performed_len
-   = vec_safe_length (caller_info->performed_splits);
+   = caller_info ? vec_safe_length (caller_info->performed_splits) : 0;
   if (performed_len > 0)
fprintf (symtab->dump_file, "Performed splits records:\n");
   for (unsigned i = 0; i < performed_len; i++)
@@ -1861,12 +1860,19 @@ cgraph_node::release_body (bool keep_arguments)
 void
 cgraph_node::remove (void)
 {
+  bool clone_info_set = false;
+  clone_info *info, saved_info;
   if (symtab->ipa_clones_dump_file && symtab->cloned_nodes.contains (this))
 fprintf (symtab->ipa_clones_dump_file,
 "Callgraph removal;%s;%d;%s;%d;%d\n", asm_name (), order,
 DECL_SOURCE_FILE (decl), DECL_SOURCE_LINE (decl),
 DECL_SOURCE_COLUMN (decl));
 
+  if ((info = clone_info::get (this)) != NULL)
+{
+  saved_info = *info;
+  clone_info_set = true;
+}
   symtab->call_cgraph_removal_hooks (this);
   remove_callers ();
   remove_callees ();
@@ -1878,7 +1884,7 @@ cgraph_node::remove (void)
   force_output = false;
   forced_by_abi = false;
 
-  unregister ();
+  unregister (clone_info_set ? &saved_info : NULL);
   if (prev_sibling_clone)
 prev_sibling_clone->next_sibling_clone = next_sibling_clone;
   else if (clone_of)
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index cd22676ff9e..c87180f1e96 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -631,7 +631,7 @@ protected:
 
   /* Remove node from symbol table.  This function is not used directly, but 
via
  cgraph/varpool node removal routines.  */
-  void unregister (void);
+  void unregister (struct clone_info *);
 
   /* Return the initialization and finalization priority information for
  DECL.  If there is no previous priority information, a freshly
@@ -949,7 +949,7 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public 
symtab_node
 
   /* cgraph node being removed from symbol table; see if its entry can be
replaced by other inline clone.  */
-  cgraph_node *find_replacement (void);
+  cgraph_node *find_replacement (struct clone_info *);
 
   /* Create a new cgraph node which is the new version of
  callgraph node.  REDIRECT_CALLERS holds the callers
diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c
index 36ca6477139..a49e58ce279 100644
--- a/gcc/cgraphclones.c
+++ b/gcc/cgraphclones.c
@@ -650,7 +650,7 @@ cgraph_node::create_virtual_clone (vec 
redirect_callers,
 /* callgraph node being removed from symbol table; see if its entry can be
replaced by other inline clone.  */
 cgraph_node *
-cgraph_node::find_replacement (void)
+cgraph_node::find_replacement (clone_info *info)
 {
   cgraph_node *next_inline_clone, *replacement;
 
@@ -690,7 +690,6 @@ cgraph_node::find_replacement (void)
   clones = NULL;
 
   /* Copy clone info.  */
-  clone_info *info = clone_info::get (this);
   if (info)
*clone_info::get_create (next_inline_clone) = *info;
 
diff --git a/gcc/symtab-clones.cc b/gcc/symtab-clones.cc
index 76b86c6496f

Re: [Bug c/97578] ice during IPA pass: inline

2020-11-03 Thread Jan Hubicka
> It needs to refer to the DW_TAG_formal_parameter DIEs, and only the PARM_DECLs
> map to those.
It has a problem with the partitioning (if we call a callee from a different
partition) and also if the callee is compiled before caller (as it
should) we will call cgraph_node::release_body and that will likely
remove pointer to them.


Re: [Bug c/97578] ice during IPA pass: inline

2020-11-01 Thread Jan Hubicka
Hi,
this patch fixes the ICE, though I think we do have a design issue here
while producing debug info across ltrans boundary.

Martin, Jakub: as discussed on IRC it would be nice to add predicate
when the body is really needed and avoid materializing if it is not.
Can you add one?

Something like param_adjustments->need_callee_parm_decls_p ()

Honza

diff --git a/gcc/ipa-inline-transform.c b/gcc/ipa-inline-transform.c
index 4df1b7fb9ee..907a95cac5a 100644
--- a/gcc/ipa-inline-transform.c
+++ b/gcc/ipa-inline-transform.c
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-modref-tree.h"
 #include "ipa-modref.h"
 #include "symtab-thunks.h"
+#include "symtab-clones.h"
 
 int ncalls_inlined;
 int nfunctions_inlined;
@@ -695,6 +696,31 @@ preserve_function_body_p (struct cgraph_node *node)
   return false;
 }
 
+/* tree-inline can not recurse; materialize all function bodie we will need
+   during inlining.  This includes inlined functions, but also called functions
+   with param manipulation because IPA param manipulation attaches debug
+   statements to PARM_DECLs of called clone.  Materialize them if needed.
+
+   FIXME: This is somehwat broken by design because it does not play well
+   with partitioning.  */
+
+static void
+maybe_materialize_called_clones (cgraph_node *node)
+{
+  for (cgraph_edge *e = node->callees; e; e = e->next_callee)
+{
+  clone_info *info;
+
+  if (!e->inline_failed)
+   maybe_materialize_called_clones (e->callee);
+
+  cgraph_node *callee = cgraph_node::get (e->callee->decl);
+  if (callee->clone_of
+ && (info = clone_info::get (callee)) && info->param_adjustments)
+   callee->get_untransformed_body ();
+}
+}
+
 /* Apply inline plan to function.  */
 
 unsigned int
@@ -748,6 +774,7 @@ inline_transform (struct cgraph_node *node)
   ENTRY_BLOCK_PTR_FOR_FN (cfun)->count = node->count;
 }
 
+  maybe_materialize_called_clones (node);
   for (e = node->callees; e; e = next)
 {
   if (!e->inline_failed)


Re: [Bug ipa/97586] [11 Regression] "make check" failures in binutils with -flto since r11-3641-gc34db4b6f8a5d803

2020-10-27 Thread Jan Hubicka
> Hi,
> this is patch that moves updates to WPA time.  Does it work for you?
Actually it won't help, since it updates only non-lto summary.  I am
testing better patch, sorry for that.

Honza


Re: [Bug ipa/97586] [11 Regression] "make check" failures in binutils with -flto since r11-3641-gc34db4b6f8a5d803

2020-10-27 Thread Jan Hubicka
Hi,
this is patch that moves updates to WPA time.  Does it work for you?
Honza


2020-10-27  Jan Hubicka  

* ipa-modref.c (modref_summaries_lto::duplicate): Check that no clones
happens after modref.
(modref_transform): Rename to ...
(update_signature): ... this one.
(pass_ipa_modref::execute): Update all summaries once done.

diff --git a/gcc/ipa-modref.c b/gcc/ipa-modref.c
index 3a70965d156..235d712a986 100644
--- a/gcc/ipa-modref.c
+++ b/gcc/ipa-modref.c
@@ -1080,6 +1080,9 @@ modref_summaries_lto::duplicate (cgraph_node *, 
cgraph_node *,
 modref_summary_lto *src_data,
 modref_summary_lto *dst_data)
 {
+  /* Be sure that no furhter cloning happens after ipa-modref.  If it does
+ we will need to update signatures for possible param changes.  */
+  gcc_checking_assert (!((modref_summaries_lto *)summaries_lto)->propagated);
   dst_data->stores = modref_records_lto::create_ggc
(src_data->stores->max_bases,
 src_data->stores->max_refs,
@@ -1503,14 +1506,14 @@ remap_arguments (vec  *map, modref_records *tt)
 
 /* If signature changed, update the summary.  */
 
-static unsigned int
-modref_transform (struct cgraph_node *node)
+static void
+update_signature (struct cgraph_node *node)
 {
   if (!node->clone.param_adjustments || !optimization_summaries)
-return 0;
+return;
   modref_summary *r = optimization_summaries->get (node);
   if (!r)
-return 0;
+return;
   if (dump_file)
 {
   fprintf (dump_file, "Updating summary for %s from:\n",
@@ -1546,7 +1549,7 @@ modref_transform (struct cgraph_node *node)
   fprintf (dump_file, "to:\n");
   r->dump (dump_file);
 }
-  return 0;
+  return;
 }
 
 /* Definition of the modref IPA pass.  */
@@ -1575,7 +1578,7 @@ public:
  modref_read, /* read_optimization_summary */
  NULL,/* stmt_fixup */
  0,   /* function_transform_todo_flags_start */
- modref_transform,/* function_transform */
+ NULL,/* function_transform */
  NULL)/* variable_transform */
   {}
 
@@ -2137,6 +2140,9 @@ pass_ipa_modref::execute (function *)
 
   modref_propagate_in_scc (component_node);
 }
+  cgraph_node *node;
+  FOR_EACH_FUNCTION (node)
+update_signature (node);
   if (summaries_lto)
 ((modref_summaries_lto *)summaries_lto)->propagated = true;
   ipa_free_postorder_info ();
diff --git a/gcc/testsuite/g++.dg/ipa/devirt-24.C 
b/gcc/testsuite/g++.dg/ipa/devirt-24.C
index eaef1f5b3f8..7b5b806dd05 100644
--- a/gcc/testsuite/g++.dg/ipa/devirt-24.C
+++ b/gcc/testsuite/g++.dg/ipa/devirt-24.C
@@ -37,4 +37,4 @@ C *b = new (C);
   }
 }
 /* { dg-final { scan-ipa-dump-times "Discovered a virtual call to a known 
target" 1 "inline" { xfail *-*-* } } } */
-/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 1 "cp"  } 
} */
+/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 2 "cp"  } 
} */


Re: [Bug lto/97586] [11 Regression] "make check" failures in binutils with -flto since r11-3641-gc34db4b6f8a5d803

2020-10-27 Thread Jan Hubicka
> So the _bfd_safe_read_leb128.constprop removes the first unused argument:
> 
...
> 
> But the analysis is bogus:
> 
> ipa-modref: call to _bfd_safe_read_leb128.constprop/17919 does not clobber 
> ref:
> bytes_read alias sets: 7->7
> 
> The _read is always modified in the function (if it's non-null).

There is code in modref-transform that is supposed to update the
summary.  It produces debug output about it, but to be honest I am not
sure where it will land since we now materialize lazily. Can you do
-fdump-tree-all-details and then grep for "Updating summary for"
and see if the resulting summary makes sense and we do not miss the
update?

Maybe problem is that if we call across partitioning boundary, the
summary update happens in one ltrans while the call happens in different
ltrans?  When partitioning, we may need to update summaries at
stream-out time.

Honza


Re: [Bug c/97445] Some fonctions marked static inline in Linux kernel are not inlined

2020-10-20 Thread Jan Hubicka
Hi,
this implements the heuristics increasing bounds for functions having
__builtin_constant_p on parameters.  Note that for get_order this is
still not enough, since we increase the bound twice if hint applies, so
it goes from 70 to 140 and not to 190 needed, however it will handle
other similar cases.

If hint weight is increased to 300%, so 210 I get:
hubicka@lomikamen-jh:/aux/hubicka/trunk-git/build10/gcc$ ./xgcc -B ./ -O2 
-Winline pipe.i --param inline-heuristics-hint-percent=300
In file included from fs/pipe.c:11:
./include/linux/slab.h: In function ‘alloc_pipe_info’:
./include/linux/slab.h:586:121: warning: inlining failed in call to 
‘kmalloc_array.constprop’: --param max-inline-insns-single limit reached
[-Winline]
./include/linux/slab.h:605:9: note: called from here
./include/linux/slab.h: In function ‘pipe_resize_ring’:
./include/linux/slab.h:586:121: warning: inlining failed in call to 
‘kmalloc_array.constprop’: --param max-inline-insns-single limit reached 
[-Winline]
./include/linux/slab.h:605:9: note: called from here

So the problem only shifts to not inlining kmalloc_array.
(that is why it would be nice to update kernel with the easier
get_order)

However it shows different problem: ipa-cp produces clone of
kmalloc_array since it is always used by constant size, but the clone
does not update the predicates, so we lose track about the parameter
being constant and that is why we optimize out only late.

Martin, I think this is caused by long lasting TODO in
ipa_fn_summary_t::duplicate and probably we should implement it: based
on the known partial assignment of params to constant we should fold the
conditions in predicates.

Indeed with ./xgcc -B ./ -O2 -Winline pipe.i  -fno-ipa-cp --param 
inline-heuristics-hint-percent=300
the warning goes away.  We still need the stronger hint though.
gcc/ChangeLog:

2020-10-20  Jan Hubicka  

PR c/97445
* ipa-fnsummary.c (ipa_dump_hints): Handle
INLINE_HINT_builtin_constant_p.
(ipa_fn_summary::~ipa_fn_summary): Free builtin_constant_p_parms.
(ipa_fn_summary_t::duplicate): Copy builtin_constant_p_parms.
(ipa_dump_fn_summary): Dump builtin_constant_p_parms.
(set_cond_stmt_execution_predicate): Compute builtin_constant_p_parms.
(ipa_call_context::estimate_size_and_time): Set
INLINE_HINT_builtin_constant_p.
(ipa_merge_fn_summary_after_inlining): Merge builtin_constant_p_parms.
(inline_read_section): Stream builtin_constant_p_parms.
(ipa_fn_summary_write): Stream builtin_constant_p_parms.
* ipa-fnsummary.h (enum ipa_hints_vals): Add
INLINE_HINT_builtin_constant_p.
(ipa_fn_summary): Add builtin_constant_p_parms.
* ipa-inline.c (want_inline_small_function_p): Handle
INLINE_HINT_builtin_constant_p.
(edge_badness): Handle INLINE_HINT_builtin_constant_p.

gcc/testsuite/ChangeLog:

2020-10-20  Jan Hubicka  

* gcc.dg/ipa/inlinehint-5.c: New test.


diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index 9e3eda4d3cb..4292f1f5fe7 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -141,6 +141,11 @@ ipa_dump_hints (FILE *f, ipa_hints hints)
   hints &= ~INLINE_HINT_known_hot;
   fprintf (f, " known_hot");
 }
+  if (hints & INLINE_HINT_builtin_constant_p)
+{
+  hints &= ~INLINE_HINT_builtin_constant_p;
+  fprintf (f, " builtin_constant_p");
+}
   gcc_assert (!hints);
 }
 
@@ -751,6 +756,7 @@ ipa_fn_summary::~ipa_fn_summary ()
   vec_free (call_size_time_table);
   vec_free (loop_iterations);
   vec_free (loop_strides);
+  vec_free (builtin_constant_p_parms);
 }
 
 void
@@ -805,6 +811,10 @@ ipa_fn_summary_t::duplicate (cgraph_node *src,
  that are known to be false or true.  */
   info->conds = vec_safe_copy (info->conds);
 
+  if (info->builtin_constant_p_parms)
+info->builtin_constant_p_parms
+= vec_safe_copy (info->builtin_constant_p_parms);
+
   /* When there are any replacements in the function body, see if we can figure
  out that something was optimized out.  */
   if (ipa_node_params_sum && dst->clone.tree_map)
@@ -1066,6 +1076,13 @@ ipa_dump_fn_summary (FILE *f, struct cgraph_node *node)
fprintf (f, " inlinable");
  if (s->fp_expressions)
fprintf (f, " fp_expression");
+ if (s->builtin_constant_p_parms)
+   {
+ fprintf (f, " builtin_constant_p_parms");
+ for (unsigned int i = 0;
+  i < s->builtin_constant_p_parms->length (); i++)
+   fprintf (f, " %i", (*s->builtin_constant_p_parms)[i]);
+   }
  fprintf (f, "\n  global time: %f\n", s->time.to_double ());
  fprintf (f, "  self size:   %i\n", ss->self_size);
  fprintf (f, "  global size:   

Re: [Bug c/97445] Some fonctions marked static inline in Linux kernel are not inlined

2020-10-20 Thread Jan Hubicka
> 
> Original asm is:
> 
> __attribute__ ((noinline))
> int fls64(__u64 x)
> {
>  int bitpos = -1;
>  asm("bsrq %1,%q0"
>  : "+r" (bitpos)
>  : "rm" (x));
>  return bitpos + 1;
> }
> 
> There seems to be bug in bsr{q} pattern.  I can make GCC produce same
> code with:
> 
> __attribute__ ((noinline))
> int
> my_fls64 (__u64 x)
> {
>   asm volatile ("movl $-1, %eax");
>   return (__builtin_clzll (x) ^ 63) + 1;
> }

Aha, bsr is not doing anything if parameter is 0, so pattern is correct
(just the instruction is undefined for 0 which makes sense).
But with that pattern GCC can't synthetize the code sequence above :)

Honza


Re: [Bug c/97445] Some fonctions marked static inline in Linux kernel are not inlined

2020-10-20 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97445
> 
> --- Comment #33 from Jakub Jelinek  ---
> (In reply to Jan Hubicka from comment #32)
> > get_order is a wrapper around ffs64.  This can be implemented w/o asm
> > statement as follows:
> > int
> > my_fls64 (__u64 x)
> > {
> >   if (!x)
> >   return 0;
> >   return 64 - __builtin_clzl (x);
> > }
> > 
> > This results in longer assembly than the kernel asm implementation. If
> > that matters I would replace builtin_constnat_p part of get_order by this
> > implementation that is more transparent to the code size estimation and
> > things will get inlined.
> 
> Better __builtin_clzll so that it works also on 32-bit arches.
> Anyway, if kernel's fls64 results in better code than the my_fls64, we should
> look at GCC's code generation for that case.

Original asm is:

__attribute__ ((noinline))
int fls64(__u64 x)
{
 int bitpos = -1;
 asm("bsrq %1,%q0"
 : "+r" (bitpos)
 : "rm" (x));
 return bitpos + 1;
}

There seems to be bug in bsr{q} pattern.  I can make GCC produce same
code with:

__attribute__ ((noinline))
int
my_fls64 (__u64 x)
{
  asm volatile ("movl $-1, %eax");
  return (__builtin_clzll (x) ^ 63) + 1;
}

But obviously the volatile asm should not be needed.  I think bsrq is
incorrectly modelled as returning full register

(define_insn "bsr_rex64"
  [(set (match_operand:DI 0 "register_operand" "=r")
(minus:DI (const_int 63)
  (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_64BIT"
  "bsr{q}\t{%1, %0|%0, %1}"
  [(set_attr "type" "alu1")
   (set_attr "prefix_0f" "1")
   (set_attr "znver1_decode" "vector")
   (set_attr "mode" "DI")])



Re: [Bug c/97445] Some fonctions marked static inline in Linux kernel are not inlined

2020-10-20 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97445
> 
> --- Comment #31 from Segher Boessenkool  ---
> (In reply to Jan Hubicka from comment #27)
> > It is because --param inline-insns-single was reduced for -O2 from 200
> > to 70.  GCC 10 has newly different set of parameters for -O2 and -O3 and
> > enables auto-inlining at -O2.
> > 
> > Problem with inlininig funtions declared inline is that C++ codebases
> > tends to abuse this keyword for things that are really too large (and
> > get_order would be such example if it did not have builtin_constant_p
> > check which inliner does not understand well). So having same limit at
> > -O2 and -O3 turned out to be problematic with respect to code size and
> > especially with respect to LTO, where a lot more inlining oppurtunities
> > appear.
> 
> Do the heuristics account for that not inlining a "static inline" results
> in multiple copies?

It prevents inlining only when there are multiple calls in the unit
being compiled (there is no way to know that the same inline function is
duplicated in other units).
This is what happens here: there are multiple calls so inliner concludes
inlining would cost too much of code size and later they are optimized
away.

get_order is a wrapper around ffs64.  This can be implemented w/o asm
statement as follows:
int
my_fls64 (__u64 x)
{
  if (!x)
  return 0;
  return 64 - __builtin_clzl (x);
}

This results in longer assembly than the kernel asm implementation. If
that matters I would replace builtin_constant_p part of get_order by this
implementation that is more transparent to the code size estimation and
things will get inlined.

Honza


Re: [Bug c/97445] Some fonctions marked static inline in Linux kernel are not inlined

2020-10-20 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97445
> 
> --- Comment #23 from Christophe Leroy  ---
> (In reply to Jan Hubicka from comment #19)
> > 
> > It is always possible to always_inline functions that are intended to be
> > always inlined.
> > Honza
> 
> Yes and I sent a patch for that to the Linux kernel, but what I would like to
> understand is why does GCC 10 completely fails to inline that while GCC 9 was
> doing things properly ?

It is because --param inline-insns-single was reduced for -O2 from 200
to 70.  GCC 10 has newly different set of parameters for -O2 and -O3 and
enables auto-inlining at -O2.

Problem with inlining functions declared inline is that C++ codebases
tend to abuse this keyword for things that are really too large (and
get_order would be such example if it did not have builtin_constant_p
check which inliner does not understand well). So having same limit at
-O2 and -O3 turned out to be problematic with respect to code size and
especially with respect to LTO, where a lot more inlining opportunities
appear.

I will implement the heuristics to push up inline limits of functions
having builtin_constant_p of parameter which should help a bit in this
case (but not very systematically: as discussed in the PR log it is quite
hard problem to get builtin_constant_p right in the code size metrics
used by inliner before it knows exactly what is going to be constant and
what is not).

Honza


Re: [Bug gcov-profile/97461] [11 Regression] allocate_gcov_kvp() deadlocks in firefox LTO+PGO build (overridden malloc() recursion)

2020-10-19 Thread Jan Hubicka
> 
> They have the very same problem when I disable a statically pre-allocated
> buffers with -mllvm -vp-static-alloc=0:
> 
> Program received signal SIGILL, Illegal instruction.
> 0x004014e6 in calloc (nmemb=1, size=8) at pr97461.c:103
> 103 if (malloc_depth != 0) __builtin_trap();
> (gdb) bt
> #0  0x004014e6 in calloc (nmemb=1, size=8) at pr97461.c:103
> #1  0x00401ae1 in allocateValueProfileCounters (Data=0x40a2c8) at
> /home/marxin/Programming/llvm-project/compiler-rt/lib/profile/InstrProfilingValue.c:101
> #2  0x00401c45 in instrumentTargetValueImpl (CountValue=1,
> CounterIndex=0, Data=0x40a2c8, TargetValue=4199264) at
> /home/marxin/Programming/llvm-project/compiler-rt/lib/profile/InstrProfilingValue.c:146
> #3  __llvm_profile_instrument_target (TargetValue=4199264, Data=0x40a2c8,
> CounterIndex=0) at
> /home/marxin/Programming/llvm-project/compiler-rt/lib/profile/InstrProfilingValue.c:232
> #4  0x0040148f in malloc_impl (size=56) at pr97461.c:85
> #5  0x004013fe in malloc (size=56) at pr97461.c:95
> #6  0x77e048a3 in __add_to_environ (name=0x406138
> "__LLVM_PROFILE_RT_INIT_ONCE", value=, combined= out>,
> replace=) at setenv.c:215
> #7  0x00402ce4 in truncateCurrentFile ()
> #8  0x004039bc in parseAndSetFilename ()
> #9  0x00404134 in __llvm_profile_initialize ()
> #10 0x00405e95 in __libc_csu_init (argc=argc@entry=1,
> argv=argv@entry=0x7fffdfa8, envp=0x7fffdfb8) at elf-init.c:89
> #11 0x77decd9a in __libc_start_main (main=0x401580 , argc=1,
> argv=0x7fffdfa8, init=0x405e50 <__libc_csu_init>, fini=,
> rtld_fini=, stack_end=0x7fffdf98) at 
> ../csu/libc-start.c:270
> #12 0x004012aa in _start () at ../sysdeps/x86_64/start.S:120

Hmm, it seems to me that having some entries preallocated by default
would be a way to avoid this problem in the majority of cases w/o having to
modify the upstream packages. 

Honza


Re: [Bug gcov-profile/97461] [11 Regression] allocate_gcov_kvp() deadlocks in firefox LTO+PGO build (overridden malloc() recursion)

2020-10-19 Thread Jan Hubicka
> No. The only thing we support is a recursive malloc as seen in:
> ./gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
> 
> It was added in g:bc2b1a232b1825b421a1aaa21a0865b2d1e4e08c as we use a
> statically allocated buffer when we recursively entry allocate_gcov_kvp.
> 
> However this is different as we can't call malloc/calloc from the function as
> we're in code that initializes a memory allocator.
> 
> We can mitigate the issue with a pair of new functions __gcov_supress_malloc
> and __gcov_alloc_malloc that will be called by a custom memory allocator.
> 
> What do you think about it?

How does this work with the llvm implementation (that is very similar here,
right?)

Honza


Re: [Bug ipa/97292] [11 Regression] dealII from SPECCPU 2016 no longer terminates after g:c34db4b6f8a5d80367c709309f9b00cb32630054

2020-10-08 Thread Jan Hubicka
Hi,
the following patch should let us to pinpoint the wrong disambiguation.
With -fdump-tree-all-details we should also see the difference in dump
file.

Honza

diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index cf8775b2b66..07946a85ecc 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -171,6 +171,7 @@ DEBUG_COUNTER (if_after_reload)
 DEBUG_COUNTER (if_conversion)
 DEBUG_COUNTER (if_conversion_tree)
 DEBUG_COUNTER (ipa_cp_bits)
+DEBUG_COUNTER (ipa_mod_ref)
 DEBUG_COUNTER (ipa_sra_params)
 DEBUG_COUNTER (ipa_sra_retvalues)
 DEBUG_COUNTER (ira_move)
diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
index 97dc4ac8814..ba208604c30 100644
--- a/gcc/tree-ssa-alias.c
+++ b/gcc/tree-ssa-alias.c
@@ -2470,6 +2470,9 @@ modref_may_conflict (const gimple *stmt,
   if (tt->every_base)
 return true;
 
+  if (!dbg_cnt (ipa_mod_ref))
+return true;
+
   base_set = ao_ref_base_alias_set (ref);
 
   ref_set = ao_ref_alias_set (ref);


Re: [Bug tree-optimization/97159] [11 Regression] segfault in modref_may_conflict

2020-09-22 Thread Jan Hubicka
Recursion is handled in normal compilation (we analyze the function and
while hitting the recursive call we skip the summary). I suppose here
the problem is missing LTO and offloading. 

With LTO, the LTO summaries (which include types) are streamed out while they
are turned into non-LTO summaries at ltrans time.  We need to do a similar
thing with offload.

Honza


Re: [Bug bootstrap/96794] --with-build-config=bootstrap-lto-lean with --enable-link-mutex leads to poor LTRANS utilization

2020-08-26 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96794
> 
> --- Comment #4 from Martin Liška  ---
> > > For jobserver they are still running even though they sleep.
> > Aha, so it is extra locking mechanizm we add without jobserver
> > knowledge.
> 
> It's unrelated to jobserver, one can enable it with configure option mentioned
> in the title.

Yep, but that is where I see the problem - if we simply wait for the lock
and the jobserver does not know that, it counts it as a job.
> > This is of course still not very pretty, but it is impossible to tell in
> > advance what job is big and what job is small.
> 
> Sure, it's all quite compilicated. One needs to negotiate with jobserver :)

Yep...
Honza
> 
> I'm going to collect graph w/o --enable-link-mutex on my machine.
> 
> > 
> > Honza


Re: [Bug bootstrap/96794] --with-build-config=bootstrap-lto-lean with --enable-link-mutex leads to poor LTRANS utilization

2020-08-26 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96794
> 
> --- Comment #2 from Martin Liška  ---
> (In reply to Jan Hubicka from comment #1)
> > > As seen
> > > here:https://gist.githubusercontent.com/marxin/223890df4d8d8e490b6b2918b77dacad/raw/7e0363da60dcddbfde4ab68fa3be755515166297/gcc-10-with-zstd.svg
> > > 
> > > each blocking linking of a GCC front-end leads to a wasted jobserver 
> > > worker.
> > Hmm, I am not sure how to interpret the graph. I can see that there is a
> > funny staircase of ltranses but how that relates to jobserver workers?
> 
> Yes, I mean the staircase of LTRANS because at the beginning N-1 links are
> waiting for lock:
> 
> [  299s] lock-and-run.sh: (PID 7351) waiting 0 sec to acquire linkfe.lck from
> PID 7347
> ...
> 
> For jobserver they are still running even though they sleep.
Aha, so it is an extra locking mechanism we add without the jobserver's
knowledge.
> 
> 
> > We limit makefile to link a binary at a time to avoid Richi's box getting
> > out of memory, right?
> 
> No. It's because we want to have a reasonable contrains which is right now 
> 8GB.
> Without --enable-link-mutex, we would consume ~ 10 x 1.3 GB (plus WPA parallel
> streaming peak), which is probably not desired.

10x1.3GB will get consumed only if the building machine has 10 threads.
I wonder if the jobserver WPA streaming integration will happen this
year; with that and some patches to WPA memory use we could fit in 8GB
unless very large parallelism is configured.

I suppose the only really effective solution would be to teach the jobserver
that some jobs are "big" and consume multiple tokens, that is WPA, while
other jobs like ltranses and streaming are small.

This is of course still not very pretty, but it is impossible to tell in
advance what job is big and what job is small.

Honza


Re: [Bug bootstrap/96794] New: --with-build-config=bootstrap-lto-lean with --enable-link-mutex leads to poor LTRANS utilization

2020-08-26 Thread Jan Hubicka
> As seen
> here:https://gist.githubusercontent.com/marxin/223890df4d8d8e490b6b2918b77dacad/raw/7e0363da60dcddbfde4ab68fa3be755515166297/gcc-10-with-zstd.svg
> 
> each blocking linking of a GCC front-end leads to a wasted jobserver worker.
Hmm, I am not sure how to interpret the graph. I can see that there is a
funny staircase of ltranses but how that relates to jobserver workers?
We limit makefile to link a binary at a time to avoid Richi's box getting
out of memory, right?

The number of partitions is currently 128. What is 100% of CPU usage for you?

Honza


Re: [Bug ipa/96337] [10/11 Regression] GCC 10.2: twice as slow for -O2 -march=x86-64 vs. GCC 9.3/8.4

2020-08-01 Thread Jan Hubicka
> I think, this inliner change needs to be reverted. People expect -O2 to 
> produce
> decently optimized binaries, and starting with gcc 10.x it doesn't deliver. 
> -O3
> traditionally enabled optimizations that may or may not improve performance
> (and historically, sometimes even break code), so most projects don't use it.
I wrote a short description of inliner changes to the phoronix
discussion
https://www.phoronix.com/forums/forum/software/programming-compilers/1196789-gcc-benchmarks-at-varying-optimization-levels-with-core-i9-10900k-show-an-unexpected-surprise/page5
comment 44.

The inliner changes were not aimed at making compile time faster and
compiled code slower. They were intended to reflect modern C++ codebases
more closely and get faster binaries (at -O2 and -O2 -flto) without
regressing in code size.  In fact more inlining happens and thus we
needed to optimize the inliner code carefully to avoid regressions with LTO.

It was benchmarked on a wide range of benchmarks including some where
phoronix measured a degradation before the GCC 10 release.

The benchmarks presented do not reproduce and seem odd. 50% on very
simple benchmarks is a bit too much for a change in one optimization.  It
seems more like thermal throttling. Michael promised to re-run the tests
and he is still speaking about that in the last reply from the 31st.

Testcases are greatly welcome.

Honza


Re: [Bug ipa/96337] [10/11 Regression] GCC 10.2: twice as slow for -O2 -march=x86-64 vs. GCC 9.3/8.4

2020-07-28 Thread Jan Hubicka
> 
> Maybe you want to use same GCC version as phoronix used (GCC 10.2)?
OK, I will give it a try, but there are no inliner changes in gcc 10.2
compared to 10.1.

Honza


Re: [Bug lto/95548] ice in tree_to_shwi, at tree.c:7321

2020-06-05 Thread Jan Hubicka
> I think Honza ran into this himself.
Yep, i converted code to use wide-ints. But it is nice to have short
testcase.

Honza


Re: [Bug tree-optimization/91322] [10 regression] alias-4 test failure

2020-04-04 Thread Jan Hubicka
> Which ARM target has 16-bit int?
> I don't see INT_TYPE_SIZE nor SHORT_TYPE_SIZE defined in config/arm/*, neither
> BITS_PER_WORD, so all depends on UNITS_PER_WORD, which is 4 and thus short is
> 16-bit and int is 32-bit.

Hmm, you are right - I messed up target triplets. With arm-linux-gnueabi
I see 4 byte int and the testcase calls abort.
However it is still missed optimization.  I will check why we end up
with different code than x86 LTO.

Honza


Re: [Bug ipa/93318] [10 regression] Firefox LTO+FDO ICEs in speculative_call_info

2020-01-19 Thread Jan Hubicka
Ok,
I managed to reproduce the crash locally (it was not that easy)
At the point of failure the node passes verification and I suppose
problem is that the call stmt hash contains indirect call while it is
supposed to contain direct call.

The edge removal code probably replaces the direct edge by an indirect one
since it does not know about additional speculations.  I will continue tomorrow.

Honza


Re: [Bug tree-optimization/93084] [10 regression] Infinite loop in ipa-cp when building clang with LTO+PGO

2020-01-02 Thread Jan Hubicka
> xxx.localalias is gcc-generated as a noninterposable alias to xxx. But I guess
> target node returned by xxx.localalias->function_symbol() is not xxx. A simple
that ought to return xxx unless the target of localalias is thunk that
is not recursive.
> thing we can do is to write a simple case to force generation of .localalias
> and test that.
To do that you can probably write simple C++ self-recursive inline
function and compile with -fpic.

Honza


Re: [Bug tree-optimization/93084] [10 regression] Infinite loop in ipa-cp when building clang with LTO+PGO

2019-12-30 Thread Jan Hubicka
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93084
> 
> --- Comment #6 from fxue at gcc dot gnu.org ---
> Could you share how you build clang with PGO, and train workload?
It needs a lot of patience.   If you have a patch I can try it since I
still have the train data and corresponding gcc tree.

I do the following. Pretty much imitating what
utils/collect_and_build_with_pgo.py does.

First I build instrumented compiler and train it on testsuite (it takes
a long time and probably could be omitted)

cmake -G Ninja /aux/hubicka/llvm \
  -DCLANG_TABLEGEN=/aux/hubicka/llvm/out/stage1/bin/clang-tblgen \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_CXX_COMPILER=/aux/hubicka/trunk-install/bin/g++ \
  -DCMAKE_C_COMPILER=/aux/hubicka/trunk-install/bin/gcc \
  -DCMAKE_INSTALL_PREFIX=/aux/hubicka/llvm10-install-gccfdolto \
  -DLLVM_BINUTILS_INCDIR=/aux/hubicka/binutils-install/include/ \
  -DLLVM_BUILD_RUNTIME=No \
  -DLLVM_TABLEGEN=/aux/hubicka/llvm/out/stage1/bin/llvm-tblgen \
  -DLLVM_TARGETS_TO_BUILD=X86 \
  -DCMAKE_RANLIB=/aux/hubicka/trunk-install/bin/gcc-ranlib \
  -DCMAKE_AR=/aux/hubicka/trunk-install/bin/gcc-ar \
  -DCMAKE_C_FLAGS="-O2 -flto=auto -flifetime-dse=1 -fno-semantic-interposition 
-fprofile-generate" \ 
  -DCMAKE_CXX_FLAGS="-O2 -flto=auto -flifetime-dse=1 
-fno-semantic-interposition -fprofile-generate" \
  -DLLVM_PARALLEL_LINK_JOBS=1
ninja clang lld
ninja check-llvm check-clang

next I do train build in new directory.

cmake -G Ninja /aux/hubicka/llvm \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_CXX_COMPILER=/aux/hubicka/llvm/out/instrumented-gcc/bin/clang++ \
  -DCMAKE_C_COMPILER=/aux/hubicka/llvm/out/instrumented-gcc/bin/clang
ninja all

then I tar gcda files collected in the first build directory

cd /aux/hubicka/llvm/instrumented-gcc/
tar czvf ../clang-profile.tgz `find . -name "*.gcda"`

and do final build supplying the gcda files.

tar xzvf ../clang-profile.tgz
cmake -G Ninja /aux/hubicka/llvm \
  -DCLANG_TABLEGEN=/aux/hubicka/llvm/out/stage1/bin/clang-tblgen \
  -DCMAKE_BUILD_TYPE=Release  -DCMAKE_C_FLAGS=-Wno-backend-plugin  \
  -DLLVM_TABLEGEN=/aux/hubicka/llvm/out/stage1/bin/llvm-tblgen \
  -DCMAKE_RANLIB=/aux/hubicka/trunk-install/bin/gcc-ranlib \
  -DCMAKE_AR=/aux/hubicka/trunk-install/bin/gcc-ar \
  -DCMAKE_C_FLAGS="-O2 -fno-semantic-interposition -fprofile-use 
-Wno-missing-profile -flto=auto -flifetime-dse=1  -fprofile-correction 
-Wno-error" \
  -DCMAKE_CXX_FLAGS="-O2 -fno-semantic-interposition -fprofile-use 
-Wno-missing-profile -flifetime-dse=1  -flto=auto -fprofile-correction 
-Wno-error" \
  -DLLVM_TARGETS_TO_BUILD=X86 \
  -DCMAKE_CXX_COMPILER=/aux/hubicka/trunk-install/bin/g++ \
  -DCMAKE_C_COMPILER=/aux/hubicka/trunk-install/bin/gcc  \
  -DLLVM_BINUTILS_INCDIR=/aux/hubicka/binutils-install/include/ \
  -DLLVM_PARALLEL_LINK_JOBS=1
ninja all


Re: [Bug rtl-optimization/68664] [6/7 Regression] Speculative sqrt in c-ray main loop causes large slow down

2017-02-06 Thread Jan Hubicka
> 
> I don't think so.  But I don't know much about that bug, it is something
> with AVX I think?  If you are talking about PR79224.

I see, we have separate PR for that, good ;)
> 
> > Also with profile feedback perhaps you have enough info to tell that the
> > speculative path is almost as likely as the original placement.
> 
> Maybe.  The gain will be tiny though, afaics.

Usually the profile is useful for driving speculation.  I would add a
param that prevents moving an insn when bb->count or bb->frequency increases
by a large amount (like 100 times).  Speculatively executing an insn that will
be ignored 99% of the time is probably not paying back...
Without profile that will probably only trigger for paths leading to abort()
or similar cases, but with profiles that may hit quite common scenarios.

Honza
> 
> > I wonder if we can provie resonable default by RTX cost...
> 
> Not for rs6000 at least.


Re: [Bug rtl-optimization/68664] [6/7 Regression] Speculative sqrt in c-ray main loop causes large slow down

2017-02-06 Thread Jan Hubicka
> Scheduling should never move very expensive instructions to places they
> are executed more frequently.  This patch fixes that, reducing the
> execution time of c-ray by over 40% (I tested on a BE Power7 system).
> 
> This introduces a new target hook sched.can_speculate_insn which returns
> whether the scheduler is allowed to speculate a given instruction.  The
> rs6000 implementation disallows all divide and square root instructions.
> 
> 
> PR rtl-optimization/68664
> * target.def (can_speculate_insn): New hook.
> * doc/tm.texi.in (TARGET_SCHED_CAN_SPECULATE_INSN): New hook.
> * doc/tm.texi: Regenerate.
> * sched-rgn.c (can_schedule_ready_p): Use the new hook.
> * config/rs6000/rs6000.c (TARGET_SCHED_CAN_SPECULATE_INSN): New macro.
> (rs6000_sched_can_speculate_insn): New function.

There was also a regression on c-ray for x86-64
https://gcc.opensuse.org/c++bench-czerny/c-ray/
Is it the same issue?

Also with profile feedback perhaps you have enough info to tell that the
speculative path is almost as likely as the original placement.

I wonder if we can provide a reasonable default by RTX cost...

Honza


Re: [Bug lto/65559] [5 Regression] lto1.exe: internal compiler error: in read_cgraph_and_symbols, at lto/lto.c:2947

2015-04-06 Thread Jan Hubicka
Can you please compile with --verbose --save-temps and attach the output + 
temporary files produced?
(in particular I wonder about resolution file that should be named *.res)


Re: [Bug target/65660] [5 Regression] 252.eon regression on bdver2 with -Ofast

2015-04-04 Thread Jan Hubicka
Thanks,
32-bit eon runs improved today, though I am not 100% sure whether it is due to
vectorization or the unit growth change
http://gcc.opensuse.org/SPEC/CINT/sb-frescobaldi.suse.de-head-64-32o-32bit/252_eon_recent_big.png
Overall we had better scores on 32bit eon in the past however
http://gcc.opensuse.org/SPEC/CINT/sb-frescobaldi.suse.de-head-64-32o-32bit/252_eon_big.png

Honza


Re: [Bug ipa/65516] lto1: internal compiler error: in get_odr_type, at ipa-devirt.c:1809

2015-03-22 Thread Jan Hubicka
I committed the change to mainline, so you only need to update the tree.

Honza


Re: [Bug c++/65328] New: GCC perf issue when compiling templates - 120x slower than Clang

2015-03-05 Thread Jan Hubicka
Can you please attach the preprocessed source files (generated with -E) to
reproduce the problem and also compile with -ftime-report and post the output?


Re: [Bug target/63890] [4.9/5 regression] Compiling trivial program with -O -p leads to misaligned stack

2015-02-26 Thread Jan Hubicka
Hi,
the problem is that darwin outputs the profiler call after the prologue.
With ACCUMULATE_OUTGOING_ARGS we leave the stack pointer aligned to boundary - 4
(for the return pointer); without it we leave it aligned.
I guess it is possible to compensate this in FUNCTION_PROFILER, but I am not
convinced this will work reliably. Probably disabling combine-stack-adjustments
would be needed, too.
(I do not think it is reliable with ACCUMULATE_OUTGOING_ARGS either)

Honza

Index: config/i386/darwin.h
===
--- config/i386/darwin.h(revision 221034)
+++ config/i386/darwin.h(working copy)
@@ -210,6 +210,8 @@ extern int darwin_emit_branch_islands;
 #undef FUNCTION_PROFILER
 #define FUNCTION_PROFILER(FILE, LABELNO)   \
 do {   \
+  if (!ACCUMULATE_OUTGOING_ARGS)   \
+fprintf (FILE, \tpushl %%ebx\n);  /*  Align stack  */
\
   if (TARGET_MACHO_BRANCH_ISLANDS  \
MACHOPIC_INDIRECT  !TARGET_64BIT)   \
{   \
@@ -218,6 +220,8 @@ extern int darwin_emit_branch_islands;
  machopic_validate_stub_or_non_lazy_ptr (name);\
}   \
   else fprintf (FILE, \tcall mcount\n);  \
+  if (!ACCUMULATE_OUTGOING_ARGS)   \
+   fprintf (FILE, \tpopl %%ebx\n);  /*  Align stack  */  \
 } while (0)
 
 #define C_COMMON_OVERRIDE_OPTIONS  \
Index: config/i386/i386.c
===
--- config/i386/i386.c  (revision 221034)
+++ config/i386/i386.c  (working copy)
@@ -10061,6 +10061,13 @@ ix86_compute_frame_layout (struct ix86_f
   if (crtl-stack_alignment_needed  PREFERRED_STACK_BOUNDARY)
crtl-stack_alignment_needed = PREFERRED_STACK_BOUNDARY;
 }
+  /* Be sure we get stack aligned for mcount call.  */
+  else if (crtl-profile  flag_fentry)
+{
+  crtl-preferred_stack_boundary = PREFERRED_STACK_BOUNDARY;
+  if (crtl-stack_alignment_needed  PREFERRED_STACK_BOUNDARY)
+   crtl-stack_alignment_needed = PREFERRED_STACK_BOUNDARY;
+}
 
   stack_alignment_needed = crtl-stack_alignment_needed / BITS_PER_UNIT;
   preferred_alignment = crtl-preferred_stack_boundary / BITS_PER_UNIT;


  1   2   >