Re: [PATCH][RFC] Add new ipa-reorder pass

2019-09-25 Thread Evgeny Kudryashov

On 2019-09-19 11:33, Martin Liška wrote:

Hi.

Function reordering has been around for quite some time and a naive
implementation was also part of my diploma thesis some time ago.
Currently, the GCC can reorder function based on first execution, which
happens with PGO and LTO of course. Known limitation is that the order
is preserved only partially as various symbols go into different
LTRANS partitions.

There has been some research in the area and I would point out the
Facebook paper
([1]) and Sony presentation ([2]). Based on that, I decided to make a
new implementation
in the GCC that does the same (in a proper way). First part of the
enablement are patches
to ld.bfd and ld.gold that come up with a new section .text.sorted,
that is always sorted.

Thoughts? I would definitely welcome any interesting measurement on a
bigger load.

Martin



Hi, Martin!

Some time ago I tried to do the same but didn't go that far.

I also used the C3 algorithm, except for the fact that ipa_fn_summary 
contains information about size and time (somehow missed it). The linker 
option --sort-section=name was used for prototyping. It, obviously, 
sorts sections and allows to place functions in the desired order (by 
placing them into named sections .text.sorted., without patching 
linkers or adjusting linker scripts). For testing my implementation I 
used several benchmarks from SPEC2006: perlbench, sjeng and gobmk. 
Unfortunately, no significant positive changes were obtained.


I've tested the proposed pass on perlbench with train and reference 
input (PGO+LTO as a base) but couldn't obtain stable results (still 
unclear environment or perlbench specific reasons).


Evgeny.



[RFC][PATCH] ipa: fix dumping with deleted multiversioning nodes

2017-09-18 Thread Evgeny Kudryashov

Hello,

The code below causes an internal compiler error in cc1plus (trunk on 
x86-64) if it is compiled with -fdump-ipa-cgraph.


int foo () __attribute__ ((target ("default")));
int foo () __attribute__ ((target ("sse4.2")));

__attribute__ ((target ("sse4.2")))
int foo ()
{
  return 1;
}

The error occurs in cgraph_node::dump (gcc/cgraph.c:2065), particularly, 
in the following fragment:


cgraph_function_version_info *vi = function_version ();
if (vi != NULL)
  {
fprintf (f, "  Version info: ");
if (vi->prev != NULL)
  {
fprintf (f, "prev: ");
fprintf (f, "%s ", vi->prev->this_node->dump_asm_name ());
  }
if (vi->next != NULL)
  {
fprintf (f, "next: ");
fprintf (f, "%s ", vi->next->this_node->dump_asm_name ());
  }
if (vi->dispatcher_resolver != NULL_TREE)
  fprintf (f, "dispatcher: %s",
   lang_hooks.decl_printable_name (vi->dispatcher_resolver, 
2));


fprintf (f, "\n");
  }


The expression "vi->{prev,next}->this_node" can be null if it is a 
version of an unused symbol that was removed.


Is it intentional that removing a cgraph node does not also remove 
versions?


As a solution I suggest to delete the version of the node too during 
node removal.


Regards,
Evgeny.

* cgraph.c (delete_function_version): New, broken out from...
(cgraph_node::delete_function_version): ...here.  Rename to
cgraph_node::delete_function_version_by_decl.  Update all uses.
(cgraph_node::remove): Call delete_function_version.diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index 8bffdec..3d0cefb 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -190,30 +190,34 @@ cgraph_node::insert_new_function_version (void)
   return version_info_node;
 }
 
-/* Remove the cgraph_function_version_info and cgraph_node for DECL.  This
-   DECL is a duplicate declaration.  */
-void
-cgraph_node::delete_function_version (tree decl)
+/* Remove the cgraph_function_version_info node given by DECL_V.  */
+static void
+delete_function_version (cgraph_function_version_info *decl_v)
 {
-  cgraph_node *decl_node = cgraph_node::get (decl);
-  cgraph_function_version_info *decl_v = NULL;
-
-  if (decl_node == NULL)
-return;
-
-  decl_v = decl_node->function_version ();
-
   if (decl_v == NULL)
 return;
 
   if (decl_v->prev != NULL)
-   decl_v->prev->next = decl_v->next;
+decl_v->prev->next = decl_v->next;
 
   if (decl_v->next != NULL)
 decl_v->next->prev = decl_v->prev;
 
   if (cgraph_fnver_htab != NULL)
 cgraph_fnver_htab->remove_elt (decl_v);
+}
+
+/* Remove the cgraph_function_version_info and cgraph_node for DECL.  This
+   DECL is a duplicate declaration.  */
+void
+cgraph_node::delete_function_version_by_decl (tree decl)
+{
+  cgraph_node *decl_node = cgraph_node::get (decl);
+
+  if (decl_node == NULL)
+return;
+
+  delete_function_version (decl_node->function_version ());
 
   decl_node->remove ();
 }
@@ -1844,6 +1848,7 @@ cgraph_node::remove (void)
   remove_callers ();
   remove_callees ();
   ipa_transforms_to_apply.release ();
+  delete_function_version (function_version ());
 
   /* Incremental inlining access removed nodes stored in the postorder list.
  */
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index c668b37..1303db0 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -1272,7 +1272,7 @@ public:
 
   /* Remove the cgraph_function_version_info and cgraph_node for DECL.  This
  DECL is a duplicate declaration.  */
-  static void delete_function_version (tree decl);
+  static void delete_function_version_by_decl (tree decl);
 
   /* Add the function FNDECL to the call graph.
  Unlike finalize_function, this function is intended to be used
diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 858747e..50fa1ba 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -2566,7 +2566,7 @@ next_arg:;
   DECL_FUNCTION_VERSIONED (newdecl) = 1;
   /* newdecl will be purged after copying to olddecl and is no longer
  a version.  */
-  cgraph_node::delete_function_version (newdecl);
+  cgraph_node::delete_function_version_by_decl (newdecl);
 }
 
   if (TREE_CODE (newdecl) == FUNCTION_DECL)


Re: [PATCH, RFC] Improve ivopts group costs

2016-11-12 Thread Evgeny Kudryashov

On 2016-11-10 13:30, Bin.Cheng wrote:

Hi,
I see the cost problem with your test now.  When computing an address
type iv_use with a candidate, the computation consists of two parts,
for computation can be represented by addressing mode, it is done in
memory reference; for computation cannot be represented by addressing
mode, it is done outside of memory reference.  The final cost is added
up from the two computation parts.
For address iv_use:
MEM[base + biv << scale + offset]
when it is computed with below candidate on target only supports [base
+ biv << scale] addressing mode:
biv
The computations would be like:
base' = base + offset
MEM[base' + biv << scale]
Both computations has its own cost, the first one is normal RTX cost,
the second one is addressing mode cost.  Final cost is added up from
both parts.

Normally, all these cost should be added up in cost model, but there
should be one exception found in your test: If iv_uses of a group has
exactly the same iv ({base, step}), the first part computation (RTX)
can be shared among all iv_uses, thus the cost should only counted one
time.  That is, we should be able to model such CSE opportunities.
Apparently, we can't CSE the second part computation, of course there
won't be CSE opportunities in address expression anyway.


Hi Bin,
Yes, that is exactly what happens. And this computation might be cheaper 
than initialization and increment of new iv and it would be more 
preferable.



That said, this patch should make difference between cost of RTX
computation and address expression, and only add up RTX cost once if
it can be CSEed.  Well, it might be not trivial to check CSE
opportunities of RTX computation, for example, some iv_uses of the
group are the same, others are not.

Thanks,
bin


Since uses in a given group have the same base and step, they can only 
differ by offsets. Among those, equivalent offsets can be CSE'd. Then, 
perhaps it's possible to use a hash set of unique offsets in this group 
cost estimation loop, and count RTX computation cost only when adding a 
new entry to the set. What do you think about this approach?


While working on this issue, I've found another problem: that costs may 
become negative. That looks unintended, I have filed a new bug: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78332


Thanks,
Evgeny.


[PATCH, RFC] Improve ivopts group costs

2016-11-03 Thread Evgeny Kudryashov

Hello,

I'm facing the following problem related to ivopts. The problem is that 
GCC generates a lot of induction variables and as a result there is an 
unnecessary increase of stack usage and register pressure.


For instance, for the attached testcase (tc_ivopts.c) GCC generates 26 
induction variables (25 for each of lhsX[{0-4}][{0-4}][k] and one for 
rhs[k][j][{0-2}]). You should be able to reproduce this issue on arm 
target using GCC with "-O2 -mcpu=cortex-a9 -mtune=cortex-a9". For 
reference, I'm attaching assembly I get on current trunk.


The reason might be in use groups costs, in particular, in the way of 
estimation. Currently, the cost of a tuple (group, candidate) is a sum 
of per-use costs in the group. So, the cost of a group grows 
proportional to the number of uses in the group. This approach has a 
negative effect on the algorithm for finding the best set of induction 
variables: the part of a total cost related to adding a new candidate is 
almost washed out by the cost of the group. In addition, when there is a 
lot of groups with many uses inside and a target is out of free 
registers, the cost of spill is washing out too. As a result, GCC 
prefers to use native candidates (candidate created for a particular 
group) for each group of uses instead of considering the real cost of 
introducing a new variable into a set.


The summing approach was added as a part of this patch 
(https://gcc.gnu.org/ml/gcc-patches/2015-05/msg00641.html) and the 
motivation for taking the sum does not seem to be

specifically discussed.

I propose the following patch that changes a group cost from cost 
summing to selecting the largest one inside a group. For the given test 
case I have: necessary size of stack is decreased by almost 3 times and 
ldr\str amount are decreased by less than 2 times. Also I'm attaching 
assembly after applying the patch.


The essential change in the patch is just:

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index f9211ad..a149418 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5151,7 +5151,8 @@ determine_group_iv_cost_address (struct 
ivopts_data *data,

 offset and where the iv_use is.  */
cost = get_computation_cost (data, next, cand, true,
 NULL, &can_autoinc, NULL);
-  sum_cost += cost;
+  if (sum_cost < cost)
+sum_cost = cost;
 }
   set_group_iv_cost (data, group, cand, sum_cost, depends_on,
 NULL_TREE, ERROR_MARK, inv_expr);

Any suggestions?


Thanks,
Evgeny.#define SIZE 20
int gp22 = SIZE;
double rhs [SIZE][SIZE][5];

void x_solve()
{
  int j, k;
  double lhsX[5][5][gp22][gp22];
  for (j = 1; j < gp22; j++) {
for (k = 1; k < gp22; k++) {
  lhsX[0][0][j][k] -= lhsX[1][0][j][k];
  lhsX[0][1][j][k] -= lhsX[1][1][j][k];
  lhsX[0][2][j][k] -= lhsX[1][2][j][k];
  lhsX[0][3][j][k] -= lhsX[1][3][j][k];
  lhsX[0][4][j][k] -= lhsX[1][4][j][k];

  lhsX[2][0][j][k] -= lhsX[1][0][j][k];
  lhsX[2][1][j][k] -= lhsX[1][1][j][k];
  lhsX[2][2][j][k] -= lhsX[1][2][j][k];
  lhsX[2][3][j][k] -= lhsX[1][3][j][k];
  lhsX[2][4][j][k] -= lhsX[1][4][j][k];

  lhsX[3][0][j][k] -= lhsX[1][0][j][k];
  lhsX[3][1][j][k] -= lhsX[1][1][j][k];
  lhsX[3][2][j][k] -= lhsX[1][2][j][k];
  lhsX[3][3][j][k] -= lhsX[1][3][j][k];
  lhsX[3][4][j][k] -= lhsX[1][4][j][k];

  lhsX[4][0][j][k] -= lhsX[1][0][j][k];
  lhsX[4][1][j][k] -= lhsX[1][1][j][k];
  lhsX[4][2][j][k] -= lhsX[1][2][j][k];
  lhsX[4][3][j][k] -= lhsX[1][3][j][k];
  lhsX[4][4][j][k] -= lhsX[1][4][j][k];

  lhsX[0][0][j][k] -= lhsX[3][0][j][k];
  lhsX[0][1][j][k] -= lhsX[3][1][j][k];
  lhsX[0][2][j][k] -= lhsX[3][2][j][k];
  lhsX[0][3][j][k] -= lhsX[3][3][j][k];
  lhsX[0][4][j][k] -= lhsX[3][4][j][k];

  lhsX[2][0][j][k] -= lhsX[3][0][j][k];
  lhsX[2][1][j][k] -= lhsX[3][1][j][k];
  lhsX[2][2][j][k] -= lhsX[3][2][j][k];
  lhsX[2][3][j][k] -= lhsX[3][3][j][k];
  lhsX[2][4][j][k] -= lhsX[3][4][j][k];

  lhsX[4][0][j][k] -= lhsX[3][0][j][k];
  lhsX[4][1][j][k] -= lhsX[3][1][j][k];
  lhsX[4][2][j][k] -= lhsX[3][2][j][k];
  lhsX[4][3][j][k] -= lhsX[3][3][j][k];
  lhsX[4][4][j][k] -= lhsX[3][4][j][k];

  rhs[k][j][0] -= rhs[k][j][1];
  rhs[k][j][1] -= rhs[k][j][2];
}/*end j*/
  }/*end k*/
}
	.cpu cortex-a9
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 2
	.eabi_attribute 34, 1
	.eabi_attribute 18, 4
	.file	"bt.c"
	.global	__aeabi_dsub
	.text
	.align	2
	.global	x_solve
	.syntax unified
	.arm
	.fpu softvfp
	.type	x_solve, %function
x_solve:
	@ args = 0, pretend = 0, frame = 216
	@ frame_needed = 1, uses_anonymous_args = 0
	movw	r3, #:lower16:.LANCHOR0
	push	{r4, r5, r6, r7, r8, r9, r10, fp, lr}
	movt	r3, #:upper16:.LANCHOR0
	add	fp, sp, #32