[PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-10 Thread Namhyung Kim
From: Namhyung Kim 

Current collapse stage has a scalability problem which can be
reproduced easily with parallel kernel build.  This is because it
needs to traverse every children of callchain linearly during the
collapse/merge stage.  Convert it to rbtree reduced the overhead
significantly.

On my 400MB perf.data file which recorded with make -j32 kernel build:

  $ time perf --no-pager report --stdio > /dev/null

before:
  real  6m22.073s
  user  6m18.683s
  sys   0m0.706s

after:
  real  0m20.780s
  user  0m19.962s
  sys   0m0.689s

During the perf report the overhead on append_chain_children went down
from 96.69% to 18.16%:

  -  18.16%  perf  perf[.] append_chain_children
 - append_chain_children
- 77.48% append_chain_children
   + 69.79% merge_chain_branch
   - 22.96% append_chain_children
  + 67.44% merge_chain_branch
  + 30.15% append_chain_children
  + 2.41% callchain_append
   + 7.25% callchain_append
+ 12.26% callchain_append
+ 10.22% merge_chain_branch
  +  11.58%  perf  perf[.] dso__find_symbol
  +   8.02%  perf  perf[.] sort__comm_cmp
  +   5.48%  perf  libc-2.17.so[.] malloc_consolidate

Reported-by: Linus Torvalds 
Cc: Jiri Olsa 
Cc: Frederic Weisbecker 
Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
Signed-off-by: Namhyung Kim 
---
 tools/perf/util/callchain.c | 147 +---
 tools/perf/util/callchain.h |  11 ++--
 2 files changed, 116 insertions(+), 42 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 482f68081cd8..e3970e3eaacf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -21,12 +21,6 @@
 
 __thread struct callchain_cursor callchain_cursor;
 
-#define chain_for_each_child(child, parent)\
-   list_for_each_entry(child, &parent->children, siblings)
-
-#define chain_for_each_child_safe(child, next, parent) \
-   list_for_each_entry_safe(child, next, &parent->children, siblings)
-
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
enum chain_mode mode)
@@ -71,10 +65,16 @@ static void
 __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
  u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
-   chain_for_each_child(child, node)
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_flat(rb_root, child, min_hit);
+   }
 
if (node->hit && node->hit >= min_hit)
rb_insert_callchain(rb_root, node, CHAIN_FLAT);
@@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct 
callchain_root *root,
 static void __sort_chain_graph_abs(struct callchain_node *node,
   u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
node->rb_root = RB_ROOT;
+   n = rb_first(&node->rb_root_in);
+
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
 
-   chain_for_each_child(child, node) {
__sort_chain_graph_abs(child, min_hit);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct 
callchain_root *chain_root,
 static void __sort_chain_graph_rel(struct callchain_node *node,
   double min_percent)
 {
+   struct rb_node *n;
struct callchain_node *child;
u64 min_hit;
 
node->rb_root = RB_ROOT;
min_hit = ceil(node->children_hit * min_percent);
 
-   chain_for_each_child(child, node) {
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_graph_rel(child, min_percent);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool 
inherit_children)
return NULL;
}
new->parent = parent;
-   INIT_LIST_HEAD(&new->children);
INIT_LIST_HEAD(&new->val);
 
if (inherit_children) {
-   struct callchain_node *next;
+   struct rb_node *n;
+   struct callchain_node *child;
+
+   new->rb_root_in = parent->rb_root_in;
+   parent->rb_root_in = RB_ROOT;
 
-   list_splice(&parent->children, &new->children);
-   INIT_LIST_HEAD(&parent->children);
+   n = rb_first(&new->rb_root_in);
+   while (n) {
+   child = 

[PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-10 Thread Namhyung Kim
From: Namhyung Kim namhyung@lge.com

Current collapse stage has a scalability problem which can be
reproduced easily with parallel kernel build.  This is because it
needs to traverse every children of callchain linearly during the
collapse/merge stage.  Convert it to rbtree reduced the overhead
significantly.

On my 400MB perf.data file which recorded with make -j32 kernel build:

  $ time perf --no-pager report --stdio > /dev/null

before:
  real  6m22.073s
  user  6m18.683s
  sys   0m0.706s

after:
  real  0m20.780s
  user  0m19.962s
  sys   0m0.689s

During the perf report the overhead on append_chain_children went down
from 96.69% to 18.16%:

  -  18.16%  perf  perf[.] append_chain_children
 - append_chain_children
- 77.48% append_chain_children
   + 69.79% merge_chain_branch
   - 22.96% append_chain_children
  + 67.44% merge_chain_branch
  + 30.15% append_chain_children
  + 2.41% callchain_append
   + 7.25% callchain_append
+ 12.26% callchain_append
+ 10.22% merge_chain_branch
  +  11.58%  perf  perf[.] dso__find_symbol
  +   8.02%  perf  perf[.] sort__comm_cmp
  +   5.48%  perf  libc-2.17.so[.] malloc_consolidate

Reported-by: Linus Torvalds torva...@linux-foundation.org
Cc: Jiri Olsa jo...@redhat.com
Cc: Frederic Weisbecker fweis...@gmail.com
Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
Signed-off-by: Namhyung Kim namhy...@kernel.org
---
 tools/perf/util/callchain.c | 147 +---
 tools/perf/util/callchain.h |  11 ++--
 2 files changed, 116 insertions(+), 42 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 482f68081cd8..e3970e3eaacf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -21,12 +21,6 @@
 
 __thread struct callchain_cursor callchain_cursor;
 
-#define chain_for_each_child(child, parent)\
-   list_for_each_entry(child, &parent->children, siblings)
-
-#define chain_for_each_child_safe(child, next, parent) \
-   list_for_each_entry_safe(child, next, &parent->children, siblings)
-
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
enum chain_mode mode)
@@ -71,10 +65,16 @@ static void
 __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
  u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
-   chain_for_each_child(child, node)
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_flat(rb_root, child, min_hit);
+   }
 
if (node->hit && node->hit >= min_hit)
rb_insert_callchain(rb_root, node, CHAIN_FLAT);
@@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct 
callchain_root *root,
 static void __sort_chain_graph_abs(struct callchain_node *node,
   u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
node->rb_root = RB_ROOT;
+   n = rb_first(&node->rb_root_in);
+
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
 
-   chain_for_each_child(child, node) {
__sort_chain_graph_abs(child, min_hit);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct 
callchain_root *chain_root,
 static void __sort_chain_graph_rel(struct callchain_node *node,
   double min_percent)
 {
+   struct rb_node *n;
struct callchain_node *child;
u64 min_hit;
 
node->rb_root = RB_ROOT;
min_hit = ceil(node->children_hit * min_percent);
 
-   chain_for_each_child(child, node) {
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_graph_rel(child, min_percent);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool 
inherit_children)
return NULL;
}
new->parent = parent;
-   INIT_LIST_HEAD(&new->children);
INIT_LIST_HEAD(&new->val);
 
if (inherit_children) {
-   struct callchain_node *next;
+   struct rb_node *n;
+   struct callchain_node *child;
+
+   new->rb_root_in = parent->rb_root_in;
+   parent->rb_root_in = RB_ROOT;

-   list_splice(&parent->children, &new->children);
-  

Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-09 Thread Namhyung Kim
Hi Frederic,

On Tue, 8 Oct 2013 21:22:45 +0200, Frederic Weisbecker wrote:
> On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote:
>> On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
>> > Have you tested this patchset when collapsing is not used?
>> > There are fair chances that this patchset does not only improve collapsing
>> > but also callchain insertion in general. So it's probably a win in any 
>> > case. But
>> > still it would be nice to make sure that it's the case because we are 
>> > getting
>> > rid of collapsing anyway.
>> >
>> > The test that could tell us about that is to run "perf report -s sym" and 
>> > compare the
>> > time it takes to complete before and after this patch, because "-s sym" 
>> > shouldn't
>> > involve collapses.
>> >
>> > Sorting by anything that is not comm should do the trick in fact.
>> 
>> Yes, I have similar result when collapsing is not used.  Actually when I
>> ran "perf report -s sym", the performance improves higher since it'd
>> insert more callchains in a hist entry.
>
> Great! I'll have a closer look and review on the callchain patches then. 
> Please
> resend these along the comm batch.

I'll do it later today or tomorrow.

Thanks,
Namhyung
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-09 Thread Namhyung Kim
Hi Frederic,

On Tue, 8 Oct 2013 21:22:45 +0200, Frederic Weisbecker wrote:
 On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote:
 On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
  Have you tested this patchset when collapsing is not used?
  There are fair chances that this patchset does not only improve collapsing
  but also callchain insertion in general. So it's probably a win in any 
  case. But
  still it would be nice to make sure that it's the case because we are 
  getting
  rid of collapsing anyway.
 
  The test that could tell us about that is to run perf report -s sym and 
  compare the
  time it takes to complete before and after this patch, because -s sym 
  shouldn't
  involve collapses.
 
  Sorting by anything that is not comm should do the trick in fact.
 
 Yes, I have similar result when collapsing is not used.  Actually when I
 ran perf report -s sym, the performance improves higher since it'd
 insert more callchains in a hist entry.

 Great! I'll have a closer look and review on the callchain patches then. 
 Please
 resend these along the comm batch.

I'll do it later today or tomorrow.

Thanks,
Namhyung
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-08 Thread Frederic Weisbecker
On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote:
> On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
> > On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
> >> From: Namhyung Kim 
> >> 
> >> Current collapse stage has a scalability problem which can be
> >> reproduced easily with parallel kernel build.  This is because it
> >> needs to traverse every children of callchain linearly during the
> >> collapse/merge stage.  Convert it to rbtree reduced the overhead
> >> significantly.
> >> 
> >> On my 400MB perf.data file which recorded with make -j32 kernel build:
> >> 
> >>   $ time perf --no-pager report --stdio > /dev/null
> >> 
> >> before:
> >>   real 6m22.073s
> >>   user 6m18.683s
> >>   sys  0m0.706s
> >> 
> >> after:
> >>   real 0m20.780s
> >>   user 0m19.962s
> >>   sys  0m0.689s
> >> 
> >> During the perf report the overhead on append_chain_children went down
> >> from 96.69% to 18.16%:
> >> 
> >>   -  18.16%  perf  perf[.] append_chain_children
> >>  - append_chain_children
> >> - 77.48% append_chain_children
> >>+ 69.79% merge_chain_branch
> >>- 22.96% append_chain_children
> >>   + 67.44% merge_chain_branch
> >>   + 30.15% append_chain_children
> >>   + 2.41% callchain_append
> >>+ 7.25% callchain_append
> >> + 12.26% callchain_append
> >> + 10.22% merge_chain_branch
> >>   +  11.58%  perf  perf[.] dso__find_symbol
> >>   +   8.02%  perf  perf[.] sort__comm_cmp
> >>   +   5.48%  perf  libc-2.17.so[.] malloc_consolidate
> >> 
> >> Reported-by: Linus Torvalds 
> >> Cc: Jiri Olsa 
> >> Cc: Frederic Weisbecker 
> >> Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
> >> Signed-off-by: Namhyung Kim 
> >
> > Have you tested this patchset when collapsing is not used?
> > There are fair chances that this patchset does not only improve collapsing
> > but also callchain insertion in general. So it's probably a win in any 
> > case. But
> > still it would be nice to make sure that it's the case because we are 
> > getting
> > rid of collapsing anyway.
> >
> > The test that could tell us about that is to run "perf report -s sym" and 
> > compare the
> > time it takes to complete before and after this patch, because "-s sym" 
> > shouldn't
> > involve collapses.
> >
> > Sorting by anything that is not comm should do the trick in fact.
> 
> Yes, I have similar result when collapsing is not used.  Actually when I
> ran "perf report -s sym", the performance improves higher since it'd
> insert more callchains in a hist entry.

Great! I'll have a closer look and review on the callchain patches then. Please
resend these along the comm batch.

Thanks again!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-08 Thread Frederic Weisbecker
On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote:
 On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
  On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
  From: Namhyung Kim namhyung@lge.com
  
  Current collapse stage has a scalability problem which can be
  reproduced easily with parallel kernel build.  This is because it
  needs to traverse every children of callchain linearly during the
  collapse/merge stage.  Convert it to rbtree reduced the overhead
  significantly.
  
  On my 400MB perf.data file which recorded with make -j32 kernel build:
  
    $ time perf --no-pager report --stdio > /dev/null
  
  before:
real 6m22.073s
user 6m18.683s
sys  0m0.706s
  
  after:
real 0m20.780s
user 0m19.962s
sys  0m0.689s
  
  During the perf report the overhead on append_chain_children went down
  from 96.69% to 18.16%:
  
-  18.16%  perf  perf[.] append_chain_children
   - append_chain_children
  - 77.48% append_chain_children
 + 69.79% merge_chain_branch
 - 22.96% append_chain_children
+ 67.44% merge_chain_branch
+ 30.15% append_chain_children
+ 2.41% callchain_append
 + 7.25% callchain_append
  + 12.26% callchain_append
  + 10.22% merge_chain_branch
+  11.58%  perf  perf[.] dso__find_symbol
+   8.02%  perf  perf[.] sort__comm_cmp
+   5.48%  perf  libc-2.17.so[.] malloc_consolidate
  
  Reported-by: Linus Torvalds torva...@linux-foundation.org
  Cc: Jiri Olsa jo...@redhat.com
  Cc: Frederic Weisbecker fweis...@gmail.com
  Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
  Signed-off-by: Namhyung Kim namhy...@kernel.org
 
  Have you tested this patchset when collapsing is not used?
  There are fair chances that this patchset does not only improve collapsing
  but also callchain insertion in general. So it's probably a win in any 
  case. But
  still it would be nice to make sure that it's the case because we are 
  getting
  rid of collapsing anyway.
 
  The test that could tell us about that is to run perf report -s sym and 
  compare the
  time it takes to complete before and after this patch, because -s sym 
  shouldn't
  involve collapses.
 
  Sorting by anything that is not comm should do the trick in fact.
 
 Yes, I have similar result when collapsing is not used.  Actually when I
 ran perf report -s sym, the performance improves higher since it'd
 insert more callchains in a hist entry.

Great! I'll have a closer look and review on the callchain patches then. Please
resend these along the comm batch.

Thanks again!
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-07 Thread Namhyung Kim
On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
> On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
>> From: Namhyung Kim 
>> 
>> Current collapse stage has a scalability problem which can be
>> reproduced easily with parallel kernel build.  This is because it
>> needs to traverse every children of callchain linearly during the
>> collapse/merge stage.  Convert it to rbtree reduced the overhead
>> significantly.
>> 
>> On my 400MB perf.data file which recorded with make -j32 kernel build:
>> 
>>   $ time perf --no-pager report --stdio > /dev/null
>> 
>> before:
>>   real   6m22.073s
>>   user   6m18.683s
>>   sys0m0.706s
>> 
>> after:
>>   real   0m20.780s
>>   user   0m19.962s
>>   sys0m0.689s
>> 
>> During the perf report the overhead on append_chain_children went down
>> from 96.69% to 18.16%:
>> 
>>   -  18.16%  perf  perf[.] append_chain_children
>>  - append_chain_children
>> - 77.48% append_chain_children
>>+ 69.79% merge_chain_branch
>>- 22.96% append_chain_children
>>   + 67.44% merge_chain_branch
>>   + 30.15% append_chain_children
>>   + 2.41% callchain_append
>>+ 7.25% callchain_append
>> + 12.26% callchain_append
>> + 10.22% merge_chain_branch
>>   +  11.58%  perf  perf[.] dso__find_symbol
>>   +   8.02%  perf  perf[.] sort__comm_cmp
>>   +   5.48%  perf  libc-2.17.so[.] malloc_consolidate
>> 
>> Reported-by: Linus Torvalds 
>> Cc: Jiri Olsa 
>> Cc: Frederic Weisbecker 
>> Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
>> Signed-off-by: Namhyung Kim 
>
> Have you tested this patchset when collapsing is not used?
> There are fair chances that this patchset does not only improve collapsing
> but also callchain insertion in general. So it's probably a win in any case. 
> But
> still it would be nice to make sure that it's the case because we are getting
> rid of collapsing anyway.
>
> The test that could tell us about that is to run "perf report -s sym" and 
> compare the
> time it takes to complete before and after this patch, because "-s sym" 
> shouldn't
> involve collapses.
>
> Sorting by anything that is not comm should do the trick in fact.

Yes, I have similar result when collapsing is not used.  Actually when I
ran "perf report -s sym", the performance improves higher since it'd
insert more callchains in a hist entry.

Thanks,
Namhyung
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-07 Thread Namhyung Kim
On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote:
 On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
 From: Namhyung Kim namhyung@lge.com
 
 Current collapse stage has a scalability problem which can be
 reproduced easily with parallel kernel build.  This is because it
 needs to traverse every children of callchain linearly during the
 collapse/merge stage.  Convert it to rbtree reduced the overhead
 significantly.
 
 On my 400MB perf.data file which recorded with make -j32 kernel build:
 
   $ time perf --no-pager report --stdio > /dev/null
 
 before:
   real   6m22.073s
   user   6m18.683s
   sys0m0.706s
 
 after:
   real   0m20.780s
   user   0m19.962s
   sys0m0.689s
 
 During the perf report the overhead on append_chain_children went down
 from 96.69% to 18.16%:
 
   -  18.16%  perf  perf[.] append_chain_children
  - append_chain_children
 - 77.48% append_chain_children
+ 69.79% merge_chain_branch
- 22.96% append_chain_children
   + 67.44% merge_chain_branch
   + 30.15% append_chain_children
   + 2.41% callchain_append
+ 7.25% callchain_append
 + 12.26% callchain_append
 + 10.22% merge_chain_branch
   +  11.58%  perf  perf[.] dso__find_symbol
   +   8.02%  perf  perf[.] sort__comm_cmp
   +   5.48%  perf  libc-2.17.so[.] malloc_consolidate
 
 Reported-by: Linus Torvalds torva...@linux-foundation.org
 Cc: Jiri Olsa jo...@redhat.com
 Cc: Frederic Weisbecker fweis...@gmail.com
 Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
 Signed-off-by: Namhyung Kim namhy...@kernel.org

 Have you tested this patchset when collapsing is not used?
 There are fair chances that this patchset does not only improve collapsing
 but also callchain insertion in general. So it's probably a win in any case. 
 But
 still it would be nice to make sure that it's the case because we are getting
 rid of collapsing anyway.

 The test that could tell us about that is to run perf report -s sym and 
 compare the
 time it takes to complete before and after this patch, because -s sym 
 shouldn't
 involve collapses.

 Sorting by anything that is not comm should do the trick in fact.

Yes, I have similar result when collapsing is not used.  Actually when I
ran perf report -s sym, the performance improves higher since it'd
insert more callchains in a hist entry.

Thanks,
Namhyung
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-02 Thread Frederic Weisbecker
On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
> From: Namhyung Kim 
> 
> Current collapse stage has a scalability problem which can be
> reproduced easily with parallel kernel build.  This is because it
> needs to traverse every children of callchain linearly during the
> collapse/merge stage.  Convert it to rbtree reduced the overhead
> significantly.
> 
> On my 400MB perf.data file which recorded with make -j32 kernel build:
> 
>   $ time perf --no-pager report --stdio > /dev/null
> 
> before:
>   real6m22.073s
>   user6m18.683s
>   sys 0m0.706s
> 
> after:
>   real0m20.780s
>   user0m19.962s
>   sys 0m0.689s
> 
> During the perf report the overhead on append_chain_children went down
> from 96.69% to 18.16%:
> 
>   -  18.16%  perf  perf[.] append_chain_children
>  - append_chain_children
> - 77.48% append_chain_children
>+ 69.79% merge_chain_branch
>- 22.96% append_chain_children
>   + 67.44% merge_chain_branch
>   + 30.15% append_chain_children
>   + 2.41% callchain_append
>+ 7.25% callchain_append
> + 12.26% callchain_append
> + 10.22% merge_chain_branch
>   +  11.58%  perf  perf[.] dso__find_symbol
>   +   8.02%  perf  perf[.] sort__comm_cmp
>   +   5.48%  perf  libc-2.17.so[.] malloc_consolidate
> 
> Reported-by: Linus Torvalds 
> Cc: Jiri Olsa 
> Cc: Frederic Weisbecker 
> Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
> Signed-off-by: Namhyung Kim 

Have you tested this patchset when collapsing is not used?
There are fair chances that this patchset does not only improve collapsing
but also callchain insertion in general. So it's probably a win in any case. But
still it would be nice to make sure that it's the case because we are getting
rid of collapsing anyway.

The test that could tell us about that is to run "perf report -s sym" and 
compare the
time it takes to complete before and after this patch, because "-s sym" 
shouldn't
involve collapses.

Sorting by anything that is not comm should do the trick in fact.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/8] perf callchain: Convert children list to rbtree

2013-10-02 Thread Frederic Weisbecker
On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote:
 From: Namhyung Kim namhyung@lge.com
 
 Current collapse stage has a scalability problem which can be
 reproduced easily with parallel kernel build.  This is because it
 needs to traverse every children of callchain linearly during the
 collapse/merge stage.  Convert it to rbtree reduced the overhead
 significantly.
 
 On my 400MB perf.data file which recorded with make -j32 kernel build:
 
   $ time perf --no-pager report --stdio > /dev/null
 
 before:
   real6m22.073s
   user6m18.683s
   sys 0m0.706s
 
 after:
   real0m20.780s
   user0m19.962s
   sys 0m0.689s
 
 During the perf report the overhead on append_chain_children went down
 from 96.69% to 18.16%:
 
   -  18.16%  perf  perf[.] append_chain_children
  - append_chain_children
 - 77.48% append_chain_children
+ 69.79% merge_chain_branch
- 22.96% append_chain_children
   + 67.44% merge_chain_branch
   + 30.15% append_chain_children
   + 2.41% callchain_append
+ 7.25% callchain_append
 + 12.26% callchain_append
 + 10.22% merge_chain_branch
   +  11.58%  perf  perf[.] dso__find_symbol
   +   8.02%  perf  perf[.] sort__comm_cmp
   +   5.48%  perf  libc-2.17.so[.] malloc_consolidate
 
 Reported-by: Linus Torvalds torva...@linux-foundation.org
 Cc: Jiri Olsa jo...@redhat.com
 Cc: Frederic Weisbecker fweis...@gmail.com
 Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
 Signed-off-by: Namhyung Kim namhy...@kernel.org

Have you tested this patchset when collapsing is not used?
There are fair chances that this patchset does not only improve collapsing
but also callchain insertion in general. So it's probably a win in any case. But
still it would be nice to make sure that it's the case because we are getting
rid of collapsing anyway.

The test that could tell us about that is to run perf report -s sym and 
compare the
time it takes to complete before and after this patch, because -s sym 
shouldn't
involve collapses.

Sorting by anything that is not comm should do the trick in fact.

Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/8] perf callchain: Convert children list to rbtree

2013-09-26 Thread Namhyung Kim
From: Namhyung Kim 

Current collapse stage has a scalability problem which can be
reproduced easily with parallel kernel build.  This is because it
needs to traverse every children of callchain linearly during the
collapse/merge stage.  Convert it to rbtree reduced the overhead
significantly.

On my 400MB perf.data file which recorded with make -j32 kernel build:

  $ time perf --no-pager report --stdio > /dev/null

before:
  real  6m22.073s
  user  6m18.683s
  sys   0m0.706s

after:
  real  0m20.780s
  user  0m19.962s
  sys   0m0.689s

During the perf report the overhead on append_chain_children went down
from 96.69% to 18.16%:

  -  18.16%  perf  perf[.] append_chain_children
 - append_chain_children
- 77.48% append_chain_children
   + 69.79% merge_chain_branch
   - 22.96% append_chain_children
  + 67.44% merge_chain_branch
  + 30.15% append_chain_children
  + 2.41% callchain_append
   + 7.25% callchain_append
+ 12.26% callchain_append
+ 10.22% merge_chain_branch
  +  11.58%  perf  perf[.] dso__find_symbol
  +   8.02%  perf  perf[.] sort__comm_cmp
  +   5.48%  perf  libc-2.17.so[.] malloc_consolidate

Reported-by: Linus Torvalds 
Cc: Jiri Olsa 
Cc: Frederic Weisbecker 
Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
Signed-off-by: Namhyung Kim 
---
 tools/perf/util/callchain.c | 147 +---
 tools/perf/util/callchain.h |  11 ++--
 2 files changed, 116 insertions(+), 42 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 482f68081cd8..e3970e3eaacf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -21,12 +21,6 @@
 
 __thread struct callchain_cursor callchain_cursor;
 
-#define chain_for_each_child(child, parent)\
-   list_for_each_entry(child, &parent->children, siblings)
-
-#define chain_for_each_child_safe(child, next, parent) \
-   list_for_each_entry_safe(child, next, &parent->children, siblings)
-
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
enum chain_mode mode)
@@ -71,10 +65,16 @@ static void
 __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
  u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
-   chain_for_each_child(child, node)
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_flat(rb_root, child, min_hit);
+   }
 
if (node->hit && node->hit >= min_hit)
rb_insert_callchain(rb_root, node, CHAIN_FLAT);
@@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct 
callchain_root *root,
 static void __sort_chain_graph_abs(struct callchain_node *node,
   u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
node->rb_root = RB_ROOT;
+   n = rb_first(&node->rb_root_in);
+
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
 
-   chain_for_each_child(child, node) {
__sort_chain_graph_abs(child, min_hit);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct 
callchain_root *chain_root,
 static void __sort_chain_graph_rel(struct callchain_node *node,
   double min_percent)
 {
+   struct rb_node *n;
struct callchain_node *child;
u64 min_hit;
 
node->rb_root = RB_ROOT;
min_hit = ceil(node->children_hit * min_percent);
 
-   chain_for_each_child(child, node) {
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_graph_rel(child, min_percent);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool 
inherit_children)
return NULL;
}
new->parent = parent;
-   INIT_LIST_HEAD(&new->children);
    INIT_LIST_HEAD(&new->val);
 
if (inherit_children) {
-   struct callchain_node *next;
+   struct rb_node *n;
+   struct callchain_node *child;
+
+   new->rb_root_in = parent->rb_root_in;
+   parent->rb_root_in = RB_ROOT;
 
-   list_splice(&parent->children, &new->children);
-   INIT_LIST_HEAD(&parent->children);
+   n = rb_first(&new->rb_root_in);
+   while (n) {
+   child = 

[PATCH 1/8] perf callchain: Convert children list to rbtree

2013-09-26 Thread Namhyung Kim
From: Namhyung Kim <namhyung@lge.com>

Current collapse stage has a scalability problem which can be
reproduced easily with parallel kernel build.  This is because it
needs to traverse every children of callchain linearly during the
collapse/merge stage.  Convert it to rbtree reduced the overhead
significantly.

On my 400MB perf.data file which recorded with make -j32 kernel build:

  $ time perf --no-pager report --stdio > /dev/null

before:
  real  6m22.073s
  user  6m18.683s
  sys   0m0.706s

after:
  real  0m20.780s
  user  0m19.962s
  sys   0m0.689s

During the perf report the overhead on append_chain_children went down
from 96.69% to 18.16%:

  -  18.16%  perf  perf[.] append_chain_children
 - append_chain_children
- 77.48% append_chain_children
   + 69.79% merge_chain_branch
   - 22.96% append_chain_children
  + 67.44% merge_chain_branch
  + 30.15% append_chain_children
  + 2.41% callchain_append
   + 7.25% callchain_append
+ 12.26% callchain_append
+ 10.22% merge_chain_branch
  +  11.58%  perf  perf[.] dso__find_symbol
  +   8.02%  perf  perf[.] sort__comm_cmp
  +   5.48%  perf  libc-2.17.so[.] malloc_consolidate

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/callchain.c | 147 +---
 tools/perf/util/callchain.h |  11 ++--
 2 files changed, 116 insertions(+), 42 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 482f68081cd8..e3970e3eaacf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -21,12 +21,6 @@
 
 __thread struct callchain_cursor callchain_cursor;
 
-#define chain_for_each_child(child, parent)\
-   list_for_each_entry(child, &parent->children, siblings)
-
-#define chain_for_each_child_safe(child, next, parent) \
-   list_for_each_entry_safe(child, next, &parent->children, siblings)
-
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
enum chain_mode mode)
@@ -71,10 +65,16 @@ static void
 __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
  u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
-   chain_for_each_child(child, node)
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_flat(rb_root, child, min_hit);
+   }

if (node->hit && node->hit >= min_hit)
rb_insert_callchain(rb_root, node, CHAIN_FLAT);
@@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct 
callchain_root *root,
 static void __sort_chain_graph_abs(struct callchain_node *node,
   u64 min_hit)
 {
+   struct rb_node *n;
struct callchain_node *child;
 
node->rb_root = RB_ROOT;
+   n = rb_first(&node->rb_root_in);
+
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);

-   chain_for_each_child(child, node) {
__sort_chain_graph_abs(child, min_hit);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct 
callchain_root *chain_root,
 static void __sort_chain_graph_rel(struct callchain_node *node,
   double min_percent)
 {
+   struct rb_node *n;
struct callchain_node *child;
u64 min_hit;
 
node->rb_root = RB_ROOT;
min_hit = ceil(node->children_hit * min_percent);

-   chain_for_each_child(child, node) {
+   n = rb_first(&node->rb_root_in);
+   while (n) {
+   child = rb_entry(n, struct callchain_node, rb_node_in);
+   n = rb_next(n);
+
__sort_chain_graph_rel(child, min_percent);
if (callchain_cumul_hits(child) >= min_hit)
rb_insert_callchain(&node->rb_root, child,
@@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool 
inherit_children)
return NULL;
}
new->parent = parent;
-   INIT_LIST_HEAD(&new->children);
INIT_LIST_HEAD(&new->val);

if (inherit_children) {
-   struct callchain_node *next;
+   struct rb_node *n;
+   struct callchain_node *child;
+
+   new->rb_root_in = parent->rb_root_in;
+   parent->rb_root_in = RB_ROOT;

-   list_splice(&parent->children, &new->children);
-