[PATCH 1/8] perf callchain: Convert children list to rbtree
From: Namhyung Kim Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio > /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] 
malloc_consolidate Reported-by: Linus Torvalds Cc: Jiri Olsa Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim --- tools/perf/util/callchain.c | 147 +--- tools/perf/util/callchain.h | 11 ++-- 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 482f68081cd8..e3970e3eaacf 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -21,12 +21,6 @@ __thread struct callchain_cursor callchain_cursor; -#define chain_for_each_child(child, parent)\ - list_for_each_entry(child, &parent->children, siblings) - -#define chain_for_each_child_safe(child, next, parent) \ - list_for_each_entry_safe(child, next, &parent->children, siblings) - static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, enum chain_mode mode) @@ -71,10 +65,16 @@ static void __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; - chain_for_each_child(child, node) + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_flat(rb_root, child, min_hit); + } if (node->hit && node->hit >= min_hit) rb_insert_callchain(rb_root, node, CHAIN_FLAT); @@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, static void __sort_chain_graph_abs(struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; node->rb_root = RB_ROOT; + n = rb_first(&node->rb_root_in); + + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); - chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root, static void 
__sort_chain_graph_rel(struct callchain_node *node, double min_percent) { + struct rb_node *n; struct callchain_node *child; u64 min_hit; node->rb_root = RB_ROOT; min_hit = ceil(node->children_hit * min_percent); - chain_for_each_child(child, node) { + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_graph_rel(child, min_percent); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool inherit_children) return NULL; } new->parent = parent; - INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->val); if (inherit_children) { - struct callchain_node *next; + struct rb_node *n; + struct callchain_node *child; + + new->rb_root_in = parent->rb_root_in; + parent->rb_root_in = RB_ROOT; - list_splice(&parent->children, &new->children); - INIT_LIST_HEAD(&parent->children); + n = rb_first(&new->rb_root_in); + while (n) { + child =
[PATCH 1/8] perf callchain: Convert children list to rbtree
From: Namhyung Kim namhyung@lge.com Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] 
malloc_consolidate Reported-by: Linus Torvalds torva...@linux-foundation.org Cc: Jiri Olsa jo...@redhat.com Cc: Frederic Weisbecker fweis...@gmail.com Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim namhy...@kernel.org --- tools/perf/util/callchain.c | 147 +--- tools/perf/util/callchain.h | 11 ++-- 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 482f68081cd8..e3970e3eaacf 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -21,12 +21,6 @@ __thread struct callchain_cursor callchain_cursor; -#define chain_for_each_child(child, parent)\ - list_for_each_entry(child, &parent->children, siblings) - -#define chain_for_each_child_safe(child, next, parent) \ - list_for_each_entry_safe(child, next, &parent->children, siblings) - static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, enum chain_mode mode) @@ -71,10 +65,16 @@ static void __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; - chain_for_each_child(child, node) + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_flat(rb_root, child, min_hit); + } if (node->hit && node->hit >= min_hit) rb_insert_callchain(rb_root, node, CHAIN_FLAT); @@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, static void __sort_chain_graph_abs(struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; node->rb_root = RB_ROOT; + n = rb_first(&node->rb_root_in); + + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); - chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -117,13 
+122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root, static void __sort_chain_graph_rel(struct callchain_node *node, double min_percent) { + struct rb_node *n; struct callchain_node *child; u64 min_hit; node->rb_root = RB_ROOT; min_hit = ceil(node->children_hit * min_percent); - chain_for_each_child(child, node) { + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_graph_rel(child, min_percent); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool inherit_children) return NULL; } new->parent = parent; - INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->val); if (inherit_children) { - struct callchain_node *next; + struct rb_node *n; + struct callchain_node *child; + + new->rb_root_in = parent->rb_root_in; + parent->rb_root_in = RB_ROOT; - list_splice(&parent->children, &new->children); -
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
Hi Frederic, On Tue, 8 Oct 2013 21:22:45 +0200, Frederic Weisbecker wrote: > On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote: >> On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: >> > Have you tested this patchset when collapsing is not used? >> > There are fair chances that this patchset does not only improve collapsing >> > but also callchain insertion in general. So it's probably a win in any >> > case. But >> > still it would be nice to make sure that it's the case because we are >> > getting >> > rid of collapsing anyway. >> > >> > The test that could tell us about that is to run "perf report -s sym" and >> > compare the >> > time it takes to complete before and after this patch, because "-s sym" >> > shouldn't >> > involve collapses. >> > >> > Sorting by anything that is not comm should do the trick in fact. >> >> Yes, I have similar result when collapsing is not used. Actually when I >> ran "perf report -s sym", the performance improves higher since it'd >> insert more callchains in a hist entry. > > Great! I'll have a closer look and review on the callchain patches then. > Please > resend these along the comm batch. I'll do it later today or tomorrow. Thanks, Namhyung -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
Hi Frederic, On Tue, 8 Oct 2013 21:22:45 +0200, Frederic Weisbecker wrote: On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote: On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: Have you tested this patchset when collapsing is not used? There are fair chances that this patchset does not only improve collapsing but also callchain insertion in general. So it's probably a win in any case. But still it would be nice to make sure that it's the case because we are getting rid of collapsing anyway. The test that could tell us about that is to run perf report -s sym and compare the time it takes to complete before and after this patch, because -s sym shouldn't involve collapses. Sorting by anything that is not comm should do the trick in fact. Yes, I have similar result when collapsing is not used. Actually when I ran perf report -s sym, the performance improves higher since it'd insert more callchains in a hist entry. Great! I'll have a closer look and review on the callchain patches then. Please resend these along the comm batch. I'll do it later today or tomorrow. Thanks, Namhyung -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote: > On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: > > On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: > >> From: Namhyung Kim > >> > >> Current collapse stage has a scalability problem which can be > >> reproduced easily with parallel kernel build. This is because it > >> needs to traverse every children of callchain linearly during the > >> collapse/merge stage. Convert it to rbtree reduced the overhead > >> significantly. > >> > >> On my 400MB perf.data file which recorded with make -j32 kernel build: > >> > >> $ time perf --no-pager report --stdio > /dev/null > >> > >> before: > >> real 6m22.073s > >> user 6m18.683s > >> sys 0m0.706s > >> > >> after: > >> real 0m20.780s > >> user 0m19.962s > >> sys 0m0.689s > >> > >> During the perf report the overhead on append_chain_children went down > >> from 96.69% to 18.16%: > >> > >> - 18.16% perf perf[.] append_chain_children > >> - append_chain_children > >> - 77.48% append_chain_children > >>+ 69.79% merge_chain_branch > >>- 22.96% append_chain_children > >> + 67.44% merge_chain_branch > >> + 30.15% append_chain_children > >> + 2.41% callchain_append > >>+ 7.25% callchain_append > >> + 12.26% callchain_append > >> + 10.22% merge_chain_branch > >> + 11.58% perf perf[.] dso__find_symbol > >> + 8.02% perf perf[.] sort__comm_cmp > >> + 5.48% perf libc-2.17.so[.] malloc_consolidate > >> > >> Reported-by: Linus Torvalds > >> Cc: Jiri Olsa > >> Cc: Frederic Weisbecker > >> Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org > >> Signed-off-by: Namhyung Kim > > > > Have you tested this patchset when collapsing is not used? > > There are fair chances that this patchset does not only improve collapsing > > but also callchain insertion in general. So it's probably a win in any > > case. But > > still it would be nice to make sure that it's the case because we are > > getting > > rid of collapsing anyway. 
> > > > The test that could tell us about that is to run "perf report -s sym" and > > compare the > > time it takes to complete before and after this patch, because "-s sym" > > shouldn't > > involve collapses. > > > > Sorting by anything that is not comm should do the trick in fact. > > Yes, I have similar result when collapsing is not used. Actually when I > ran "perf report -s sym", the performance improves higher since it'd > insert more callchains in a hist entry. Great! I'll have a closer look and review on the callchain patches then. Please resend these along the comm batch. Thanks again! -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Tue, Oct 08, 2013 at 11:03:16AM +0900, Namhyung Kim wrote: On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: From: Namhyung Kim namhyung@lge.com Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] malloc_consolidate Reported-by: Linus Torvalds torva...@linux-foundation.org Cc: Jiri Olsa jo...@redhat.com Cc: Frederic Weisbecker fweis...@gmail.com Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim namhy...@kernel.org Have you tested this patchset when collapsing is not used? There are fair chances that this patchset does not only improve collapsing but also callchain insertion in general. So it's probably a win in any case. But still it would be nice to make sure that it's the case because we are getting rid of collapsing anyway. 
The test that could tell us about that is to run perf report -s sym and compare the time it takes to complete before and after this patch, because -s sym shouldn't involve collapses. Sorting by anything that is not comm should do the trick in fact. Yes, I have similar result when collapsing is not used. Actually when I ran perf report -s sym, the performance improves higher since it'd insert more callchains in a hist entry. Great! I'll have a closer look and review on the callchain patches then. Please resend these along the comm batch. Thanks again! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: > On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: >> From: Namhyung Kim >> >> Current collapse stage has a scalability problem which can be >> reproduced easily with parallel kernel build. This is because it >> needs to traverse every children of callchain linearly during the >> collapse/merge stage. Convert it to rbtree reduced the overhead >> significantly. >> >> On my 400MB perf.data file which recorded with make -j32 kernel build: >> >> $ time perf --no-pager report --stdio > /dev/null >> >> before: >> real 6m22.073s >> user 6m18.683s >> sys0m0.706s >> >> after: >> real 0m20.780s >> user 0m19.962s >> sys0m0.689s >> >> During the perf report the overhead on append_chain_children went down >> from 96.69% to 18.16%: >> >> - 18.16% perf perf[.] append_chain_children >> - append_chain_children >> - 77.48% append_chain_children >>+ 69.79% merge_chain_branch >>- 22.96% append_chain_children >> + 67.44% merge_chain_branch >> + 30.15% append_chain_children >> + 2.41% callchain_append >>+ 7.25% callchain_append >> + 12.26% callchain_append >> + 10.22% merge_chain_branch >> + 11.58% perf perf[.] dso__find_symbol >> + 8.02% perf perf[.] sort__comm_cmp >> + 5.48% perf libc-2.17.so[.] malloc_consolidate >> >> Reported-by: Linus Torvalds >> Cc: Jiri Olsa >> Cc: Frederic Weisbecker >> Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org >> Signed-off-by: Namhyung Kim > > Have you tested this patchset when collapsing is not used? > There are fair chances that this patchset does not only improve collapsing > but also callchain insertion in general. So it's probably a win in any case. > But > still it would be nice to make sure that it's the case because we are getting > rid of collapsing anyway. 
> > The test that could tell us about that is to run "perf report -s sym" and > compare the > time it takes to complete before and after this patch, because "-s sym" > shouldn't > involve collapses. > > Sorting by anything that is not comm should do the trick in fact. Yes, I have similar result when collapsing is not used. Actually when I ran "perf report -s sym", the performance improves higher since it'd insert more callchains in a hist entry. Thanks, Namhyung -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Wed, 2 Oct 2013 12:18:28 +0200, Frederic Weisbecker wrote: On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: From: Namhyung Kim namhyung@lge.com Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio /dev/null before: real 6m22.073s user 6m18.683s sys0m0.706s after: real 0m20.780s user 0m19.962s sys0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] malloc_consolidate Reported-by: Linus Torvalds torva...@linux-foundation.org Cc: Jiri Olsa jo...@redhat.com Cc: Frederic Weisbecker fweis...@gmail.com Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim namhy...@kernel.org Have you tested this patchset when collapsing is not used? There are fair chances that this patchset does not only improve collapsing but also callchain insertion in general. So it's probably a win in any case. But still it would be nice to make sure that it's the case because we are getting rid of collapsing anyway. The test that could tell us about that is to run perf report -s sym and compare the time it takes to complete before and after this patch, because -s sym shouldn't involve collapses. 
Sorting by anything that is not comm should do the trick in fact. Yes, I have similar result when collapsing is not used. Actually when I ran perf report -s sym, the performance improves higher since it'd insert more callchains in a hist entry. Thanks, Namhyung -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: > From: Namhyung Kim > > Current collapse stage has a scalability problem which can be > reproduced easily with parallel kernel build. This is because it > needs to traverse every children of callchain linearly during the > collapse/merge stage. Convert it to rbtree reduced the overhead > significantly. > > On my 400MB perf.data file which recorded with make -j32 kernel build: > > $ time perf --no-pager report --stdio > /dev/null > > before: > real6m22.073s > user6m18.683s > sys 0m0.706s > > after: > real0m20.780s > user0m19.962s > sys 0m0.689s > > During the perf report the overhead on append_chain_children went down > from 96.69% to 18.16%: > > - 18.16% perf perf[.] append_chain_children > - append_chain_children > - 77.48% append_chain_children >+ 69.79% merge_chain_branch >- 22.96% append_chain_children > + 67.44% merge_chain_branch > + 30.15% append_chain_children > + 2.41% callchain_append >+ 7.25% callchain_append > + 12.26% callchain_append > + 10.22% merge_chain_branch > + 11.58% perf perf[.] dso__find_symbol > + 8.02% perf perf[.] sort__comm_cmp > + 5.48% perf libc-2.17.so[.] malloc_consolidate > > Reported-by: Linus Torvalds > Cc: Jiri Olsa > Cc: Frederic Weisbecker > Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org > Signed-off-by: Namhyung Kim Have you tested this patchset when collapsing is not used? There are fair chances that this patchset does not only improve collapsing but also callchain insertion in general. So it's probably a win in any case. But still it would be nice to make sure that it's the case because we are getting rid of collapsing anyway. The test that could tell us about that is to run "perf report -s sym" and compare the time it takes to complete before and after this patch, because "-s sym" shouldn't involve collapses. Sorting by anything that is not comm should do the trick in fact. Thanks. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/8] perf callchain: Convert children list to rbtree
On Thu, Sep 26, 2013 at 05:58:03PM +0900, Namhyung Kim wrote: From: Namhyung Kim namhyung@lge.com Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio /dev/null before: real6m22.073s user6m18.683s sys 0m0.706s after: real0m20.780s user0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] malloc_consolidate Reported-by: Linus Torvalds torva...@linux-foundation.org Cc: Jiri Olsa jo...@redhat.com Cc: Frederic Weisbecker fweis...@gmail.com Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim namhy...@kernel.org Have you tested this patchset when collapsing is not used? There are fair chances that this patchset does not only improve collapsing but also callchain insertion in general. So it's probably a win in any case. But still it would be nice to make sure that it's the case because we are getting rid of collapsing anyway. The test that could tell us about that is to run perf report -s sym and compare the time it takes to complete before and after this patch, because -s sym shouldn't involve collapses. Sorting by anything that is not comm should do the trick in fact. Thanks. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/8] perf callchain: Convert children list to rbtree
From: Namhyung Kim Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio > /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] 
malloc_consolidate Reported-by: Linus Torvalds Cc: Jiri Olsa Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim --- tools/perf/util/callchain.c | 147 +--- tools/perf/util/callchain.h | 11 ++-- 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 482f68081cd8..e3970e3eaacf 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -21,12 +21,6 @@ __thread struct callchain_cursor callchain_cursor; -#define chain_for_each_child(child, parent)\ - list_for_each_entry(child, &parent->children, siblings) - -#define chain_for_each_child_safe(child, next, parent) \ - list_for_each_entry_safe(child, next, &parent->children, siblings) - static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, enum chain_mode mode) @@ -71,10 +65,16 @@ static void __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; - chain_for_each_child(child, node) + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_flat(rb_root, child, min_hit); + } if (node->hit && node->hit >= min_hit) rb_insert_callchain(rb_root, node, CHAIN_FLAT); @@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, static void __sort_chain_graph_abs(struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; node->rb_root = RB_ROOT; + n = rb_first(&node->rb_root_in); + + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); - chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root, static void 
__sort_chain_graph_rel(struct callchain_node *node, double min_percent) { + struct rb_node *n; struct callchain_node *child; u64 min_hit; node->rb_root = RB_ROOT; min_hit = ceil(node->children_hit * min_percent); - chain_for_each_child(child, node) { + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_graph_rel(child, min_percent); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool inherit_children) return NULL; } new->parent = parent; - INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->val); if (inherit_children) { - struct callchain_node *next; + struct rb_node *n; + struct callchain_node *child; + + new->rb_root_in = parent->rb_root_in; + parent->rb_root_in = RB_ROOT; - list_splice(&parent->children, &new->children); - INIT_LIST_HEAD(&parent->children); + n = rb_first(&new->rb_root_in); + while (n) { + child =
[PATCH 1/8] perf callchain: Convert children list to rbtree
From: Namhyung Kim namhyung@lge.com Current collapse stage has a scalability problem which can be reproduced easily with parallel kernel build. This is because it needs to traverse every children of callchain linearly during the collapse/merge stage. Convert it to rbtree reduced the overhead significantly. On my 400MB perf.data file which recorded with make -j32 kernel build: $ time perf --no-pager report --stdio /dev/null before: real 6m22.073s user 6m18.683s sys 0m0.706s after: real 0m20.780s user 0m19.962s sys 0m0.689s During the perf report the overhead on append_chain_children went down from 96.69% to 18.16%: - 18.16% perf perf[.] append_chain_children - append_chain_children - 77.48% append_chain_children + 69.79% merge_chain_branch - 22.96% append_chain_children + 67.44% merge_chain_branch + 30.15% append_chain_children + 2.41% callchain_append + 7.25% callchain_append + 12.26% callchain_append + 10.22% merge_chain_branch + 11.58% perf perf[.] dso__find_symbol + 8.02% perf perf[.] sort__comm_cmp + 5.48% perf libc-2.17.so[.] 
malloc_consolidate Reported-by: Linus Torvalds torva...@linux-foundation.org Cc: Jiri Olsa jo...@redhat.com Cc: Frederic Weisbecker fweis...@gmail.com Link: http://lkml.kernel.org/n/tip-d9tcfow6stbrp4btvgs51...@git.kernel.org Signed-off-by: Namhyung Kim namhy...@kernel.org --- tools/perf/util/callchain.c | 147 +--- tools/perf/util/callchain.h | 11 ++-- 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 482f68081cd8..e3970e3eaacf 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -21,12 +21,6 @@ __thread struct callchain_cursor callchain_cursor; -#define chain_for_each_child(child, parent)\ - list_for_each_entry(child, &parent->children, siblings) - -#define chain_for_each_child_safe(child, next, parent) \ - list_for_each_entry_safe(child, next, &parent->children, siblings) - static void rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, enum chain_mode mode) @@ -71,10 +65,16 @@ static void __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; - chain_for_each_child(child, node) + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_flat(rb_root, child, min_hit); + } if (node->hit && node->hit >= min_hit) rb_insert_callchain(rb_root, node, CHAIN_FLAT); @@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, static void __sort_chain_graph_abs(struct callchain_node *node, u64 min_hit) { + struct rb_node *n; struct callchain_node *child; node->rb_root = RB_ROOT; + n = rb_first(&node->rb_root_in); + + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); - chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -117,13 
+122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root, static void __sort_chain_graph_rel(struct callchain_node *node, double min_percent) { + struct rb_node *n; struct callchain_node *child; u64 min_hit; node->rb_root = RB_ROOT; min_hit = ceil(node->children_hit * min_percent); - chain_for_each_child(child, node) { + n = rb_first(&node->rb_root_in); + while (n) { + child = rb_entry(n, struct callchain_node, rb_node_in); + n = rb_next(n); + __sort_chain_graph_rel(child, min_percent); if (callchain_cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, @@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool inherit_children) return NULL; } new->parent = parent; - INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->val); if (inherit_children) { - struct callchain_node *next; + struct rb_node *n; + struct callchain_node *child; + + new->rb_root_in = parent->rb_root_in; + parent->rb_root_in = RB_ROOT; - list_splice(&parent->children, &new->children); -