core: Optimize side-band event delivery

tip-bot for Kan Liang Fri, 03 Jun 2016 03:51:07 -0700

Commit-ID:  f2fb6bef92514432398a653df1c2f1041d79ac46
Gitweb:     http://git.kernel.org/tip/f2fb6bef92514432398a653df1c2f1041d79ac46
Author:     Kan Liang <[email protected]>
AuthorDate: Wed, 23 Mar 2016 11:24:37 -0700
Committer:  Ingo Molnar <[email protected]>
CommitDate: Fri, 3 Jun 2016 09:40:15 +0200


perf/core: Optimize side-band event delivery

The perf_event_aux() function iterates all PMUs and all events in
their respective per-CPU contexts to find the events to deliver
side-band records to.

For example, the brk test case in lkp triggers many mmap() operations,
which, if we're also running perf, results in many perf_event_aux()
invocations.

If we enable uncore PMU support (even when uncore events are not used),
dozens of uncore PMUs will be iterated, which can significantly
decrease brk_test's throughput.

For example, the brk throughput:

  without uncore PMUs: 2647573 ops_per_sec
  with    uncore PMUs: 1768444 ops_per_sec

... a 33% reduction.

To get at the per-CPU events that need side-band records, this patch
puts these events on a per-CPU list, this avoids iterating the PMUs
and any events that do not need side-band records.

Per task events are unchanged to avoid extra overhead on the context
switch paths.

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Reported-by: Huang, Ying <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Link: 
http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
 include/linux/perf_event.h |  6 ++++
 kernel/events/core.c       | 85 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0e43355..92e9ce7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -517,6 +517,11 @@ struct swevent_hlist {
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+       raw_spinlock_t          lock;
+       struct list_head        list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -675,6 +680,7 @@ struct perf_event {
        int                             cgrp_defer_enabled;
 #endif
 
+       struct list_head                sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79363f2..6615c89 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -3665,6 +3666,26 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
                               struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_del_rcu(&event->sb_list);
+       raw_spin_unlock(&pel->lock);
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+       if (event->parent)
+               return;
+
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return;
+
+       detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -3728,6 +3749,8 @@ static void unaccount_event(struct perf_event *event)
        }
 
        unaccount_event_cpu(event, event->cpu);
+
+       unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5888,13 +5911,25 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb 
output, void *data,
        rcu_read_unlock();
 }
 
+static void perf_event_sb_iterate(perf_event_aux_output_cb output, void *data)
+{
+       struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+       struct perf_event *event;
+
+       list_for_each_entry_rcu(event, &pel->list, sb_list) {
+               if (event->state < PERF_EVENT_STATE_INACTIVE)
+                       continue;
+               if (!event_filter_match(event))
+                       continue;
+               output(event, data);
+       }
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
               struct perf_event_context *task_ctx)
 {
-       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
        int ctxn;
 
        /*
@@ -5909,20 +5944,15 @@ perf_event_aux(perf_event_aux_output_cb output, void 
*data,
        }
 
        rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
-               ctxn = pmu->task_ctx_nr;
-               if (ctxn < 0)
-                       goto next;
+       preempt_disable();
+       perf_event_sb_iterate(output, data);
+
+       for_each_task_context_nr(ctxn) {
                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                if (ctx)
                        perf_event_aux_ctx(ctx, output, data, false);
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
        }
+       preempt_enable();
        rcu_read_unlock();
 }
 
@@ -8615,6 +8645,32 @@ unlock:
        return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_add_rcu(&event->sb_list, &pel->list);
+       raw_spin_unlock(&pel->lock);
+}
+
+static void account_pmu_sb_event(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+
+       if (event->parent)
+               return;
+
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return;
+
+       if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+           attr->comm || attr->comm_exec ||
+           attr->task ||
+           attr->context_switch)
+               attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -8695,6 +8751,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
        account_event_cpu(event, event->cpu);
+
+       account_pmu_sb_event(event);
 }
 
 /*
@@ -10203,6 +10261,9 @@ static void __init perf_event_init_all_cpus(void)
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+               raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
        }
 }

[tip:perf/core] perf/core: Optimize side-band event delivery

Reply via email to