As trinity figured out, there is a recursive get_online_cpus() in
perf_event_open()'s error path:
| Call Trace:
|  dump_stack+0x86/0xce
|  __lock_acquire+0x2520/0x2cd0
|  lock_acquire+0x27c/0x2f0
|  get_online_cpus+0x3d/0x80
|  static_key_slow_dec+0x5a/0x70
|  sw_perf_event_destroy+0x8e/0x100
|  _free_event+0x61b/0x800
|  free_event+0x68/0x70
|  SyS_perf_event_open+0x19db/0x1d80

In order to cure this, I am moving free_event() after the put_online_cpus()
block.
Besides that one, there is also the error path in perf_event_alloc() which
invokes event->destroy. Here I deferred the destroy work to
schedule_work().

Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>
---

I am not quite happy with the schedule_work() part.

 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 52 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..d6a874dbbd21 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -718,6 +718,7 @@ struct perf_event {
 #endif
 
        struct list_head                sb_list;
+       struct work_struct              destroy_work;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7aed78b516fc..3358889609f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9320,6 +9320,16 @@ static void account_event(struct perf_event *event)
        account_pmu_sb_event(event);
 }
 
+static void perf_alloc_destroy_ev(struct work_struct *work)
+{
+       struct perf_event *event;
+
+       event = container_of(work, struct perf_event, destroy_work);
+       event->destroy(event);
+       module_put(event->pmu->module);
+       kfree(event);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -9334,6 +9344,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
+       bool delay_destroy = false;
        long err = -EINVAL;
 
        if ((unsigned)cpu >= nr_cpu_ids) {
@@ -9497,15 +9508,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        exclusive_event_destroy(event);
 
 err_pmu:
-       if (event->destroy)
-               event->destroy(event);
-       module_put(pmu->module);
+       if (event->destroy) {
+               /* delay ->destroy due to nested get_online_cpus() */
+               INIT_WORK(&event->destroy_work, perf_alloc_destroy_ev);
+               delay_destroy = true;
+       } else {
+               module_put(pmu->module);
+       }
 err_ns:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
        if (event->ns)
                put_pid_ns(event->ns);
-       kfree(event);
+       if (delay_destroy)
+               schedule_work(&event->destroy_work);
+       else
+               kfree(event);
 
        return ERR_PTR(err);
 }
@@ -9798,7 +9816,7 @@ SYSCALL_DEFINE5(perf_event_open,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
        struct perf_event *group_leader = NULL, *output_event = NULL;
-       struct perf_event *event, *sibling;
+       struct perf_event *event = NULL, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx, *uninitialized_var(gctx);
        struct file *event_file = NULL;
@@ -9908,13 +9926,14 @@ SYSCALL_DEFINE5(perf_event_open,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
+               event = NULL;
                goto err_cred;
        }
 
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
-                       goto err_alloc;
+                       goto err_cred;
                }
        }
 
@@ -9927,7 +9946,7 @@ SYSCALL_DEFINE5(perf_event_open,
        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
-                       goto err_alloc;
+                       goto err_cred;
        }
 
        if (pmu->task_ctx_nr == perf_sw_context)
@@ -9962,7 +9981,7 @@ SYSCALL_DEFINE5(perf_event_open,
        ctx = find_get_context(pmu, task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
-               goto err_alloc;
+               goto err_cred;
        }
 
        if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
@@ -10186,18 +10205,21 @@ SYSCALL_DEFINE5(perf_event_open,
 err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
-err_alloc:
-       /*
-        * If event_file is set, the fput() above will have called ->release()
-        * and that will take care of freeing the event.
-        */
-       if (!event_file)
-               free_event(event);
 err_cred:
        if (task)
                mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
        put_online_cpus();
+       /*
+        * The event cleanup should happen earlier (as per cleanup in reverse
+        * allocation order). It is delayed after the put_online_cpus() section
+        * so we don't invoke event->destroy in it and risk recursive invocation
+        * of it via static_key_slow_dec().
+        * If event_file is set, the fput() above will have called ->release()
+        * and that will take care of freeing the event.
+        */
+       if (event && !event_file)
+               free_event(event);
 err_task:
        if (task)
                put_task_struct(task);
-- 
2.11.0

Reply via email to