From: Josh Poimboeuf <jpoim...@kernel.org>

Make unwind_deferred_request() NMI-safe so tracers in NMI context can
call it and safely request a user space stacktrace when the task exits.

An "nmi_timestamp" field is added to unwind_task_info. NMIs write to it
instead of info->timestamp so that they do not race with the setting of
info->timestamp.

Signed-off-by: Josh Poimboeuf <jpoim...@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rost...@goodmis.org>
---
 include/linux/unwind_deferred_types.h |  1 +
 kernel/unwind/deferred.c              | 91 ++++++++++++++++++++++++---
 2 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 5df264cf81ad..ae27a02234b8 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -11,6 +11,7 @@ struct unwind_task_info {
        struct unwind_cache     *cache;
        struct callback_head    work;
        u64                     timestamp;
+       u64                     nmi_timestamp;
        int                     pending;
 };
 
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index b76c704ddc6d..238cd97079ec 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -25,8 +25,27 @@ static u64 get_timestamp(struct unwind_task_info *info)
 {
        lockdep_assert_irqs_disabled();
 
-       if (!info->timestamp)
-               info->timestamp = local_clock();
+       /*
+        * Note, the timestamp is generated on the first request.
+        * If it exists here, then the timestamp is earlier than
+        * this request and it means that this request will be
+        * valid for the stacktrace.
+        */
+       if (!info->timestamp) {
+               WRITE_ONCE(info->timestamp, local_clock());
+               barrier();
+               /*
+                * If an NMI came in and set a timestamp, it means that
+                * it happened before this timestamp was set (otherwise
+                * the NMI would have used this one). Use the NMI timestamp
+                * instead.
+                */
+               if (unlikely(info->nmi_timestamp)) {
+                       WRITE_ONCE(info->timestamp, info->nmi_timestamp);
+                       barrier();
+                       WRITE_ONCE(info->nmi_timestamp, 0);
+               }
+       }
 
        return info->timestamp;
 }
@@ -103,6 +122,13 @@ static void unwind_deferred_task_work(struct callback_head *head)
 
        unwind_deferred_trace(&trace);
 
+       /* Check if the timestamp was only set by NMI */
+       if (info->nmi_timestamp) {
+               WRITE_ONCE(info->timestamp, info->nmi_timestamp);
+               barrier();
+               WRITE_ONCE(info->nmi_timestamp, 0);
+       }
+
        timestamp = info->timestamp;
 
        guard(mutex)(&callback_mutex);
@@ -111,6 +137,48 @@ static void unwind_deferred_task_work(struct callback_head *head)
        }
 }
 
+static int unwind_deferred_request_nmi(struct unwind_work *work, u64 *timestamp)
+{
+       struct unwind_task_info *info = &current->unwind_info;
+       bool inited_timestamp = false;
+       int ret;
+
+       /* Always use the nmi_timestamp first */
+       *timestamp = info->nmi_timestamp ? : info->timestamp;
+
+       if (!*timestamp) {
+               /*
+                * This is the first unwind request since the most recent entry
+                * from user space. Initialize the task timestamp.
+                *
+                * Don't write to info->timestamp directly, otherwise it may race
+                * with an interruption of get_timestamp().
+                */
+               info->nmi_timestamp = local_clock();
+               *timestamp = info->nmi_timestamp;
+               inited_timestamp = true;
+       }
+
+       if (info->pending)
+               return 1;
+
+       ret = task_work_add(current, &info->work, TWA_NMI_CURRENT);
+       if (ret) {
+               /*
+                * If this set nmi_timestamp and is not using it,
+                * there's no guarantee that it will be used.
+                * Set it back to zero.
+                */
+               if (inited_timestamp)
+                       info->nmi_timestamp = 0;
+               return ret;
+       }
+
+       info->pending = 1;
+
+       return 0;
+}
+
 /**
  * unwind_deferred_request - Request a user stacktrace on task exit
  * @work: Unwind descriptor requesting the trace
@@ -139,31 +207,38 @@ static void unwind_deferred_task_work(struct callback_head *head)
 int unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
 {
        struct unwind_task_info *info = &current->unwind_info;
+       int pending;
        int ret;
 
        *timestamp = 0;
 
-       if (WARN_ON_ONCE(in_nmi()))
-               return -EINVAL;
-
        if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
            !user_mode(task_pt_regs(current)))
                return -EINVAL;
 
+       if (in_nmi())
+               return unwind_deferred_request_nmi(work, timestamp);
+
        guard(irqsave)();
 
        *timestamp = get_timestamp(info);
 
        /* callback already pending? */
-       if (info->pending)
+       pending = READ_ONCE(info->pending);
+       if (pending)
+               return 1;
+
+       /* Claim the work unless an NMI just now swooped in to do so. */
+       if (!try_cmpxchg(&info->pending, &pending, 1))
                return 1;
 
        /* The work has been claimed, now schedule it. */
        ret = task_work_add(current, &info->work, TWA_RESUME);
-       if (WARN_ON_ONCE(ret))
+       if (WARN_ON_ONCE(ret)) {
+               WRITE_ONCE(info->pending, 0);
                return ret;
+       }
 
-       info->pending = 1;
        return 0;
 }
 
-- 
2.47.2



Reply via email to