If TWA_NMI_CURRENT task work is queued from an NMI triggered while
running in __schedule() with IRQs disabled, task_work_set_notify_irq()
ends up inadvertently running on the next scheduled task.  So the
original task doesn't get its TIF_NOTIFY_RESUME flag set and the task
work may get delayed indefinitely, or may not get to run at all.

    __schedule()
        // disable irqs
            <NMI>
                task_work_add(current, work, TWA_NMI_CURRENT);
            </NMI>
        // current = next;
        // enable irqs
            <IRQ>
                task_work_set_notify_irq()
                test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); // wrong task!
            </IRQ>
        // original task skips task work on its next return to user (or exit!)

Fix it by storing the task pointer along with the irq_work struct and
passing that task to set_notify_resume().

Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode.")
Signed-off-by: Josh Poimboeuf <jpoim...@kernel.org>
---
 kernel/task_work.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 92024a8bfe12..f17447f69843 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -7,12 +7,23 @@
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
 #ifdef CONFIG_IRQ_WORK
+
+struct nmi_irq_work {
+       struct irq_work work;
+       struct task_struct *task;
+};
+
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-       test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+       struct nmi_irq_work *work = container_of(entry, struct nmi_irq_work, work);
+
+       set_notify_resume(work->task);
 }
-static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
-       IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+
+static DEFINE_PER_CPU(struct nmi_irq_work, nmi_irq_work) = {
+       .work = IRQ_WORK_INIT_HARD(task_work_set_notify_irq),
+};
+
 #endif
 
 /**
@@ -65,15 +76,21 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
                if (!IS_ENABLED(CONFIG_IRQ_WORK))
                        return -EINVAL;
 #ifdef CONFIG_IRQ_WORK
+{
+               struct nmi_irq_work *irq_work = this_cpu_ptr(&nmi_irq_work);
+
                head = task->task_works;
                if (unlikely(head == &work_exited))
                        return -ESRCH;
 
-               if (!irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)))
+               if (!irq_work_queue(&irq_work->work))
                        return -EBUSY;
 
+               irq_work->task = current;
+
                work->next = head;
                task->task_works = work;
+}
 #endif
                return 0;
        }
@@ -109,11 +126,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
        case TWA_SIGNAL_NO_IPI:
                __set_notify_signal(task);
                break;
-#ifdef CONFIG_IRQ_WORK
-       case TWA_NMI_CURRENT:
-               irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
-               break;
-#endif
        default:
                WARN_ON_ONCE(1);
                break;
-- 
2.48.1


Reply via email to