If TWA_NMI_CURRENT task work is queued from an NMI triggered while running in __schedule() with IRQs disabled, task_work_set_notify_irq() ends up inadvertently running on the next scheduled task. So the original task doesn't get its TIF_NOTIFY_RESUME flag set and the task work may get delayed indefinitely, or may not get to run at all.
  __schedule() // disable irqs
    <NMI>
      task_work_add(current, work, TWA_NMI_CURRENT);
    </NMI>
    // current = next;
    // enable irqs
    <IRQ>
      task_work_set_notify_irq()
        test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); // wrong task!
    </IRQ>
  // original task skips task work on its next return to user (or exit!)

Fix it by storing the task pointer along with the irq_work struct and
passing that task to set_notify_resume().

Fixes: 466e4d801cd4 ("task_work: Add TWA_NMI_CURRENT as an additional notify mode.")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 kernel/task_work.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 92024a8bfe12..f17447f69843 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -7,12 +7,23 @@
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
 #ifdef CONFIG_IRQ_WORK
+
+struct nmi_irq_work {
+	struct irq_work work;
+	struct task_struct *task;
+};
+
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-	test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+	struct nmi_irq_work *work = container_of(entry, struct nmi_irq_work, work);
+
+	set_notify_resume(work->task);
 }
-static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
-	IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
+
+static DEFINE_PER_CPU(struct nmi_irq_work, nmi_irq_work) = {
+	.work = IRQ_WORK_INIT_HARD(task_work_set_notify_irq),
+};
+
 #endif
 
 /**
@@ -65,15 +76,21 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 		if (!IS_ENABLED(CONFIG_IRQ_WORK))
 			return -EINVAL;
 #ifdef CONFIG_IRQ_WORK
+{
+	struct nmi_irq_work *irq_work = this_cpu_ptr(&nmi_irq_work);
+
 	head = task->task_works;
 	if (unlikely(head == &work_exited))
 		return -ESRCH;
 
-	if (!irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)))
+	if (!irq_work_queue(&irq_work->work))
 		return -EBUSY;
 
+	irq_work->task = current;
+
 	work->next = head;
 	task->task_works = work;
+}
 #endif
 	return 0;
 }
@@ -109,11 +126,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL_NO_IPI:
 		__set_notify_signal(task);
 		break;
-#ifdef CONFIG_IRQ_WORK
-	case TWA_NMI_CURRENT:
-		irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
-		break;
-#endif
 	default:
 		WARN_ON_ONCE(1);
 		break;
-- 
2.48.1