On Mon, Jul 07, 2025 at 09:22:49PM -0400, Steven Rostedt wrote:

> diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
> index 12bffdb0648e..587e120c0fd6 100644
> --- a/include/linux/unwind_deferred.h
> +++ b/include/linux/unwind_deferred.h
> @@ -18,6 +18,14 @@ struct unwind_work {
>
>  #ifdef CONFIG_UNWIND_USER
>
> +#define UNWIND_PENDING_BIT	(BITS_PER_LONG - 1)
> +#define UNWIND_PENDING		BIT(UNWIND_PENDING_BIT)
Since it really didn't matter what bit you took, why not take bit 0?

> +
> +enum {
> +	UNWIND_ALREADY_PENDING	= 1,
> +	UNWIND_ALREADY_EXECUTED	= 2,
> +};
> +
>  void unwind_task_init(struct task_struct *task);
>  void unwind_task_free(struct task_struct *task);
>
> @@ -29,15 +37,26 @@ void unwind_deferred_cancel(struct unwind_work *work);
>
>  static __always_inline void unwind_reset_info(void)
>  {
> -	if (unlikely(current->unwind_info.id.id))
> +	struct unwind_task_info *info = &current->unwind_info;
> +	unsigned long bits;
> +
> +	/* Was there any unwinding? */
> +	if (unlikely(info->unwind_mask)) {
> +		bits = info->unwind_mask;
> +		do {
> +			/* Is a task_work going to run again before going back */
> +			if (bits & UNWIND_PENDING)
> +				return;
> +		} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
>  		current->unwind_info.id.id = 0;
> +	}
>  	/*
>  	 * As unwind_user_faultable() can be called directly and
>  	 * depends on nr_entries being cleared on exit to user,
>  	 * this needs to be a separate conditional.
>  	 */
> -	if (unlikely(current->unwind_info.cache))
> -		current->unwind_info.cache->nr_entries = 0;
> +	if (unlikely(info->cache))
> +		info->cache->nr_entries = 0;
>  }
>
>  #else /* !CONFIG_UNWIND_USER */

> diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
> index 9aed9866f460..256458f3eafe 100644
> --- a/kernel/unwind/deferred.c
> +++ b/kernel/unwind/deferred.c
> @@ -47,6 +47,11 @@ static LIST_HEAD(callbacks);
>  static unsigned long unwind_mask;
>  DEFINE_STATIC_SRCU(unwind_srcu);
>
> +static inline bool unwind_pending(struct unwind_task_info *info)
> +{
> +	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
> +}
> +
>  /*
>   * This is a unique percpu identifier for a given task entry context.
>   * Conceptually, it's incremented every time the CPU enters the kernel from
> @@ -143,14 +148,17 @@ static void unwind_deferred_task_work(struct callback_head *head)
>  	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
>  	struct unwind_stacktrace trace;
>  	struct unwind_work *work;
> +	unsigned long bits;
>  	u64 cookie;
>  	int idx;
>
> -	if (WARN_ON_ONCE(!local_read(&info->pending)))
> +	if (WARN_ON_ONCE(!unwind_pending(info)))
>  		return;
>
> -	/* Allow work to come in again */
> -	local_set(&info->pending, 0);
> +	/* Clear pending bit but make sure to have the current bits */
> +	bits = READ_ONCE(info->unwind_mask);
> +	while (!try_cmpxchg(&info->unwind_mask, &bits, bits & ~UNWIND_PENDING))
> +		;

We have:

	bits = atomic_long_fetch_andnot(UNWIND_PENDING, &info->unwind_mask);

for that. A fair number of architectures can actually do this better than
a cmpxchg loop.
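Concretely, the loop above is a hand-rolled fetch-andnot. A hypothetical
helper (not part of the patch, and assuming unwind_mask becomes an
atomic_long_t as in the roll-up below) showing the equivalence:

	/* Hypothetical helper, not in the patch: clear PENDING, return old bits */
	static long unwind_clear_pending(atomic_long_t *mask)
	{
		long bits = atomic_long_read(mask);

		/* open-coded atomic_long_fetch_andnot(UNWIND_PENDING, mask) */
		while (!atomic_long_try_cmpxchg(mask, &bits, bits & ~UNWIND_PENDING))
			;

		return bits;
	}

Architectures with LL/SC or a native atomic and-not can back
atomic_long_fetch_andnot() without the compare-and-swap retry loop.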
>
>  	/*
>  	 * From here on out, the callback must always be called, even if it's
> @@ -166,10 +174,8 @@ static void unwind_deferred_task_work(struct callback_head *head)
>  	idx = srcu_read_lock(&unwind_srcu);
>  	list_for_each_entry_srcu(work, &callbacks, list,
>  				 srcu_read_lock_held(&unwind_srcu)) {
> -		if (test_bit(work->bit, &info->unwind_mask)) {
> +		if (test_bit(work->bit, &bits))
>  			work->func(work, &trace, cookie);
> -			clear_bit(work->bit, &info->unwind_mask);
> -		}
>  	}
>  	srcu_read_unlock(&unwind_srcu, idx);
>  }
> @@ -194,15 +200,17 @@ static void unwind_deferred_task_work(struct callback_head *head)
>   * because it has already been previously called for the same entry context,
>   * it will be called again with the same stack trace and cookie.
>   *
> - * Return: 1 if the the callback was already queued.
> - *         0 if the callback successfully was queued.
> + * Return: 0 if the callback successfully was queued.
> + *         UNWIND_ALREADY_PENDING if the the callback was already queued.
> + *         UNWIND_ALREADY_EXECUTED if the callback was already called
> + *                (and will not be called again)
>   *         Negative if there's an error.
>   *         @cookie holds the cookie of the first request by any user
>   */

Lots of babbling in the Changelog, but no real elucidation as to why you
need this second return value.

AFAICT it serves no real purpose; the users of this function should not
care. The only difference is that the unwind reference (your cookie)
becomes a backward reference instead of a forward reference.

But why would anybody care? Whatever tool is ultimately in charge of
gluing humpty^Wstacktraces back together again should have no problem
with this.
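Put differently, the consumer side boils down to something like the sketch
below (purely illustrative; the structure and function names are made up,
they are not from any existing tool):

	/* Hypothetical post-processing: records pair up by cookie only */
	struct kernel_sample	{ u64 unwind_cookie; /* ... */ };
	struct user_stack	{ u64 cookie; /* ... */ };

	static bool matches(struct user_stack *us, struct kernel_sample *ks)
	{
		/*
		 * Whether the stacktrace record was emitted before (backward
		 * reference) or after (forward reference) the sample makes
		 * no difference to the match.
		 */
		return us->cookie == ks->unwind_cookie;
	}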
>  int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
>  {
>  	struct unwind_task_info *info = &current->unwind_info;
> -	long pending;
> +	unsigned long old, bits;
>  	int bit;
>  	int ret;
>
> @@ -225,32 +233,52 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
>
>  	*cookie = get_cookie(info);
>
> -	/* This is already queued */
> -	if (test_bit(bit, &info->unwind_mask))
> -		return 1;
> +	old = READ_ONCE(info->unwind_mask);
> +
> +	/* Is this already queued */
> +	if (test_bit(bit, &old)) {
> +		/*
> +		 * If pending is not set, it means this work's callback
> +		 * was already called.
> +		 */
> +		return old & UNWIND_PENDING ? UNWIND_ALREADY_PENDING :
> +			UNWIND_ALREADY_EXECUTED;
> +	}
>
> -	/* callback already pending? */
> -	pending = local_read(&info->pending);
> -	if (pending)
> +	if (unwind_pending(info))
>  		goto out;
>
> +	/*
> +	 * This is the first to enable another task_work for this task since
> +	 * the task entered the kernel, or had already called the callbacks.
> +	 * Set only the bit for this work and clear all others as they have
> +	 * already had their callbacks called, and do not need to call them
> +	 * again because of this work.
> +	 */
> +	bits = UNWIND_PENDING | BIT(bit);
> +
> +	/*
> +	 * If the cmpxchg() fails, it means that an NMI came in and set
> +	 * the pending bit as well as cleared the other bits. Just
> +	 * jump to setting the bit for this work.
> +	 */
>  	if (CAN_USE_IN_NMI) {
> -		/* Claim the work unless an NMI just now swooped in to do so. */
> -		if (!local_try_cmpxchg(&info->pending, &pending, 1))
> +		if (!try_cmpxchg(&info->unwind_mask, &old, bits))
>  			goto out;
>  	} else {
> -		local_set(&info->pending, 1);
> +		info->unwind_mask = bits;
>  	}
>
>  	/* The work has been claimed, now schedule it. */
>  	ret = task_work_add(current, &info->work, TWA_RESUME);
> -	if (WARN_ON_ONCE(ret)) {
> -		local_set(&info->pending, 0);
> -		return ret;
> -	}
>
> +	if (WARN_ON_ONCE(ret))
> +		WRITE_ONCE(info->unwind_mask, 0);
> +
> +	return ret;
>  out:
> -	return test_and_set_bit(bit, &info->unwind_mask);
> +	return test_and_set_bit(bit, &info->unwind_mask) ?
> +		UNWIND_ALREADY_PENDING : 0;
>  }

This is some of the most horrifyingly confused code I've seen in a
while.

Please just slow down and think for a minute.

The below is the last four patches rolled into one. Not been near a
compiler.

---
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -524,4 +524,8 @@ DEFINE_LOCK_GUARD_1(srcu, struct srcu_st
 		    srcu_read_unlock(_T->lock, _T->idx),
 		    int idx)
 
+DEFINE_LOCK_GUARD_1(srcu_lite, struct srcu_struct,
+		    _T->idx = srcu_read_lock_lite(_T->lock),
+		    srcu_read_unlock_lite(_T->lock, _T->idx),
+		    int idx)
 #endif
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -13,10 +13,14 @@ typedef void (*unwind_callback_t)(struct
 
 struct unwind_work {
 	struct list_head	list;
 	unwind_callback_t	func;
+	int			bit;
 };
 
 #ifdef CONFIG_UNWIND_USER
 
+#define UNWIND_PENDING_BIT	0
+#define UNWIND_PENDING		BIT(UNWIND_PENDING_BIT)
+
 void unwind_task_init(struct task_struct *task);
 void unwind_task_free(struct task_struct *task);
 
@@ -28,15 +32,26 @@ void unwind_deferred_cancel(struct unwin
 
 static __always_inline void unwind_reset_info(void)
 {
-	if (unlikely(current->unwind_info.id.id))
+	struct unwind_task_info *info = &current->unwind_info;
+	unsigned long bits;
+
+	/* Was there any unwinding? */
+	if (unlikely(raw_atomic_long_read(&info->unwind_mask))) {
+		bits = raw_atomic_long_read(&info->unwind_mask);
+		do {
+			/* Is a task_work going to run again before going back */
+			if (bits & UNWIND_PENDING)
+				return;
+		} while (!raw_atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL));
 		current->unwind_info.id.id = 0;
+	}
 	/*
 	 * As unwind_user_faultable() can be called directly and
 	 * depends on nr_entries being cleared on exit to user,
 	 * this needs to be a separate conditional.
 	 */
-	if (unlikely(current->unwind_info.cache))
-		current->unwind_info.cache->nr_entries = 0;
+	if (unlikely(info->cache))
+		info->cache->nr_entries = 0;
 }
 
 #else /* !CONFIG_UNWIND_USER */
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
+#include <linux/atomic.h>
+
 struct unwind_cache {
 	unsigned int		nr_entries;
 	unsigned long		entries[];
@@ -19,8 +21,8 @@ union unwind_task_id {
 struct unwind_task_info {
 	struct unwind_cache	*cache;
 	struct callback_head	work;
+	atomic_long_t		unwind_mask;
 	union unwind_task_id	id;
-	int			pending;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -12,13 +12,39 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 
+/*
+ * For requesting a deferred user space stack trace from NMI context
+ * the architecture must support a safe cmpxchg in NMI context.
+ * For those architectures that do not have that, then it cannot ask
+ * for a deferred user space stack trace from an NMI context. If it
+ * does, then it will get -EINVAL.
+ */
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+#define UNWIND_NMI_SAFE 1
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+	u32 zero = 0;
+	return try_cmpxchg(&info->id.cnt, &zero, cnt);
+}
+#else
+#define UNWIND_NMI_SAFE 0
+/* When NMIs are not allowed, this always succeeds */
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+	info->id.cnt = cnt;
+	return true;
+}
+#endif /* CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG */
+
 /* Make the cache fit in a 4K page */
 #define UNWIND_MAX_ENTRIES					\
 	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
 
-/* Guards adding to and reading the list of callbacks */
+/* Guards adding to or removing from the list of callbacks */
 static DEFINE_MUTEX(callback_mutex);
 static LIST_HEAD(callbacks);
 
+static unsigned long unwind_mask;
+DEFINE_STATIC_SRCU(unwind_srcu);
+
+/* True when a deferred unwind task_work is queued but has not run yet */
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+	return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
+}
+
 /*
  * This is a unique percpu identifier for a given task entry context.
  * Conceptually, it's incremented every time the CPU enters the kernel from
@@ -41,21 +67,16 @@ static DEFINE_PER_CPU(u32, unwind_ctx_ct
  */
 static u64 get_cookie(struct unwind_task_info *info)
 {
-	u32 cpu_cnt;
-	u32 cnt;
-	u32 old = 0;
+	u32 cnt = 1;
 
 	if (info->id.cpu)
 		return info->id.id;
 
-	cpu_cnt = __this_cpu_read(unwind_ctx_ctr);
-	cpu_cnt += 2;
-	cnt = cpu_cnt | 1; /* Always make non zero */
-
-	if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
-		/* Update the per cpu counter */
-		__this_cpu_write(unwind_ctx_ctr, cpu_cnt);
-	}
+	/* LSB is always set to ensure 0 is an invalid value. */
+	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+	if (try_assign_cnt(info, cnt))
+		__this_cpu_write(unwind_ctx_ctr, cnt);
+
 	/* Interrupts are disabled, the CPU will always be same */
 	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
@@ -117,13 +138,13 @@ static void unwind_deferred_task_work(st
 	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
 	struct unwind_stacktrace trace;
 	struct unwind_work *work;
+	unsigned long bits;
 	u64 cookie;
 
-	if (WARN_ON_ONCE(!info->pending))
+	if (WARN_ON_ONCE(!unwind_pending(info)))
 		return;
 
-	/* Allow work to come in again */
-	WRITE_ONCE(info->pending, 0);
+	bits = atomic_long_fetch_andnot(UNWIND_PENDING, &info->unwind_mask);
 
 	/*
 	 * From here on out, the callback must always be called, even if it's
@@ -136,9 +157,11 @@ static void unwind_deferred_task_work(st
 
 	cookie = info->id.id;
 
-	guard(mutex)(&callback_mutex);
-	list_for_each_entry(work, &callbacks, list) {
-		work->func(work, &trace, cookie);
+	guard(srcu_lite)(&unwind_srcu);
+	list_for_each_entry_srcu(work, &callbacks, list,
+				 srcu_read_lock_held(&unwind_srcu)) {
+		if (test_bit(work->bit, &bits))
+			work->func(work, &trace, cookie);
 	}
 }
@@ -162,7 +185,7 @@ static void unwind_deferred_task_work(st
  * because it has already been previously called for the same entry context,
  * it will be called again with the same stack trace and cookie.
  *
- * Return: 1 if the the callback was already queued.
+ * Return: 1 if the callback was already queued.
  *         0 if the callback successfully was queued.
 *         Negative if there's an error.
 *         @cookie holds the cookie of the first request by any user
@@ -170,41 +193,62 @@ static void unwind_deferred_task_work(st
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
-	int ret;
+	unsigned long bits, mask;
+	int bit, ret;
 
 	*cookie = 0;
 
-	if (WARN_ON_ONCE(in_nmi()))
-		return -EINVAL;
-
 	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
 	    !user_mode(task_pt_regs(current)))
 		return -EINVAL;
 
+	/* NMI requires having safe cmpxchg operations */
+	if (WARN_ON_ONCE(!UNWIND_NMI_SAFE && in_nmi()))
+		return -EINVAL;
+
+	/* Do not allow cancelled works to request again */
+	bit = READ_ONCE(work->bit);
+	if (WARN_ON_ONCE(bit < 0))
+		return -EINVAL;
+
 	guard(irqsave)();
 
 	*cookie = get_cookie(info);
 
-	/* callback already pending? */
-	if (info->pending)
+	bits = UNWIND_PENDING | BIT(bit);
+	mask = atomic_long_fetch_or(bits, &info->unwind_mask);
+	if (mask & bits)
 		return 1;
 
 	/* The work has been claimed, now schedule it. */
 	ret = task_work_add(current, &info->work, TWA_RESUME);
 	if (WARN_ON_ONCE(ret))
-		return ret;
-
-	info->pending = 1;
-	return 0;
+		atomic_long_set(&info->unwind_mask, 0);
+
+	return ret;
 }
 
 void unwind_deferred_cancel(struct unwind_work *work)
 {
+	struct task_struct *g, *t;
+	int bit;
+
 	if (!work)
 		return;
 
 	guard(mutex)(&callback_mutex);
-	list_del(&work->list);
+	list_del_rcu(&work->list);
+	bit = work->bit;
+
+	/* Do not allow any more requests and prevent callbacks */
+	work->bit = -1;
+
+	__clear_bit(bit, &unwind_mask);
+
+	synchronize_srcu(&unwind_srcu);
+
+	guard(rcu)();
+	/* Clear this bit from all threads */
+	for_each_process_thread(g, t)
+		atomic_long_andnot(BIT(bit), &t->unwind_info.unwind_mask);
 }
 
 int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
@@ -212,7 +256,15 @@ int unwind_deferred_init(struct unwind_w
 	memset(work, 0, sizeof(*work));
 
 	guard(mutex)(&callback_mutex);
-	list_add(&work->list, &callbacks);
+
+	/* See if there's a bit in the mask available */
+	if (unwind_mask == ~UNWIND_PENDING)
+		return -EBUSY;
+
+	work->bit = ffz(unwind_mask);
+	__set_bit(work->bit, &unwind_mask);
+
+	list_add_rcu(&work->list, &callbacks);
 	work->func = func;
 
 	return 0;
 }
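For reference, a minimal and purely hypothetical consumer of the interface
as rolled up above (the my_* names are made up, error handling is elided):

	static struct unwind_work my_unwind_work;

	/* Runs from task_work on the way back to user space */
	static void my_callback(struct unwind_work *work,
				struct unwind_stacktrace *trace, u64 cookie)
	{
		/* emit @trace into the ring buffer, tagged with @cookie */
	}

	static int __init my_tracer_init(void)
	{
		return unwind_deferred_init(&my_unwind_work, my_callback);
	}

	static void my_event_handler(void)
	{
		u64 cookie;

		/*
		 * Tag the kernel side event with @cookie; the user stacktrace
		 * for this entry context arrives later via my_callback().
		 */
		unwind_deferred_request(&my_unwind_work, &cookie);
	}

On teardown, unwind_deferred_cancel(&my_unwind_work) unregisters the work
and releases its bit in unwind_mask for reuse.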