On Tue, Mar 10 2026 at 11:03, Thomas Gleixner wrote:
> On Tue, Mar 10 2026 at 10:00, Jiri Slaby wrote:
> Yes. There is an issue. Peter and me just discovered that as well, but
> in the case at hand it can't be the problem. The missing task is on the
> CPU which means it must be visible in the thread list and task list.
The updated debug patch fixes this fork problem and adds more tracing to
it.
Mathieu, can you give it another spin please?
Thanks,
tglx
---
include/linux/sched.h | 2
include/trace/events/mmcid.h | 171 +++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 2
kernel/sched/core.c | 34 ++++++--
kernel/sched/sched.h | 20 ++++-
5 files changed, 215 insertions(+), 14 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_re
#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit(struct task_struct *t);
static __always_inline int task_mm_cid(struct task_struct *t)
{
@@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(s
#else
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit(struct task_struct *t) { }
static __always_inline int task_mm_cid(struct task_struct *t)
{
--- /dev/null
+++ b/include/trace/events/mmcid.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mmcid
+
+#if !defined(_TRACE_MMCID_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MMCID_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(mmcid_class,
+
+ TP_PROTO(struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(mm, cid),
+
+ TP_STRUCT__entry(
+ __field( void *, mm )
+ __field( unsigned int, cid )
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->cid = cid;
+ ),
+
+ TP_printk("mm=%p cid=%08x", __entry->mm, __entry->cid)
+);
+
+DEFINE_EVENT(mmcid_class, mmcid_getcid,
+
+ TP_PROTO(struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(mm, cid)
+);
+
+DEFINE_EVENT(mmcid_class, mmcid_putcid,
+
+ TP_PROTO(struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(mm, cid)
+);
+
+DECLARE_EVENT_CLASS(mmcid_task_class,
+
+ TP_PROTO(struct task_struct *t, struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(t, mm, cid),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, pid )
+ __field( unsigned int, cid )
+ __field( void *, mm )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t->pid;
+ __entry->cid = cid;
+ __entry->mm = mm;
+ ),
+
+ TP_printk("pid=%u cid=%08x mm=%p", __entry->pid, __entry->cid, __entry->mm)
+);
+
+DEFINE_EVENT(mmcid_task_class, mmcid_task_update,
+
+ TP_PROTO(struct task_struct *t, struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(t, mm, cid)
+);
+
+DECLARE_EVENT_CLASS(mmcid_fixup_class,
+
+ TP_PROTO(struct task_struct *t, unsigned int users),
+
+ TP_ARGS(t, users),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, pid )
+ __field( unsigned int, cid )
+ __field( unsigned int, active )
+ __field( unsigned int, users )
+ __field( void *, mm )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t->pid;
+ __entry->cid = t->mm_cid.cid;
+ __entry->active = t->mm_cid.active;
+ __entry->users = users;
+ __entry->mm = t->mm;
+ ),
+
+ TP_printk("pid=%u cid=%08x active=%u users=%u mm=%p", __entry->pid, __entry->cid,
+ __entry->active, __entry->users, __entry->mm)
+);
+
+DEFINE_EVENT(mmcid_fixup_class, mmcid_fixup_task,
+
+ TP_PROTO(struct task_struct *t, unsigned int users),
+
+ TP_ARGS(t, users)
+);
+
+DECLARE_EVENT_CLASS(mmcid_cpu_class,
+
+ TP_PROTO(unsigned int cpu, struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(cpu, mm, cid),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, cpu )
+ __field( unsigned int, cid )
+ __field( void *, mm )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->cid = cid;
+ __entry->mm = mm;
+ ),
+
+ TP_printk("cpu=%u cid=%08x mm=%p", __entry->cpu, __entry->cid, __entry->mm)
+);
+
+DEFINE_EVENT(mmcid_cpu_class, mmcid_cpu_update,
+
+ TP_PROTO(unsigned int cpu, struct mm_struct *mm, unsigned int cid),
+
+ TP_ARGS(cpu, mm, cid)
+);
+
+DECLARE_EVENT_CLASS(mmcid_user_class,
+
+ TP_PROTO(struct task_struct *t, struct mm_struct *mm),
+
+ TP_ARGS(t, mm),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, pid )
+ __field( unsigned int, users )
+ __field( void *, mm )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t->pid;
+ __entry->users = mm->mm_cid.users;
+ __entry->mm = mm;
+ ),
+
+ TP_printk("pid=%u users=%u mm=%p", __entry->pid, __entry->users, __entry->mm)
+);
+
+DEFINE_EVENT(mmcid_user_class, mmcid_user_add,
+
+ TP_PROTO(struct task_struct *t, struct mm_struct *mm),
+
+ TP_ARGS(t, mm)
+);
+
+DEFINE_EVENT(mmcid_user_class, mmcid_user_del,
+
+ TP_PROTO(struct task_struct *t, struct mm_struct *mm),
+
+ TP_ARGS(t, mm)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1586,7 +1586,6 @@ static int copy_mm(u64 clone_flags, stru
tsk->mm = mm;
tsk->active_mm = mm;
- sched_mm_cid_fork(tsk);
return 0;
}
@@ -2498,7 +2497,6 @@ static bool need_futex_hash_allocate_def
exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
- sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,6 +86,7 @@
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
+#include <trace/events/mmcid.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
@@ -4729,8 +4730,12 @@ void sched_cancel_fork(struct task_struc
scx_cancel_fork(p);
}
+static void sched_mm_cid_fork(struct task_struct *t);
+
void sched_post_fork(struct task_struct *p)
{
+ if (IS_ENABLED(CONFIG_SCHED_MM_CID))
+ sched_mm_cid_fork(p);
uclamp_post_fork(p);
scx_post_fork(p);
}
@@ -10569,7 +10574,9 @@ static inline void mm_cid_transit_to_tas
unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
t->mm_cid.cid = cid_to_transit_cid(cid);
+ trace_mmcid_task_update(t, t->mm, t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
+ trace_mmcid_cpu_update(task_cpu(t), t->mm, pcp->cid);
}
}
@@ -10602,7 +10609,9 @@ static void mm_cid_fixup_cpus_to_tasks(s
if (!cid_in_transit(cid)) {
cid = cid_to_transit_cid(cid);
rq->curr->mm_cid.cid = cid;
+ trace_mmcid_task_update(rq->curr, rq->curr->mm, cid);
pcp->cid = cid;
+ trace_mmcid_cpu_update(cpu, mm, cid);
}
}
}
@@ -10613,7 +10622,9 @@ static inline void mm_cid_transit_to_cpu
{
if (cid_on_task(t->mm_cid.cid)) {
t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
+ trace_mmcid_task_update(t, t->mm, t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
+ trace_mmcid_cpu_update(task_cpu(t), t->mm, pcp->cid);
}
}
@@ -10646,15 +10657,17 @@ static void mm_cid_do_fixup_tasks_to_cpu
* possible switch back to per task mode happens either in the
* deferred handler function or in the next fork()/exit().
*
- * The caller has already transferred. The newly incoming task is
- * already accounted for, but not yet visible.
+ * The caller has already transferred so remove it from the users
+ * count. The incoming task is already visible and has mm_cid.active,
+ * but has task::mm_cid::cid == UNSET. Still it needs to be accounted
+ * for. Concurrent fork()s might add more threads, but all of them have
+ * task::mm_cid::active = 0, so they don't affect the accounting here.
*/
- users = mm->mm_cid.users - 2;
- if (!users)
- return;
+ users = mm->mm_cid.users - 1;
guard(rcu)();
for_other_threads(current, t) {
+ trace_mmcid_fixup_task(t, users);
if (mm_cid_fixup_task_to_cpu(t, mm))
users--;
}
@@ -10666,6 +10679,7 @@ static void mm_cid_do_fixup_tasks_to_cpu
for_each_process_thread(p, t) {
if (t == current || t->mm != mm)
continue;
+ trace_mmcid_fixup_task(t, users);
if (mm_cid_fixup_task_to_cpu(t, mm)) {
if (--users == 0)
return;
@@ -10685,15 +10699,19 @@ static bool sched_mm_cid_add_user(struct
{
t->mm_cid.active = 1;
mm->mm_cid.users++;
+ trace_mmcid_user_add(t, mm);
return mm_update_max_cids(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
bool percpu;
- WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+ if (!mm)
+ return;
+
+ WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10727,6 +10745,7 @@ void sched_mm_cid_fork(struct task_struc
} else {
mm_cid_fixup_cpus_to_tasks(mm);
t->mm_cid.cid = mm_get_cid(mm);
+ trace_mmcid_task_update(t, t->mm, t->mm_cid.cid);
}
}
@@ -10739,6 +10758,7 @@ static bool sched_mm_cid_remove_user(str
mm_unset_cid_on_task(t);
}
t->mm->mm_cid.users--;
+ trace_mmcid_user_del(t, t->mm);
return mm_update_max_cids(t->mm);
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -75,6 +75,7 @@
#include <linux/delayacct.h>
#include <linux/mmu_context.h>
+#include <trace/events/mmcid.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
@@ -3809,6 +3810,7 @@ static __always_inline bool cid_on_task(
static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
{
+ trace_mmcid_putcid(mm, cid);
clear_bit(cid, mm_cidmask(mm));
}
@@ -3817,6 +3819,7 @@ static __always_inline void mm_unset_cid
unsigned int cid = t->mm_cid.cid;
t->mm_cid.cid = MM_CID_UNSET;
+ trace_mmcid_task_update(t, t->mm, t->mm_cid.cid);
if (cid_on_task(cid))
mm_drop_cid(t->mm, cid);
}
@@ -3838,6 +3841,7 @@ static inline unsigned int __mm_get_cid(
return MM_CID_UNSET;
if (test_and_set_bit(cid, mm_cidmask(mm)))
return MM_CID_UNSET;
+ trace_mmcid_getcid(mm, cid);
return cid;
}
@@ -3845,9 +3849,17 @@ static inline unsigned int mm_get_cid(st
{
unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
- while (cid == MM_CID_UNSET) {
- cpu_relax();
- cid = __mm_get_cid(mm, num_possible_cpus());
+ if (cid == MM_CID_UNSET) {
+ ktime_t t0 = ktime_get();
+
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ if (ktime_get() - t0 > 50 * NSEC_PER_MSEC) {
+ tracing_off();
+ WARN_ON_ONCE(1);
+ }
+ }
}
return cid;
}
@@ -3874,6 +3886,7 @@ static inline unsigned int mm_cid_conver
static __always_inline void mm_cid_update_task_cid(struct task_struct *t,
unsigned int cid)
{
if (t->mm_cid.cid != cid) {
+ trace_mmcid_task_update(t, t->mm, cid);
t->mm_cid.cid = cid;
rseq_sched_set_ids_changed(t);
}
@@ -3881,6 +3894,7 @@ static __always_inline void mm_cid_updat
static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm,
unsigned int cid)
{
+ trace_mmcid_cpu_update(smp_processor_id(), mm, cid);
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}