Hi Thomas, Jiri, Peter,
On 06/03/2026 12:06, Matthieu Baerts wrote:
> On 06/03/2026 10:57, Thomas Gleixner wrote:
>> On Fri, Mar 06 2026 at 06:48, Jiri Slaby wrote:
>>> On 05. 03. 26, 20:25, Thomas Gleixner wrote:
>>>> Is there simple way to reproduce?
>>>
>>> Unfortunately not at all. To date, I even cannot reproduce locally, it
>>> reproduces exclusively in opensuse build service (and github CI as per
>>> Matthieu's report). I have a project in there with packages which fail
>>> more often than others:
>>> https://build.opensuse.org/project/monitor/home:jirislaby:softlockup
>>> But it's all green ATM.
>>>
>>> Builds of Go 1.24 and tests of rust 1.90 fail the most. The former even
>>> takes only ~ 8 minutes, so it's not that intensive build at all. So the
>>> reasons are unknown to me. At least, Go apparently uses threads for
>>> building (unlike gcc/clang with forks/processes). Dunno about rust.
>>
>> I tried with tons of test cases which stress test mmcid with threads and
>> failed.
>
> On my side, I didn't manage to reproduce it locally either.
Apparently I can now... sorry, I don't know why I was not able to do
that before!
(...)
> It is possible to locally launch the same command using the same QEMU
> version (but not the same host kernel) with the help of Docker:
>
> $ cd <kernel source code>
> # docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --rm \
> -it --privileged mptcp/mptcp-upstream-virtme-docker:latest \
> manual normal
>
> This will build a new kernel in O=.virtme/build, launch it and give you
> access to a prompt.
>
>
> After that, you can also use the "auto" mode with the last built
> image to boot the VM, only print "OK", stop and retry if there were no
> errors:
>
> $ cd <kernel source code>
> $ echo 'echo OK' > .virtme-exec-run
> # i=1; \
> while docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --rm \
> -it --privileged mptcp/mptcp-upstream-virtme-docker:latest \
> vm auto normal; do \
> echo "== Attempt: $i: OK =="; \
> i=$((i+1)); \
> done; \
> echo "== Failure after $i attempts =="
After having sent the email quoted above, I re-checked on my side, and I
was able to reproduce this issue with the technique described above:
using the Docker image with the "build" argument, then at most 50 boot
iterations with the "vm auto normal" arguments. I then used 'git bisect'
between v6.18 and v6.19-rc1 to find the guilty commit, and got:
653fda7ae73d ("sched/mmcid: Switch over to the new mechanism")
Reverting it on top of v6.19-rc1 fixes the issue.
Unfortunately, reverting it on top of Linus' tree causes some
conflicts. I did my best to resolve them, and with this patch attached
below -- also available in [1] -- I no longer have the issue. I don't
know if it is correct -- some quick tests don't show any issues -- nor
if Jiri should test it. I guess the final fix will be different from
this simple revert.
Note: I also tried Peter's patch (thank you for sharing it!), but I can
still reproduce the issue with it on top of Linus' tree.
[1] https://git.kernel.org/matttbe/net-next/c/5e4b47fd150c
Cheers,
Matt
---
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index b9d62fc2140d..ef4ff117d037 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -84,6 +84,24 @@ static __always_inline void
rseq_sched_set_ids_changed(struct task_struct *t)
t->rseq.event.ids_changed = true;
}
+/*
+ * Invoked from switch_mm_cid() in context switch when the task gets a MM
+ * CID assigned.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t,
unsigned int cid)
+{
+ /*
+ * Requires a comparison as the switch_mm_cid() code does not
+ * provide a conditional for it readily. So avoid excessive updates
+ * when nothing changes.
+ */
+ if (t->rseq.ids.mm_cid != cid)
+ t->rseq.event.ids_changed = true;
+}
+
/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
@@ -163,6 +181,7 @@ static inline void rseq_handle_slowpath(struct pt_regs
*regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs
*regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
+static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned
int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index da5fa6f40294..61d294d3bbd7 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -131,18 +131,18 @@ struct rseq_data { };
/**
* struct sched_mm_cid - Storage for per task MM CID data
* @active: MM CID is active for the task
- * @cid: The CID associated to the task either permanently or
- * borrowed from the CPU
+ * @cid: The CID associated to the task
+ * @last_cid: The last CID associated to the task
*/
struct sched_mm_cid {
unsigned int active;
unsigned int cid;
+ unsigned int last_cid;
};
/**
* struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid: The CID associated to the CPU either permanently or
- * while a task with a CID is running
+ * @cid: The CID associated to the CPU
*/
struct mm_cid_pcpu {
unsigned int cid;
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..af3f65f963e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -999,6 +999,7 @@ static struct task_struct *dup_task_struct(struct
task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.last_cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
#endif
return tsk;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e..cc969711cb08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5281,7 +5281,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
- mm_cid_switch_to(prev, next);
+ switch_mm_cid(prev, next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -10634,7 +10634,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct
*t, struct mm_struct *mm
return true;
}
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+static void __maybe_unused mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
struct task_struct *p, *t;
unsigned int users;
@@ -10673,7 +10673,7 @@ static void mm_cid_do_fixup_tasks_to_cpus(struct
mm_struct *mm)
}
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
{
struct mm_struct *mm = current->mm;
@@ -10691,81 +10691,25 @@ static bool sched_mm_cid_add_user(struct task_struct
*t, struct mm_struct *mm)
void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- bool percpu;
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
- struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
-
- /* First user ? */
- if (!mm->mm_cid.users) {
- sched_mm_cid_add_user(t, mm);
- t->mm_cid.cid = mm_get_cid(mm);
- /* Required for execve() */
- pcp->cid = t->mm_cid.cid;
- return;
- }
-
- if (!sched_mm_cid_add_user(t, mm)) {
- if (!cid_on_cpu(mm->mm_cid.mode))
- t->mm_cid.cid = mm_get_cid(mm);
- return;
- }
-
- /* Handle the mode change and transfer current's CID */
- percpu = cid_on_cpu(mm->mm_cid.mode);
- if (!percpu)
- mm_cid_transit_to_task(current, pcp);
- else
- mm_cid_transit_to_cpu(current, pcp);
- }
-
- if (percpu) {
- mm_cid_fixup_tasks_to_cpus();
- } else {
- mm_cid_fixup_cpus_to_tasks(mm);
- t->mm_cid.cid = mm_get_cid(mm);
+ scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
+ sched_mm_cid_add_user(t, mm);
+ /* Preset last_cid for mm_cid_select() */
+ t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
}
}
static bool sched_mm_cid_remove_user(struct task_struct *t)
{
t->mm_cid.active = 0;
- scoped_guard(preempt) {
- /* Clear the transition bit */
- t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
- mm_unset_cid_on_task(t);
- }
+ mm_unset_cid_on_task(t);
t->mm->mm_cid.users--;
return mm_update_max_cids(t->mm);
}
-static bool __sched_mm_cid_exit(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
-
- if (!sched_mm_cid_remove_user(t))
- return false;
- /*
- * Contrary to fork() this only deals with a switch back to per
- * task mode either because the above decreased users or an
- * affinity change increased the number of allowed CPUs and the
- * deferred fixup did not run yet.
- */
- if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
- return false;
- /*
- * A failed fork(2) cleanup never gets here, so @current must have
- * the same MM as @t. That's true for exit() and the failed
- * pthread_create() cleanup case.
- */
- if (WARN_ON_ONCE(current->mm != mm))
- return false;
- return true;
-}
-
/*
* When a task exits, the MM CID held by the task is not longer required as
* the task cannot return to user space.
@@ -10776,48 +10720,10 @@ void sched_mm_cid_exit(struct task_struct *t)
if (!mm || !t->mm_cid.active)
return;
- /*
- * Ensure that only one instance is doing MM CID operations within
- * a MM. The common case is uncontended. The rare fixup case adds
- * some overhead.
- */
- scoped_guard(mutex, &mm->mm_cid.mutex) {
- /* mm_cid::mutex is sufficient to protect mm_cid::users */
- if (likely(mm->mm_cid.users > 1)) {
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
- if (!__sched_mm_cid_exit(t))
- return;
- /*
- * Mode change. The task has the CID unset
- * already and dealt with an eventually set
- * TRANSIT bit. If the CID is owned by the CPU
- * then drop it.
- */
- mm_drop_cid_on_cpu(mm,
this_cpu_ptr(mm->mm_cid.pcpu));
- }
- mm_cid_fixup_cpus_to_tasks(mm);
- return;
- }
- /* Last user */
- scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
- /* Required across execve() */
- if (t == current)
- mm_cid_transit_to_task(t,
this_cpu_ptr(mm->mm_cid.pcpu));
- /* Ignore mode change. There is nothing to do. */
- sched_mm_cid_remove_user(t);
- }
- }
- /*
- * As this is the last user (execve(), process exit or failed
- * fork(2)) there is no concurrency anymore.
- *
- * Synchronize eventually pending work to ensure that there are no
- * dangling references left. @t->mm_cid.users is zero so nothing
- * can queue this work anymore.
- */
- irq_work_sync(&mm->mm_cid.irq_work);
- cancel_work_sync(&mm->mm_cid.work);
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock, &mm->mm_cid.lock)
+ sched_mm_cid_remove_user(t);
}
/* Deactivate MM CID allocation across execve() */
@@ -10831,12 +10737,18 @@ void sched_mm_cid_after_execve(struct task_struct *t)
{
if (t->mm)
sched_mm_cid_fork(t);
+ guard(preempt)();
+ mm_cid_select(t);
}
static void mm_cid_work_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
mm_cid.work);
+ /* Make it compile, but not functional yet */
+ if (!IS_ENABLED(CONFIG_NEW_MM_CID))
+ return;
+
guard(mutex)(&mm->mm_cid.mutex);
/* Did the last user task exit already? */
if (!mm->mm_cid.users)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..b60d49fc9c11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -4003,7 +4003,83 @@ static inline void mm_cid_switch_to(struct task_struct
*prev, struct task_struct
mm_cid_schedin(next);
}
+/* Active implementation */
+static inline void init_sched_mm_cid(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned int max_cid;
+
+ if (!mm)
+ return;
+
+ /* Preset last_mm_cid */
+ max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed),
atomic_read(&mm->mm_users));
+ t->mm_cid.last_cid = max_cid - 1;
+}
+
+static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid,
unsigned int max_cids)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (cid >= max_cids)
+ return false;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return false;
+ t->mm_cid.cid = t->mm_cid.last_cid = cid;
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+ return true;
+}
+
+static inline bool mm_cid_get(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned int max_cids;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+
+ /* Try to reuse the last CID of this task */
+ if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
+ return true;
+
+ /* Try to reuse the last CID of this mm on this CPU */
+ if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
+ return true;
+
+ /* Try the first zero bit in the cidmask. */
+ return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm),
num_possible_cpus()), max_cids);
+}
+
+static inline void mm_cid_select(struct task_struct *t)
+{
+ /*
+ * mm_cid_get() can fail when the maximum CID, which is determined
+ * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
+ * That's a transient failure as there cannot be more tasks
+ * concurrently on a CPU (or about to be scheduled in) than that.
+ */
+ for (;;) {
+ if (mm_cid_get(t))
+ break;
+ }
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct
*next)
+{
+ if (prev->mm_cid.active) {
+ if (prev->mm_cid.cid != MM_CID_UNSET)
+ clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
+ prev->mm_cid.cid = MM_CID_UNSET;
+ }
+
+ if (next->mm_cid.active) {
+ mm_cid_select(next);
+ rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
+ }
+}
+
#else /* !CONFIG_SCHED_MM_CID: */
+static inline void mm_cid_select(struct task_struct *t) { }
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct
*next) { }
static inline void mm_cid_switch_to(struct task_struct *prev, struct
task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
--
Sponsored by the NGI0 Core fund.