Hi Krzysztof,
On Thu, 2026-05-07 at 14:24 +0000, Krzysztof Karas wrote:
> Migration testing in i915 assumes current task's address space
> to allocate new userspace mapping and uses it without
> registering real user for that address space in mm_struct.
> On single NUMA node setups PCI probe executes in the same
> context as userspace process calling the test (i915_selftest
> from IGT), but when multiple nodes are available, the PCI code
> puts probe into a kernel workqueue. This switches execution to
> a kworker, which does not have its own address space in
> userspace and must borrow such memory from another process, so
> "current->active_mm" is unknown at the start of the test.
>
> It was observed that mm->mm_users would occasionally be 0
> or drop to 0 during the test due to a short delay between
> scheduling and executing work in the forked process, which reaped
> userspace mappings, further leading to failures upon reading
> from userland memory.
>
> Prevent this by adding a module parameter that takes the PID of a
> trusted task, so its mm struct may be used if needed.
>
> Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14204
> Signed-off-by: Krzysztof Karas <[email protected]>
> ---
> v7 (Andi):
> * Add missing mm reference release on error path.
>
> v8:
> * Keep reference to mm open for the duration of test for
> readability. (Sebastian)
> * Be paranoid and explicit about keeping the mm reference,
> so we are **really** sure about userspace mappings not
> disappearing.
>
> v9:
> * Drop "Fixes" tag. (Andi)
> * Revert to using a separate function for mm acquisition. (Andi)
> * Keep kthread_use/unuse and mmget/mmput calls symmetric. (Janusz)
>
> drivers/gpu/drm/i915/i915_selftest.h | 1 +
> .../gpu/drm/i915/selftests/i915_selftest.c | 68 ++++++++++++++++++-
> 2 files changed, 68 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_selftest.h
> b/drivers/gpu/drm/i915/i915_selftest.h
> index 72922028f4ba..e29ca298e7eb 100644
> --- a/drivers/gpu/drm/i915/i915_selftest.h
> +++ b/drivers/gpu/drm/i915/i915_selftest.h
> @@ -35,6 +35,7 @@ struct i915_selftest {
> unsigned long timeout_jiffies;
> unsigned int timeout_ms;
> unsigned int random_seed;
> + unsigned int userspace_pid;
> char *filter;
> int mock;
> int live;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c
> b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> index 8460f0a70d04..1e8494bab14b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
> @@ -181,11 +181,48 @@ __wait_gsc_huc_load_completed(struct drm_i915_private
> *i915)
> pr_warn(DRIVER_NAME "Timed out waiting for huc load via
> GSC!\n");
> }
>
> +static struct mm_struct *
> +get_mm(int u_pid_nr)
> +{
> + struct pid *u_pid = find_get_pid(u_pid_nr);
What happens here if the st_userspace_pid module parameter is not provided?
> + struct task_struct *task = NULL;
> + struct mm_struct *mm = NULL;
> +
> + if (!u_pid) {
> + pr_warn("Could not find PID: %d\n", u_pid_nr);
> + return NULL;
> + }
> +
> + task = get_pid_task(u_pid, PIDTYPE_PID);
> + put_pid(u_pid);
> + if (!task) {
> + pr_warn("Could not find task for PID: %d\n", u_pid_nr);
> + return NULL;
> + }
> +
> + if (task->flags & PF_KTHREAD) {
> + pr_warn("Task not in userspace: %d\n", u_pid_nr);
> + put_task_struct(task);
> + return NULL;
> + }
> +
> + mm = get_task_mm(task);
> + put_task_struct(task);
> + if (!mm) {
> + pr_warn("Could not find address space of task with PID: %d\n",
> u_pid_nr);
> + return NULL;
> + }
> +
> + return mm;
> +}
> +
> static int __run_selftests(const char *name,
> struct selftest *st,
> unsigned int count,
> void *data)
> {
> + int u_pid_nr = i915_selftest.userspace_pid;
> + struct mm_struct *mm = NULL;
> int err = 0;
>
> while (!i915_selftest.random_seed)
> @@ -201,14 +238,36 @@ static int __run_selftests(const char *name,
> pr_info(DRIVER_NAME ": Performing %s selftests with st_random_seed=0x%x
> st_timeout=%u\n",
> name, i915_selftest.random_seed, i915_selftest.timeout_ms);
>
> + /*
> + * If we are running in a kthread on a multi NUMA system and the user
> passed
> + * a valid PID of a userspace task, then we may borrow its address space
> + * to prepare a safe environment for the mmap selftests.
> + */
> + if (!current->mm) {
I think this condition should also check for a valid u_pid_nr. To avoid
ambiguity, maybe the i915_selftest.userspace_pid attribute should be
initialized to a negative value by default (when not overridden by the
corresponding module parameter). I believe there is no point in emitting
any warnings from here if the module parameter is not provided.
Other than that, LGTM.
Thanks,
Janusz
> + mm = get_mm(u_pid_nr);
> + if (mm) {
> + kthread_use_mm(mm);
> + if (unlikely(!current->mm)) {
> + mmput(mm);
> + mm = NULL;
> + pr_warn("Could not set mm as current->mm\n");
> + }
> + }
> + }
> +
> /* Tests are listed in order in i915_*_selftests.h */
> for (; count--; st++) {
> if (!st->enabled)
> continue;
>
> cond_resched();
> - if (signal_pending(current))
> + if (signal_pending(current)) {
> + if (mm) {
> + kthread_unuse_mm(mm);
> + mmput_async(mm);
> + }
> return -EINTR;
> + }
>
> pr_info(DRIVER_NAME ": Running %s\n", st->name);
> if (data)
> @@ -226,6 +285,11 @@ static int __run_selftests(const char *name,
> st->name, err))
> err = -1;
>
> + if (mm) {
> + kthread_unuse_mm(mm);
> + mmput_async(mm);
> + }
> +
> return err;
> }
>
> @@ -507,6 +571,8 @@ void igt_hexdump(const void *buf, size_t len)
> module_param_named(st_random_seed, i915_selftest.random_seed, uint, 0400);
> module_param_named(st_timeout, i915_selftest.timeout_ms, uint, 0400);
> module_param_named(st_filter, i915_selftest.filter, charp, 0400);
> +module_param_named(st_userspace_pid, i915_selftest.userspace_pid, uint,
> 0400);
> +MODULE_PARM_DESC(st_userspace_pid, "For usage in tests that map userspace
> memory and require address space with controllable lifetime.");
>
> module_param_named_unsafe(mock_selftests, i915_selftest.mock, int, 0400);
> MODULE_PARM_DESC(mock_selftests, "Run selftests before loading, using mock
> hardware (0:disabled [default], 1:run tests then load driver, -1:run tests
> then leave dummy module)");