Add [email protected].
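For anyone trying to reproduce this, a minimal userspace sketch of the fork
scenario the patch targets (the render-node path and the signal sequencing
here are illustrative assumptions, not part of the patch):

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

int main(void)
{
        /* Jobs submitted through this fd share one scheduler entity. */
        int fd = open("/dev/dri/renderD128", O_RDWR);

        if (fd < 0)
                return 1;

        if (fork() == 0) {
                /*
                 * The child inherits fd and can keep submitting jobs
                 * through driver-specific ioctls on it (omitted here).
                 */
                pause();
                return 0;
        }

        /*
         * Parent dies; previously the whole entity was stopped and the
         * child's later pushes hit "Trying to push to a killed entity".
         * With the patch, only the parent's jobs are killed.
         */
        kill(getpid(), SIGKILL);
        return 0;
}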
Best Wishes
Emily Deng

>-----Original Message-----
>From: Guo, Bingxi <[email protected]>
>Sent: Monday, December 22, 2025 7:02 PM
>To: [email protected]
>Cc: Koenig, Christian <[email protected]>; Deng, Emily <[email protected]>; Guo, Bingxi <[email protected]>
>Subject: [PATCH] drm/amdgpu: Support fork process
>
>When a process forks, the child inherits the open DRM file descriptor.
>With this change, if the parent is killed (e.g., by SIGKILL), only the
>parent's jobs are canceled; the child process can continue submitting
>jobs to the same entity through its own user entry in the entity's
>user list.
>
>Signed-off-by: Emily Deng <[email protected]>
>Signed-off-by: Bingxi Guo <[email protected]>
>---
> drivers/gpu/drm/scheduler/sched_entity.c | 133 +++++++++++++++++++----
> include/drm/gpu_scheduler.h              |  22 ++++
> 2 files changed, 135 insertions(+), 20 deletions(-)
>
>diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>index 8867b95ab089..508a0629b839 100644
>--- a/drivers/gpu/drm/scheduler/sched_entity.c
>+++ b/drivers/gpu/drm/scheduler/sched_entity.c
>@@ -110,6 +110,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>         atomic_set(&entity->fence_seq, 0);
>         entity->fence_context = dma_fence_context_alloc(2);
> 
>+        INIT_LIST_HEAD(&entity->users);
>+        spin_lock_init(&entity->users_lock);
>+
>         return 0;
> }
> EXPORT_SYMBOL(drm_sched_entity_init);
>@@ -228,10 +231,24 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
> {
>         struct drm_sched_job *job;
>         struct dma_fence *prev;
>+        struct drm_sched_entity_user *user;
>+        struct spsc_queue temp_queue;
>+        pid_t my_tgid = task_tgid_nr(current);
> 
>         if (!entity->rq)
>                 return;
> 
>+        /* Mark current process as exited */
>+        spin_lock(&entity->users_lock);
>+        list_for_each_entry(user, &entity->users, list) {
>+                if (user->tgid == my_tgid) {
>+                        atomic_set(&user->exited, 1);
>+                        break;
>+                }
>+        }
>+        spin_unlock(&entity->users_lock);
>+
>+        /* Temporarily stop entity to prevent new jobs */
>         spin_lock(&entity->lock);
>         entity->stopped = true;
>         drm_sched_rq_remove_entity(entity->rq, entity);
>@@ -240,27 +257,59 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>         /* Make sure this entity is not used by the scheduler at the moment */
>         wait_for_completion(&entity->entity_idle);
> 
>-        /* The entity is guaranteed to not be used by the scheduler */
>+        /*
>+         * The entity is stopped and idle. No new jobs can be pushed.
>+         * Scan the queue and separate jobs:
>+         * - Jobs from this process: kill immediately
>+         * - Jobs from other processes: keep in temp_queue
>+         */
>+        spsc_queue_init(&temp_queue);
>         prev = rcu_dereference_check(entity->last_scheduled, true);
>         dma_fence_get(prev);
>+
>         while ((job = drm_sched_entity_queue_pop(entity))) {
>-                struct drm_sched_fence *s_fence = job->s_fence;
>-
>-                dma_fence_get(&s_fence->finished);
>-                if (!prev ||
>-                    dma_fence_add_callback(prev, &job->finish_cb,
>-                                           drm_sched_entity_kill_jobs_cb)) {
>-                        /*
>-                         * Adding callback above failed.
>-                         * dma_fence_put() checks for NULL.
>-                         */
>-                        dma_fence_put(prev);
>-                        drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>+                if (job->owner_tgid == my_tgid) {
>+                        /* Kill this job */
>+                        struct drm_sched_fence *s_fence = job->s_fence;
>+
>+                        dma_fence_get(&s_fence->finished);
>+                        if (!prev ||
>+                            dma_fence_add_callback(prev, &job->finish_cb,
>+                                                   drm_sched_entity_kill_jobs_cb)) {
>+                                dma_fence_put(prev);
>+                                drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
>+                        }
>+                        prev = &s_fence->finished;
>+                } else {
>+                        /* Keep jobs from other processes */
>+                        spsc_queue_push(&temp_queue, &job->queue_node);
>                 }
>+        }
> 
>-                prev = &s_fence->finished;
>+        /* Put back jobs from other processes */
>+        while (true) {
>+                struct spsc_node *node = spsc_queue_pop(&temp_queue);
>+                if (!node)
>+                        break;
>+                spsc_queue_push(&entity->job_queue, node);
>         }
>+
>         dma_fence_put(prev);
>+
>+        /* Check if there are other active users and restore entity if needed */
>+        spin_lock(&entity->users_lock);
>+        list_for_each_entry(user, &entity->users, list) {
>+                if (!atomic_read(&user->exited)) {
>+                        /* Found active user, restore entity */
>+                        spin_unlock(&entity->users_lock);
>+                        spin_lock(&entity->lock);
>+                        entity->stopped = false;
>+                        drm_sched_rq_add_entity(entity->rq, entity);
>+                        spin_unlock(&entity->lock);
>+                        return;
>+                }
>+        }
>+        spin_unlock(&entity->users_lock);
> }
> 
> /**
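To make the reworked drm_sched_entity_kill() flow above easier to follow,
here is a small self-contained userspace model of the two-pass queue split
(a plain linked list stands in for the spsc queue; all names below are
invented for the example):

#include <stdio.h>
#include <stdlib.h>

struct job {
        int owner_tgid;
        struct job *next;
};

/*
 * Pop every queued job, cancel the ones owned by the exiting tgid and
 * collect the rest, then hand the survivors back - the same scheme the
 * patch applies with temp_queue on entity->job_queue.
 */
static struct job *kill_jobs_of(struct job *queue, int tgid)
{
        struct job *keep = NULL, **tail = &keep;

        while (queue) {
                struct job *job = queue;

                queue = job->next;
                if (job->owner_tgid == tgid) {
                        printf("cancel job of tgid %d\n", job->owner_tgid);
                        free(job);      /* kernel: signal the finished fence */
                } else {
                        job->next = NULL;
                        *tail = job;    /* kernel: spsc_queue_push(&temp_queue) */
                        tail = &job->next;
                }
        }
        return keep;    /* kernel: pushed back into entity->job_queue */
}

int main(void)
{
        struct job *queue = NULL;
        int owners[] = { 100, 200, 100 };

        for (int i = 0; i < 3; i++) {
                struct job *job = malloc(sizeof(*job));

                job->owner_tgid = owners[i];
                job->next = queue;
                queue = job;
        }

        queue = kill_jobs_of(queue, 100);       /* only tgid 100 dies */
        while (queue) {
                struct job *next = queue->next;

                printf("kept job of tgid %d\n", queue->owner_tgid);
                free(queue);
                queue = next;
        }
        return 0;
}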
>@@ -323,6 +372,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>  */
> void drm_sched_entity_fini(struct drm_sched_entity *entity)
> {
>+        struct drm_sched_entity_user *user, *tmp;
>+
>         /*
>          * If consumption of existing IBs wasn't completed. Forcefully remove
>          * them here. Also makes sure that the scheduler won't touch this entity
>@@ -338,6 +389,14 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
> 
>         dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>         RCU_INIT_POINTER(entity->last_scheduled, NULL);
>+
>+        /* Clean up user list */
>+        spin_lock(&entity->users_lock);
>+        list_for_each_entry_safe(user, tmp, &entity->users, list) {
>+                list_del_rcu(&user->list);
>+                kfree_rcu(user, rcu);
>+        }
>+        spin_unlock(&entity->users_lock);
> }
> EXPORT_SYMBOL(drm_sched_entity_fini);
> 
>@@ -567,9 +626,40 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
> void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
> {
>         struct drm_sched_entity *entity = sched_job->entity;
>+        struct drm_sched_entity_user *user, *found = NULL;
>+        pid_t my_tgid = task_tgid_nr(current);
>         bool first;
>         ktime_t submit_ts;
> 
>+        /* Check if entity is stopped and reject directly */
>+        if (entity->stopped)
>+                goto error;
>+
>+        /* Entity is running, check user list */
>+        spin_lock(&entity->users_lock);
>+        list_for_each_entry(user, &entity->users, list) {
>+                if (user->tgid == my_tgid) {
>+                        found = user;
>+                        /* Reject if this user has exited */
>+                        if (atomic_read(&user->exited)) {
>+                                spin_unlock(&entity->users_lock);
>+                                goto error;
>+                        }
>+                        break;
>+                }
>+        }
>+
>+        /* If not found, create new user (fork case) */
>+        if (!found) {
>+                found = kzalloc(sizeof(*found), GFP_ATOMIC);
>+                if (found) {
>+                        found->tgid = my_tgid;
>+                        atomic_set(&found->exited, 0);
>+                        list_add_tail(&found->list, &entity->users);
>+                }
>+        }
>+        spin_unlock(&entity->users_lock);
>+
>         trace_drm_sched_job_queue(sched_job, entity);
> 
>         if (trace_drm_sched_job_add_dep_enabled()) {
>@@ -582,6 +672,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>         atomic_inc(entity->rq->sched->score);
>         WRITE_ONCE(entity->last_user, current->group_leader);
> 
>+        /* Record owner TGID */
>+        sched_job->owner_tgid = my_tgid;
>+
>         /*
>          * After the sched_job is pushed into the entity queue, it may be
>          * completed and freed up at any time. We can no longer access it.
>@@ -597,12 +690,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
> 
>         /* Add the entity to the run queue */
>         spin_lock(&entity->lock);
>-        if (entity->stopped) {
>-                spin_unlock(&entity->lock);
>-
>-                DRM_ERROR("Trying to push to a killed entity\n");
>-                return;
>-        }
> 
>         rq = entity->rq;
>         sched = rq->sched;
>@@ -618,5 +705,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
> 
>                 drm_sched_wakeup(sched);
>         }
>+        return;
>+
>+error:
>+        dma_fence_set_error(&sched_job->s_fence->finished, -EPERM);
>+        drm_sched_fence_scheduled(sched_job->s_fence, NULL);
>+        drm_sched_fence_finished(sched_job->s_fence, -EPERM);
> }
> EXPORT_SYMBOL(drm_sched_entity_push_job);
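And the matching push-side check, modelling the user-list lookup in
drm_sched_entity_push_job() above (again a userspace sketch with invented
names, not kernel code):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct user {
        int tgid;
        bool exited;
        struct user *next;
};

/*
 * Reject a push from a tgid already marked exited; lazily register an
 * unknown tgid, which is the fork case where the child never ran the
 * entity init path itself.
 */
static bool may_push(struct user **users, int tgid)
{
        struct user *user;

        for (user = *users; user; user = user->next)
                if (user->tgid == tgid)
                        return !user->exited;

        user = calloc(1, sizeof(*user));
        if (!user)
                return true;    /* the patch also pushes on alloc failure */
        user->tgid = tgid;
        user->next = *users;
        *users = user;
        return true;
}

int main(void)
{
        struct user *users = NULL;

        printf("parent push: %d\n", may_push(&users, 100)); /* 1, registered */
        printf("child push:  %d\n", may_push(&users, 101)); /* 1, fork case */
        users->exited = true;   /* pretend tgid 101 was just killed */
        printf("child again: %d\n", may_push(&users, 101)); /* 0, -EPERM */
        return 0;
}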
>diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>index e62a7214e052..45e066596405 100644
>--- a/include/drm/gpu_scheduler.h
>+++ b/include/drm/gpu_scheduler.h
>@@ -59,6 +59,16 @@ struct drm_sched_rq;
> 
> struct drm_file;
> 
>+/**
>+ * struct drm_sched_entity_user - Per-process entity user tracking
>+ */
>+struct drm_sched_entity_user {
>+        struct list_head list;
>+        struct rcu_head rcu;
>+        pid_t tgid;
>+        atomic_t exited;
>+};
>+
> /* These are often used as an (initial) index
>  * to an array, and as such should start at 0.
>  */
>@@ -233,6 +243,13 @@ struct drm_sched_entity {
>          */
>         struct rb_node rb_tree_node;
> 
>+        /**
>+         * @users:
>+         *
>+         * List of processes using this entity (for fork support)
>+         */
>+        struct list_head users;
>+        spinlock_t users_lock;
> };
> 
> /**
>@@ -385,6 +402,11 @@ struct drm_sched_job {
>          * drm_sched_job_add_implicit_dependencies().
>          */
>         struct xarray dependencies;
>+
>+        /**
>+         * @owner_tgid: TGID of the process that submitted this job
>+         */
>+        pid_t owner_tgid;
> };
> 
> /**
>-- 
>2.43.0
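As a closing note on the synchronization: user->exited is an atomic_t
written by the flush path and read by submitters, both under users_lock.
A rough pthread analogue of that handshake (C11 atomics and a mutex
standing in for atomic_t and the spinlock; names invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t users_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int exited;       /* models user->exited */

/* Models drm_sched_entity_kill() marking the exiting process. */
static void *flush_thread(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&users_lock);
        atomic_store(&exited, 1);
        pthread_mutex_unlock(&users_lock);
        return NULL;
}

/* Models drm_sched_entity_push_job() checking before queuing. */
static void *push_thread(void *arg)
{
        int reject;

        (void)arg;
        pthread_mutex_lock(&users_lock);
        reject = atomic_load(&exited);
        pthread_mutex_unlock(&users_lock);
        printf(reject ? "push rejected (-EPERM)\n" : "push accepted\n");
        return NULL;
}

int main(void)
{
        pthread_t flush, push;

        pthread_create(&flush, NULL, flush_thread, NULL);
        pthread_create(&push, NULL, push_thread, NULL);
        pthread_join(flush, NULL);
        pthread_join(push, NULL);
        return 0;
}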
