On 12/22/25 12:01, Bingxi Guo wrote:
> When a process forks, the child inherits the open DRM file descriptor.
> If the parent is killed (e.g., by SIGKILL), only the parent's jobs
> are canceled. The child process can continue submitting jobs to the
> same entity through its own user entry in the entity's user list.
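
So what's being described is the child re-using the inherited fd, along
these lines (userspace sketch; the render node path and the submit_job()
helper are placeholders, not a real API):

    #include <fcntl.h>
    #include <unistd.h>

    /* Stand-in for a driver-specific submit ioctl (hypothetical). */
    static void submit_job(int fd) { (void)fd; }

    int main(void)
    {
            int fd = open("/dev/dri/renderD128", O_RDWR);

            submit_job(fd);         /* parent submits work */

            if (fork() == 0) {
                    /*
                     * The child inherits fd and keeps submitting through
                     * it, even after the parent is SIGKILLed. The legal
                     * pattern is to close(fd) and open() an own fd here.
                     */
                    for (;;)
                            submit_job(fd);
            }
            return 0;
    }
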
Clear NAK to that. Forking a process and re-using the fd is illegal!

See the OpenGL, OpenCL and Vulkan specifications. We intentionally
block that here.

Regards,
Christian.

>
> Signed-off-by: Emily Deng <[email protected]>
> Signed-off-by: Bingxi Guo <[email protected]>
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c | 133 +++++++++++++++++++----
>  include/drm/gpu_scheduler.h              |  22 ++++
>  2 files changed, 135 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 8867b95ab089..508a0629b839 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -110,6 +110,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>          atomic_set(&entity->fence_seq, 0);
>          entity->fence_context = dma_fence_context_alloc(2);
>
> +        INIT_LIST_HEAD(&entity->users);
> +        spin_lock_init(&entity->users_lock);
> +
>          return 0;
>  }
>  EXPORT_SYMBOL(drm_sched_entity_init);
> @@ -228,10 +231,24 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>  {
>          struct drm_sched_job *job;
>          struct dma_fence *prev;
> +        struct drm_sched_entity_user *user;
> +        struct spsc_queue temp_queue;
> +        pid_t my_tgid = task_tgid_nr(current);
>
>          if (!entity->rq)
>                  return;
>
> +        /* Mark current process as exited */
> +        spin_lock(&entity->users_lock);
> +        list_for_each_entry(user, &entity->users, list) {
> +                if (user->tgid == my_tgid) {
> +                        atomic_set(&user->exited, 1);
> +                        break;
> +                }
> +        }
> +        spin_unlock(&entity->users_lock);
> +
> +        /* Temporarily stop entity to prevent new jobs */
>          spin_lock(&entity->lock);
>          entity->stopped = true;
>          drm_sched_rq_remove_entity(entity->rq, entity);
> @@ -240,27 +257,59 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>          /* Make sure this entity is not used by the scheduler at the moment */
>          wait_for_completion(&entity->entity_idle);
>
> -        /* The entity is guaranteed to not be used by the scheduler */
> +        /*
> +         * The entity is stopped and idle. No new jobs can be pushed.
> +         * Scan the queue and separate jobs:
> +         * - Jobs from this process: kill immediately
> +         * - Jobs from other processes: keep in temp_queue
> +         */
> +        spsc_queue_init(&temp_queue);
>          prev = rcu_dereference_check(entity->last_scheduled, true);
>          dma_fence_get(prev);
> +
>          while ((job = drm_sched_entity_queue_pop(entity))) {
> -                struct drm_sched_fence *s_fence = job->s_fence;
> -
> -                dma_fence_get(&s_fence->finished);
> -                if (!prev ||
> -                    dma_fence_add_callback(prev, &job->finish_cb,
> -                                           drm_sched_entity_kill_jobs_cb)) {
> -                        /*
> -                         * Adding callback above failed.
> -                         * dma_fence_put() checks for NULL.
> -                         */
> -                        dma_fence_put(prev);
> -                        drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> +                if (job->owner_tgid == my_tgid) {
> +                        /* Kill this job */
> +                        struct drm_sched_fence *s_fence = job->s_fence;
> +
> +                        dma_fence_get(&s_fence->finished);
> +                        if (!prev ||
> +                            dma_fence_add_callback(prev, &job->finish_cb,
> +                                                   drm_sched_entity_kill_jobs_cb)) {
> +                                dma_fence_put(prev);
> +                                drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> +                        }
> +                        prev = &s_fence->finished;
> +                } else {
> +                        /* Keep jobs from other processes */
> +                        spsc_queue_push(&temp_queue, &job->queue_node);
>                  }
> +        }
>
> -                prev = &s_fence->finished;
> +        /* Put back jobs from other processes */
> +        while (true) {
> +                struct spsc_node *node = spsc_queue_pop(&temp_queue);
> +                if (!node)
> +                        break;
> +                spsc_queue_push(&entity->job_queue, node);
>          }
> +
>          dma_fence_put(prev);
> +
> +        /* Check if there are other active users and restore entity if needed */
> +        spin_lock(&entity->users_lock);
> +        list_for_each_entry(user, &entity->users, list) {
> +                if (!atomic_read(&user->exited)) {
> +                        /* Found active user, restore entity */
> +                        spin_unlock(&entity->users_lock);
> +                        spin_lock(&entity->lock);
> +                        entity->stopped = false;
> +                        drm_sched_rq_add_entity(entity->rq, entity);
> +                        spin_unlock(&entity->lock);
> +                        return;
> +                }
> +        }
> +        spin_unlock(&entity->users_lock);
>  }
>
>  /**
> @@ -323,6 +372,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>   */
>  void drm_sched_entity_fini(struct drm_sched_entity *entity)
>  {
> +        struct drm_sched_entity_user *user, *tmp;
> +
>          /*
>           * If consumption of existing IBs wasn't completed. Forcefully remove
>           * them here. Also makes sure that the scheduler won't touch this entity
> @@ -338,6 +389,14 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
>
>          dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>          RCU_INIT_POINTER(entity->last_scheduled, NULL);
> +
> +        /* Clean up user list */
> +        spin_lock(&entity->users_lock);
> +        list_for_each_entry_safe(user, tmp, &entity->users, list) {
> +                list_del_rcu(&user->list);
> +                kfree_rcu(user, rcu);
> +        }
> +        spin_unlock(&entity->users_lock);
>  }
>  EXPORT_SYMBOL(drm_sched_entity_fini);
>
> @@ -567,9 +626,40 @@ void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
>  void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  {
>          struct drm_sched_entity *entity = sched_job->entity;
> +        struct drm_sched_entity_user *user, *found = NULL;
> +        pid_t my_tgid = task_tgid_nr(current);
>          bool first;
>          ktime_t submit_ts;
>
> +        /* Check if entity is stopped and reject directly */
> +        if (entity->stopped)
> +                goto error;
> +
> +        /* Entity is running, check user list */
> +        spin_lock(&entity->users_lock);
> +        list_for_each_entry(user, &entity->users, list) {
> +                if (user->tgid == my_tgid) {
> +                        found = user;
> +                        /* Reject if this user has exited */
> +                        if (atomic_read(&user->exited)) {
> +                                spin_unlock(&entity->users_lock);
> +                                goto error;
> +                        }
> +                        break;
> +                }
> +        }
> +
> +        /* If not found, create new user (fork case) */
> +        if (!found) {
> +                found = kzalloc(sizeof(*found), GFP_ATOMIC);
> +                if (found) {
> +                        found->tgid = my_tgid;
> +                        atomic_set(&found->exited, 0);
> +                        list_add_tail(&found->list, &entity->users);
> +                }
> +        }
> +        spin_unlock(&entity->users_lock);
> +
>          trace_drm_sched_job_queue(sched_job, entity);
>
>          if (trace_drm_sched_job_add_dep_enabled()) {
> @@ -582,6 +672,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>          atomic_inc(entity->rq->sched->score);
>          WRITE_ONCE(entity->last_user, current->group_leader);
>
> +        /* Record owner TGID */
> +        sched_job->owner_tgid = my_tgid;
> +
>          /*
>           * After the sched_job is pushed into the entity queue, it may be
>           * completed and freed up at any time. We can no longer access it.
> @@ -597,12 +690,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>
>          /* Add the entity to the run queue */
>          spin_lock(&entity->lock);
> -        if (entity->stopped) {
> -                spin_unlock(&entity->lock);
> -
> -                DRM_ERROR("Trying to push to a killed entity\n");
> -                return;
> -        }
>
>          rq = entity->rq;
>          sched = rq->sched;
> @@ -618,5 +705,11 @@
>
>                  drm_sched_wakeup(sched);
>          }
> +        return;
> +
> +error:
> +        dma_fence_set_error(&sched_job->s_fence->finished, -EPERM);
> +        drm_sched_fence_scheduled(sched_job->s_fence, NULL);
> +        drm_sched_fence_finished(sched_job->s_fence, -EPERM);
>  }
>  EXPORT_SYMBOL(drm_sched_entity_push_job);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index e62a7214e052..45e066596405 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -59,6 +59,16 @@ struct drm_sched_rq;
>
>  struct drm_file;
>
> +/**
> + * struct drm_sched_entity_user - Per-process entity user tracking
> + */
> +struct drm_sched_entity_user {
> +        struct list_head list;
> +        struct rcu_head rcu;
> +        pid_t tgid;
> +        atomic_t exited;
> +};
> +
>  /* These are often used as an (initial) index
>   * to an array, and as such should start at 0.
>   */
> @@ -233,6 +243,13 @@ struct drm_sched_entity {
>           */
>          struct rb_node rb_tree_node;
>
> +        /**
> +         * @users:
> +         *
> +         * List of processes using this entity (for fork support)
> +         */
> +        struct list_head users;
> +        spinlock_t users_lock;
>  };
>
>  /**
> @@ -385,6 +402,11 @@ struct drm_sched_job {
>           * drm_sched_job_add_implicit_dependencies().
>           */
>          struct xarray dependencies;
> +
> +        /**
> +         * @owner_tgid: TGID of the process that submitted this job
> +         */
> +        pid_t owner_tgid;
>  };
>
>  /**
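
As a side note on the key being tracked: task_tgid_nr() returns the
thread-group id, which is what userspace sees as getpid(). All threads
of a process share it, while a forked child gets a new one, which is
why the child shows up as a separate user in this scheme. A minimal
userspace illustration:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            /* getpid() is the TGID that task_tgid_nr() sees in the kernel */
            printf("parent TGID: %ld\n", (long)getpid());

            pid_t child = fork();
            if (child == 0) {
                    /* A forked child is a new thread group: new TGID,
                     * so its jobs would carry a different owner_tgid. */
                    printf("child TGID: %ld\n", (long)getpid());
                    _exit(0);
            }
            waitpid(child, NULL, 0);
            return 0;
    }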
