On 12/22/25 12:01, Bingxi Guo wrote:
> When a process forks, the child inherits the open DRM file descriptor.
> If the parent is killed (e.g., by SIGKILL), only the parent's jobs
> are canceled. The child process can continue submitting jobs to the
> same entity through its own user entry in the entity's user list.

Clear NAK to that. Forking a process and re-using the fd is illegal!

See the OpenGL, OpenCL and Vulkan specifications.

We intentionally block that here.

Regards,
Christian.

> 
> Signed-off-by: Emily Deng <[email protected]>
> Signed-off-by: Bingxi Guo <[email protected]>
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c | 133 +++++++++++++++++++----
>  include/drm/gpu_scheduler.h              |  22 ++++
>  2 files changed, 135 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
> b/drivers/gpu/drm/scheduler/sched_entity.c
> index 8867b95ab089..508a0629b839 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -110,6 +110,9 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>       atomic_set(&entity->fence_seq, 0);
>       entity->fence_context = dma_fence_context_alloc(2);
>  
> +     INIT_LIST_HEAD(&entity->users);
> +     spin_lock_init(&entity->users_lock);
> +
>       return 0;
>  }
>  EXPORT_SYMBOL(drm_sched_entity_init);
> @@ -228,10 +231,24 @@ static void drm_sched_entity_kill(struct 
> drm_sched_entity *entity)
>  {
>       struct drm_sched_job *job;
>       struct dma_fence *prev;
> +     struct drm_sched_entity_user *user;
> +     struct spsc_queue temp_queue;
> +     pid_t my_tgid = task_tgid_nr(current);
>  
>       if (!entity->rq)
>               return;
>  
> +     /* Mark current process as exited */
> +     spin_lock(&entity->users_lock);
> +     list_for_each_entry(user, &entity->users, list) {
> +             if (user->tgid == my_tgid) {
> +                     atomic_set(&user->exited, 1);
> +                     break;
> +             }
> +     }
> +     spin_unlock(&entity->users_lock);
> +
> +     /* Temporarily stop entity to prevent new jobs */
>       spin_lock(&entity->lock);
>       entity->stopped = true;
>       drm_sched_rq_remove_entity(entity->rq, entity);
> @@ -240,27 +257,59 @@ static void drm_sched_entity_kill(struct 
> drm_sched_entity *entity)
>       /* Make sure this entity is not used by the scheduler at the moment */
>       wait_for_completion(&entity->entity_idle);
>  
> -     /* The entity is guaranteed to not be used by the scheduler */
> +     /*
> +      * The entity is stopped and idle. No new jobs can be pushed.
> +      * Scan the queue and separate jobs:
> +      * - Jobs from this process: kill immediately
> +      * - Jobs from other processes: keep in temp_queue
> +      */
> +     spsc_queue_init(&temp_queue);
>       prev = rcu_dereference_check(entity->last_scheduled, true);
>       dma_fence_get(prev);
> +
>       while ((job = drm_sched_entity_queue_pop(entity))) {
> -             struct drm_sched_fence *s_fence = job->s_fence;
> -
> -             dma_fence_get(&s_fence->finished);
> -             if (!prev ||
> -                 dma_fence_add_callback(prev, &job->finish_cb,
> -                                        drm_sched_entity_kill_jobs_cb)) {
> -                     /*
> -                      * Adding callback above failed.
> -                      * dma_fence_put() checks for NULL.
> -                      */
> -                     dma_fence_put(prev);
> -                     drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
> +             if (job->owner_tgid == my_tgid) {
> +                     /* Kill this job */
> +                     struct drm_sched_fence *s_fence = job->s_fence;
> +
> +                     dma_fence_get(&s_fence->finished);
> +                     if (!prev ||
> +                         dma_fence_add_callback(prev, &job->finish_cb,
> +                                                
> drm_sched_entity_kill_jobs_cb)) {
> +                             dma_fence_put(prev);
> +                             drm_sched_entity_kill_jobs_cb(NULL, 
> &job->finish_cb);
> +                     }
> +                     prev = &s_fence->finished;
> +             } else {
> +                     /* Keep jobs from other processes */
> +                     spsc_queue_push(&temp_queue, &job->queue_node);
>               }
> +     }
>  
> -             prev = &s_fence->finished;
> +     /* Put back jobs from other processes */
> +     while (true) {
> +             struct spsc_node *node = spsc_queue_pop(&temp_queue);
> +             if (!node)
> +                     break;
> +             spsc_queue_push(&entity->job_queue, node);
>       }
> +
>       dma_fence_put(prev);
> +
> +     /* Check if there are other active users and restore entity if needed */
> +     spin_lock(&entity->users_lock);
> +     list_for_each_entry(user, &entity->users, list) {
> +             if (!atomic_read(&user->exited)) {
> +                     /* Found active user, restore entity */
> +                     spin_unlock(&entity->users_lock);
> +                     spin_lock(&entity->lock);
> +                     entity->stopped = false;
> +                     drm_sched_rq_add_entity(entity->rq, entity);
> +                     spin_unlock(&entity->lock);
> +                     return;
> +             }
> +     }
> +     spin_unlock(&entity->users_lock);
>  }
>  
>  /**
> @@ -323,6 +372,8 @@ EXPORT_SYMBOL(drm_sched_entity_flush);
>   */
>  void drm_sched_entity_fini(struct drm_sched_entity *entity)
>  {
> +     struct drm_sched_entity_user *user, *tmp;
> +
>       /*
>        * If consumption of existing IBs wasn't completed. Forcefully remove
>        * them here. Also makes sure that the scheduler won't touch this entity
> @@ -338,6 +389,14 @@ void drm_sched_entity_fini(struct drm_sched_entity 
> *entity)
>  
>       dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
>       RCU_INIT_POINTER(entity->last_scheduled, NULL);
> +
> +     /* Clean up user list */
> +     spin_lock(&entity->users_lock);
> +     list_for_each_entry_safe(user, tmp, &entity->users, list) {
> +             list_del_rcu(&user->list);
> +             kfree_rcu(user, rcu);
> +     }
> +     spin_unlock(&entity->users_lock);
>  }
>  EXPORT_SYMBOL(drm_sched_entity_fini);
>  
> @@ -567,9 +626,40 @@ void drm_sched_entity_select_rq(struct drm_sched_entity 
> *entity)
>  void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  {
>       struct drm_sched_entity *entity = sched_job->entity;
> +     struct drm_sched_entity_user *user, *found = NULL;
> +     pid_t my_tgid = task_tgid_nr(current);
>       bool first;
>       ktime_t submit_ts;
>  
> +     /* Check if entity is stopped and reject directly */
> +     if (entity->stopped)
> +             goto error;
> +
> +     /* Entity is running, check user list */
> +     spin_lock(&entity->users_lock);
> +     list_for_each_entry(user, &entity->users, list) {
> +             if (user->tgid == my_tgid) {
> +                     found = user;
> +                     /* Reject if this user has exited */
> +                     if (atomic_read(&user->exited)) {
> +                             spin_unlock(&entity->users_lock);
> +                             goto error;
> +                     }
> +                     break;
> +             }
> +     }
> +
> +     /* If not found, create new user (fork case) */
> +     if (!found) {
> +             found = kzalloc(sizeof(*found), GFP_ATOMIC);
> +             if (found) {
> +                     found->tgid = my_tgid;
> +                     atomic_set(&found->exited, 0);
> +                     list_add_tail(&found->list, &entity->users);
> +             }
> +     }
> +     spin_unlock(&entity->users_lock);
> +
>       trace_drm_sched_job_queue(sched_job, entity);
>  
>       if (trace_drm_sched_job_add_dep_enabled()) {
> @@ -582,6 +672,9 @@ void drm_sched_entity_push_job(struct drm_sched_job 
> *sched_job)
>       atomic_inc(entity->rq->sched->score);
>       WRITE_ONCE(entity->last_user, current->group_leader);
>  
> +     /* Record owner TGID */
> +     sched_job->owner_tgid = my_tgid;
> +
>       /*
>        * After the sched_job is pushed into the entity queue, it may be
>        * completed and freed up at any time. We can no longer access it.
> @@ -597,12 +690,6 @@ void drm_sched_entity_push_job(struct drm_sched_job 
> *sched_job)
>  
>               /* Add the entity to the run queue */
>               spin_lock(&entity->lock);
> -             if (entity->stopped) {
> -                     spin_unlock(&entity->lock);
> -
> -                     DRM_ERROR("Trying to push to a killed entity\n");
> -                     return;
> -             }
>  
>               rq = entity->rq;
>               sched = rq->sched;
> @@ -618,5 +705,11 @@ void drm_sched_entity_push_job(struct drm_sched_job 
> *sched_job)
>  
>               drm_sched_wakeup(sched);
>       }
> +     return;
> +
> +error:
> +     dma_fence_set_error(&sched_job->s_fence->finished, -EPERM);
> +     drm_sched_fence_scheduled(sched_job->s_fence, NULL);
> +     drm_sched_fence_finished(sched_job->s_fence, -EPERM);
>  }
>  EXPORT_SYMBOL(drm_sched_entity_push_job);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index e62a7214e052..45e066596405 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -59,6 +59,16 @@ struct drm_sched_rq;
>  
>  struct drm_file;
>  
> +/**
> + * struct drm_sched_entity_user - Per-process entity user tracking
> + */
> +struct drm_sched_entity_user {
> +     struct list_head                list;
> +     struct rcu_head                 rcu;
> +     pid_t                           tgid;
> +     atomic_t                        exited;
> +};
> +
>  /* These are often used as an (initial) index
>   * to an array, and as such should start at 0.
>   */
> @@ -233,6 +243,13 @@ struct drm_sched_entity {
>        */
>       struct rb_node                  rb_tree_node;
>  
> +     /**
> +      * @users:
> +      *
> +      * List of processes using this entity (for fork support)
> +      */
> +     struct list_head                users;
> +     spinlock_t                      users_lock;
>  };
>  
>  /**
> @@ -385,6 +402,11 @@ struct drm_sched_job {
>        * drm_sched_job_add_implicit_dependencies().
>        */
>       struct xarray                   dependencies;
> +
> +     /**
> +      * @owner_tgid: TGID of the process that submitted this job
> +      */
> +     pid_t                           owner_tgid;
>  };
>  
>  /**

Reply via email to