On 08/02/2018 02:43 AM, Nayan Deshmukh wrote:
Hi Andrey,

Good catch. Since Christian and I were working on this in parallel, I guess neither of us caught the problem.

If it is only causing a problem in push_job, then we can handle it with something like this:

----------------------------------------------------------------------------------------------------------------------------
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 05dc6ecd4003..f344ce32f128 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
        first = spsc_queue_count(&entity->job_queue) == 0;
        reschedule = idle && first && (entity->num_rq_list > 1);

+       if (first && list_empty(&entity->list)) {

first might be false if the other process was interrupted by SIGKILL in the middle of wait_event_killable in drm_sched_entity_flush while there were still items in the queue.
I don't see a good way around this besides adding a 'stopped' flag to drm_sched_entity (rough sketch after Nayan's diff below).

Andrey

+               DRM_ERROR("Trying to push to a killed entity\n");
+               return;
+       }
+
        if (reschedule) {
                rq = drm_sched_entity_get_free_sched(entity);
                spin_lock(&entity->rq_lock);
@@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
        if (first) {
                /* Add the entity to the run queue */
                spin_lock(&entity->rq_lock);
-               if (!entity->rq) {
-                       DRM_ERROR("Trying to push to a killed entity\n");
-                       spin_unlock(&entity->rq_lock);
-                       return;
-               }
                drm_sched_rq_add_entity(entity->rq, entity);
                spin_unlock(&entity->rq_lock);
                drm_sched_wakeup(entity->rq->sched);
-----------------------------------------------------------------------------------------------------------------------------
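
For what it's worth, a rough sketch of the 'stopped' flag idea; the field name and exact placement are only illustrative, not a tested patch:

----------------------------------------------------------------------------------------------------------------------------
/* Illustrative new member on struct drm_sched_entity (include/drm/gpu_scheduler.h) */
struct drm_sched_entity {
        ...
        bool                            stopped;  /* set once the owning process was killed */
};

/* In drm_sched_entity_flush(): mark the entity stopped instead of setting rq to NULL */
last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
if ((!last_user || last_user == current->group_leader) &&
    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
        spin_lock(&entity->rq_lock);
        entity->stopped = true;
        drm_sched_rq_remove_entity(entity->rq, entity);
        spin_unlock(&entity->rq_lock);
}

/* In drm_sched_entity_push_job(): reject submissions to a stopped entity,
 * independent of 'first' and of whether the entity is still on a run queue list */
if (entity->stopped) {
        DRM_ERROR("Trying to push to a killed entity\n");
        return;
}
----------------------------------------------------------------------------------------------------------------------------

The flag is never cleared once set, so unlike the '!entity->rq' or 'list_empty(&entity->list)' checks it keeps working even when the reschedule path picks a new run queue from drm_sched_entity_get_free_sched, and it also covers the case where 'first' is false.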


On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky <andrey.grodzov...@amd.com> wrote:

    Thinking again about this change together with 53d5f1e ("drm/scheduler:
    move idle entities to scheduler with less load v2"), it looks to me like
    the use case for which fc9a539 ("drm/scheduler: add NULL pointer check
    for run queue (v2)") was done will not work anymore.

    First of all, in drm_sched_entity_push_job 'if (!entity->rq)' will never
    be true any more, since we stopped setting entity->rq to NULL in
    drm_sched_entity_flush.

    Second of all, even after we remove the entity from its run queue in
    drm_sched_entity_flush to terminate any subsequent submissions to the
    queue, the other thread doing push_job can just acquire a run queue again
    from drm_sched_entity_get_free_sched and continue submitting, so you
    can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.

    What about adding a 'stopped' flag to drm_sched_entity to be set in
    drm_sched_entity_flush and in

But it might be worth adding a 'stopped' flag field if it is required in other places as well.

Thanks,
Nayan

    drm_sched_entity_push_job check for 'if (entity->stopped)' instead of
    'if (!entity->rq)'?

    Andrey


    On 07/30/2018 07:03 AM, Christian König wrote:
    > We removed the redundancy of having an extra scheduler field, so we
    > can't set the rq to NULL any more, otherwise we won't know which
    > scheduler to use for the cleanup.
    >
    > Just remove the entity from the scheduling list instead.
    >
    > Signed-off-by: Christian König <christian.koe...@amd.com>
    > ---
    >   drivers/gpu/drm/scheduler/gpu_scheduler.c | 35 +++++++------------------------
    >   1 file changed, 8 insertions(+), 27 deletions(-)
    >
    > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
    > index f563e4fbb4b6..1b733229201e 100644
    > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
    > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
    > @@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
    >   }
    >   EXPORT_SYMBOL(drm_sched_entity_init);
    >
    > -/**
    > - * drm_sched_entity_is_initialized - Query if entity is initialized
    > - *
    > - * @sched: Pointer to scheduler instance
    > - * @entity: The pointer to a valid scheduler entity
    > - *
    > - * return true if entity is initialized, false otherwise
    > -*/
    > -static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
    > -                                         struct drm_sched_entity *entity)
    > -{
    > -     return entity->rq != NULL &&
    > -             entity->rq->sched == sched;
    > -}
    > -
    >   /**
    >    * drm_sched_entity_is_idle - Check if entity is idle
    >    *
    > @@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
    >   {
    >       rmb();
    >
    > -     if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
    > +     if (list_empty(&entity->list) ||
    > +         spsc_queue_peek(&entity->job_queue) == NULL)
    >               return true;
    >
    >       return false;
    > @@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
    >       long ret = timeout;
    >
    >       sched = entity->rq->sched;
    > -     if (!drm_sched_entity_is_initialized(sched, entity))
    > -             return ret;
    >       /**
    >        * The client will not queue more IBs during this fini, consume existing
    >        * queued IBs or discard them on SIGKILL
    > @@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
    >       last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
    >       if ((!last_user || last_user == current->group_leader) &&
    >           (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
    > -             drm_sched_entity_set_rq(entity, NULL);
    > +             drm_sched_rq_remove_entity(entity->rq, entity);
    >
    >       return ret;
    >   }
    > @@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
    >       struct drm_gpu_scheduler *sched;
    >
    >       sched = entity->rq->sched;
    > -     drm_sched_entity_set_rq(entity, NULL);
    > +     drm_sched_rq_remove_entity(entity->rq, entity);
    >
    >       /* Consumption of existing IBs wasn't completed. Forcefully
    >        * remove them here.
    > @@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
    >       if (entity->rq == rq)
    >               return;
    >
    > -     spin_lock(&entity->rq_lock);
    > -
    > -     if (entity->rq)
    > -             drm_sched_rq_remove_entity(entity->rq, entity);
    > +     BUG_ON(!rq);
    >
    > +     spin_lock(&entity->rq_lock);
    > +     drm_sched_rq_remove_entity(entity->rq, entity);
    >       entity->rq = rq;
    > -     if (rq)
    > -             drm_sched_rq_add_entity(rq, entity);
    > -
    > +     drm_sched_rq_add_entity(rq, entity);
    >       spin_unlock(&entity->rq_lock);
    >   }
    >   EXPORT_SYMBOL(drm_sched_entity_set_rq);

