Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2023-01-02 Thread Dmitry Osipenko
On 1/2/23 17:17, youling 257 wrote:
> which patch?

https://patchwork.freedesktop.org/patch/512652/

I applied it to next-fixes

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2023-01-02 Thread youling 257
which patch?

2023-01-02 17:24 GMT+08:00, Dmitry Osipenko :
> On 1/1/23 21:29, youling257 wrote:
>> Linux 6.2-rc1 has a memory leak on amdgpu; git bisect points to the bad commit
>> "drm/scheduler: rework entity flush, kill and fini".
>> git bisect start
>> # status: waiting for both good and bad commits
>> # good: [eb7081409f94a9a8608593d0fb63a1aa3d6f95d8] Linux 6.1-rc6
>> git bisect good eb7081409f94a9a8608593d0fb63a1aa3d6f95d8
>> # status: waiting for bad commit, 1 good commit known
>> # bad: [66efff515a6500d4b4976fbab3bee8b92a1137fb] Merge tag
>> 'amd-drm-next-6.2-2022-12-07' of
>> https://gitlab.freedesktop.org/agd5f/linux into drm-next
>> git bisect bad 66efff515a6500d4b4976fbab3bee8b92a1137fb
>> # good: [49e8e6343df688d68b12c2af50791ca37520f0b7] Merge tag
>> 'amd-drm-next-6.2-2022-11-04' of
>> https://gitlab.freedesktop.org/agd5f/linux into drm-next
>> git bisect good 49e8e6343df688d68b12c2af50791ca37520f0b7
>> # bad: [fc58764bbf602b65a6f63c53e5fd6feae76c510c] Merge tag
>> 'amd-drm-next-6.2-2022-11-18' of
>> https://gitlab.freedesktop.org/agd5f/linux into drm-next
>> git bisect bad fc58764bbf602b65a6f63c53e5fd6feae76c510c
>> # bad: [4e291f2f585313efa5200cce655e17c94906e50a] Merge tag
>> 'drm-misc-next-2022-11-10-1' of git://anongit.freedesktop.org/drm/drm-misc
>> into drm-next
>> git bisect bad 4e291f2f585313efa5200cce655e17c94906e50a
>> # good: [78a43c7e3b2ff5aed1809f93b4f87a418355789e] drm/nouveau/gr/gf100-:
>> make global attrib_cb actually global
>> git bisect good 78a43c7e3b2ff5aed1809f93b4f87a418355789e
>> # bad: [611fc22c9e5e13276c819a7f7a7d19b794bbed1a] drm/arm/hdlcd: remove
>> calls to drm_mode_config_cleanup()
>> git bisect bad 611fc22c9e5e13276c819a7f7a7d19b794bbed1a
>> # bad: [a8d9621b9fc67957b3de334cc1b5f47570fb90a0] drm/ingenic: Don't set
>> struct drm_driver.output_poll_changed
>> git bisect bad a8d9621b9fc67957b3de334cc1b5f47570fb90a0
>> # good: [2cf9886e281678ae9ee57e24a656749071d543bb] drm/scheduler: remove
>> drm_sched_dependency_optimized
>> git bisect good 2cf9886e281678ae9ee57e24a656749071d543bb
>> # bad: [8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d] Merge drm/drm-next into
>> drm-misc-next
>> git bisect bad 8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d
>> # bad: [47078311b8efebdefd5b3b2f87e2b02b14f49c66] drm/ingenic: Fix missing
>> platform_driver_unregister() call in ingenic_drm_init()
>> git bisect bad 47078311b8efebdefd5b3b2f87e2b02b14f49c66
>> # bad: [a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a] drm/scheduler: rename
>> dependency callback into prepare_job
>> git bisect bad a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a
>> # bad: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac] drm/scheduler: rework
>> entity flush, kill and fini
>> git bisect bad 2fdb8a8f07c2f1353770a324fd19b8114e4329ac
>> # first bad commit: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac]
>> drm/scheduler: rework entity flush, kill and fini
>>
>> @Rob Clark, I tested your patch and it fixed my problem.
>
> The linux-next tree has already carried the fix for a couple of weeks. It
> will land in 6.2-rc once the drm-fixes branch is synced with 6.2.
>
> --
> Best regards,
> Dmitry
>
>


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2023-01-02 Thread Dmitry Osipenko
On 1/1/23 21:29, youling257 wrote:
> Linux 6.2-rc1 has a memory leak on amdgpu; git bisect points to the bad commit 
> "drm/scheduler: rework entity flush, kill and fini".
> git bisect start
> # status: waiting for both good and bad commits
> # good: [eb7081409f94a9a8608593d0fb63a1aa3d6f95d8] Linux 6.1-rc6
> git bisect good eb7081409f94a9a8608593d0fb63a1aa3d6f95d8
> # status: waiting for bad commit, 1 good commit known
> # bad: [66efff515a6500d4b4976fbab3bee8b92a1137fb] Merge tag 
> 'amd-drm-next-6.2-2022-12-07' of https://gitlab.freedesktop.org/agd5f/linux 
> into drm-next
> git bisect bad 66efff515a6500d4b4976fbab3bee8b92a1137fb
> # good: [49e8e6343df688d68b12c2af50791ca37520f0b7] Merge tag 
> 'amd-drm-next-6.2-2022-11-04' of https://gitlab.freedesktop.org/agd5f/linux 
> into drm-next
> git bisect good 49e8e6343df688d68b12c2af50791ca37520f0b7
> # bad: [fc58764bbf602b65a6f63c53e5fd6feae76c510c] Merge tag 
> 'amd-drm-next-6.2-2022-11-18' of https://gitlab.freedesktop.org/agd5f/linux 
> into drm-next
> git bisect bad fc58764bbf602b65a6f63c53e5fd6feae76c510c
> # bad: [4e291f2f585313efa5200cce655e17c94906e50a] Merge tag 
> 'drm-misc-next-2022-11-10-1' of git://anongit.freedesktop.org/drm/drm-misc 
> into drm-next
> git bisect bad 4e291f2f585313efa5200cce655e17c94906e50a
> # good: [78a43c7e3b2ff5aed1809f93b4f87a418355789e] drm/nouveau/gr/gf100-: 
> make global attrib_cb actually global
> git bisect good 78a43c7e3b2ff5aed1809f93b4f87a418355789e
> # bad: [611fc22c9e5e13276c819a7f7a7d19b794bbed1a] drm/arm/hdlcd: remove calls 
> to drm_mode_config_cleanup()
> git bisect bad 611fc22c9e5e13276c819a7f7a7d19b794bbed1a
> # bad: [a8d9621b9fc67957b3de334cc1b5f47570fb90a0] drm/ingenic: Don't set 
> struct drm_driver.output_poll_changed
> git bisect bad a8d9621b9fc67957b3de334cc1b5f47570fb90a0
> # good: [2cf9886e281678ae9ee57e24a656749071d543bb] drm/scheduler: remove 
> drm_sched_dependency_optimized
> git bisect good 2cf9886e281678ae9ee57e24a656749071d543bb
> # bad: [8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d] Merge drm/drm-next into 
> drm-misc-next
> git bisect bad 8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d
> # bad: [47078311b8efebdefd5b3b2f87e2b02b14f49c66] drm/ingenic: Fix missing 
> platform_driver_unregister() call in ingenic_drm_init()
> git bisect bad 47078311b8efebdefd5b3b2f87e2b02b14f49c66
> # bad: [a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a] drm/scheduler: rename 
> dependency callback into prepare_job
> git bisect bad a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a
> # bad: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac] drm/scheduler: rework 
> entity flush, kill and fini
> git bisect bad 2fdb8a8f07c2f1353770a324fd19b8114e4329ac
> # first bad commit: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac] drm/scheduler: 
> rework entity flush, kill and fini
> 
> @Rob Clark, I tested your patch and it fixed my problem.

The linux-next tree has already carried the fix for a couple of weeks. It will
land in 6.2-rc once the drm-fixes branch is synced with 6.2.

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2023-01-01 Thread youling257
Linux 6.2-rc1 has a memory leak on amdgpu; git bisect points to the bad commit 
"drm/scheduler: rework entity flush, kill and fini".
git bisect start
# status: waiting for both good and bad commits
# good: [eb7081409f94a9a8608593d0fb63a1aa3d6f95d8] Linux 6.1-rc6
git bisect good eb7081409f94a9a8608593d0fb63a1aa3d6f95d8
# status: waiting for bad commit, 1 good commit known
# bad: [66efff515a6500d4b4976fbab3bee8b92a1137fb] Merge tag 
'amd-drm-next-6.2-2022-12-07' of https://gitlab.freedesktop.org/agd5f/linux 
into drm-next
git bisect bad 66efff515a6500d4b4976fbab3bee8b92a1137fb
# good: [49e8e6343df688d68b12c2af50791ca37520f0b7] Merge tag 
'amd-drm-next-6.2-2022-11-04' of https://gitlab.freedesktop.org/agd5f/linux 
into drm-next
git bisect good 49e8e6343df688d68b12c2af50791ca37520f0b7
# bad: [fc58764bbf602b65a6f63c53e5fd6feae76c510c] Merge tag 
'amd-drm-next-6.2-2022-11-18' of https://gitlab.freedesktop.org/agd5f/linux 
into drm-next
git bisect bad fc58764bbf602b65a6f63c53e5fd6feae76c510c
# bad: [4e291f2f585313efa5200cce655e17c94906e50a] Merge tag 
'drm-misc-next-2022-11-10-1' of git://anongit.freedesktop.org/drm/drm-misc into 
drm-next
git bisect bad 4e291f2f585313efa5200cce655e17c94906e50a
# good: [78a43c7e3b2ff5aed1809f93b4f87a418355789e] drm/nouveau/gr/gf100-: make 
global attrib_cb actually global
git bisect good 78a43c7e3b2ff5aed1809f93b4f87a418355789e
# bad: [611fc22c9e5e13276c819a7f7a7d19b794bbed1a] drm/arm/hdlcd: remove calls 
to drm_mode_config_cleanup()
git bisect bad 611fc22c9e5e13276c819a7f7a7d19b794bbed1a
# bad: [a8d9621b9fc67957b3de334cc1b5f47570fb90a0] drm/ingenic: Don't set struct 
drm_driver.output_poll_changed
git bisect bad a8d9621b9fc67957b3de334cc1b5f47570fb90a0
# good: [2cf9886e281678ae9ee57e24a656749071d543bb] drm/scheduler: remove 
drm_sched_dependency_optimized
git bisect good 2cf9886e281678ae9ee57e24a656749071d543bb
# bad: [8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d] Merge drm/drm-next into 
drm-misc-next
git bisect bad 8e4e4c2f53ffcb0ef746dc3b87ce1a57c5c94c7d
# bad: [47078311b8efebdefd5b3b2f87e2b02b14f49c66] drm/ingenic: Fix missing 
platform_driver_unregister() call in ingenic_drm_init()
git bisect bad 47078311b8efebdefd5b3b2f87e2b02b14f49c66
# bad: [a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a] drm/scheduler: rename 
dependency callback into prepare_job
git bisect bad a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a
# bad: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac] drm/scheduler: rework entity 
flush, kill and fini
git bisect bad 2fdb8a8f07c2f1353770a324fd19b8114e4329ac
# first bad commit: [2fdb8a8f07c2f1353770a324fd19b8114e4329ac] drm/scheduler: 
rework entity flush, kill and fini

@Rob Clark, I tested your patch and it fixed my problem.


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-12-28 Thread Rob Clark
On Wed, Dec 28, 2022 at 8:27 AM Rob Clark  wrote:
>
> On Thu, Nov 17, 2022 at 7:12 AM Dmitry Osipenko
>  wrote:
> >
> > On 11/17/22 18:09, Christian König wrote:
> > > Am 17.11.22 um 15:41 schrieb Dmitry Osipenko:
> > >> [SNIP]
> > >>> drm_sched_entity_flush() should be called from the flush callback from
> > >>> the file_operations structure of panfrost. See amdgpu_flush() and
> > >>> amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all
> > >>> entities of the process/file descriptor to be flushed out.
> > >>>
> > >>> drm_sched_entity_fini() must be called before you free the memory of the
> > >>> entity structure, or otherwise we would run into a use-after-free.
> > >> Right, drm_sched_entity_destroy() invokes these two functions and
> > >> Panfrost uses drm_sched_entity_destroy().
> > >
> > > Then I have no idea what's going wrong here.
> > >
> > > The scheduler should trivially finish with the entity and call
> > > complete(&entity->entity_idle) in its main loop. No idea why this
> > > doesn't happen. Can you investigate?
> >
> > I'll take a closer look. Hoped you may have a quick idea of what's wrong :)
> >
>
> As Jonathan mentioned, the same thing is happening on msm.  I can
> reproduce this by adding an assert in mesa (in this case, triggered
> after 100 draws) and running an app under gdb.  After the assert is
> hit, if I try to exit mesa, it hangs.
>
> The problem is that we somehow call drm_sched_entity_kill() twice.
> The first time completes, but now the entity_idle completion is no
> longer done, so the second call hangs forever.

Maybe we should:

--
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
b/drivers/gpu/drm/scheduler/sched_entity.c
index fe09e5be79bd..3d7c671d05e3 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -222,7 +226,6 @@ static void drm_sched_entity_kill(struct
drm_sched_entity *entity)
 long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 {
struct drm_gpu_scheduler *sched;
-   struct task_struct *last_user;
long ret = timeout;

if (!entity->rq)
@@ -244,12 +247,6 @@ long drm_sched_entity_flush(struct
drm_sched_entity *entity, long timeout)
drm_sched_entity_is_idle(entity));
}

-   /* For killed process disable any more IBs enqueue right now */
-   last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
-   if ((!last_user || last_user == current->group_leader) &&
-   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
-   drm_sched_entity_kill(entity);
-
return ret;
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);


Maybe there is a better fix, but special handling for SIGKILL seems
dubious to me (vs just relying on the drm device fd close path).  I
wonder if that code path was tested at all?

BR,
-R


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-12-28 Thread Rob Clark
On Thu, Nov 17, 2022 at 7:12 AM Dmitry Osipenko
 wrote:
>
> On 11/17/22 18:09, Christian König wrote:
> > Am 17.11.22 um 15:41 schrieb Dmitry Osipenko:
> >> [SNIP]
> >>> drm_sched_entity_flush() should be called from the flush callback from
> >>> the file_operations structure of panfrost. See amdgpu_flush() and
> >>> amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all
> >>> entities of the process/file descriptor to be flushed out.
> >>>
> >>> drm_sched_entity_fini() must be called before you free the memory of the
> >>> entity structure, or otherwise we would run into a use-after-free.
> >> Right, drm_sched_entity_destroy() invokes these two functions and
> >> Panfrost uses drm_sched_entity_destroy().
> >
> > Then I have no idea what's going wrong here.
> >
> > The scheduler should trivially finish with the entity and call
> > complete(&entity->entity_idle) in its main loop. No idea why this
> > doesn't happen. Can you investigate?
>
> I'll take a closer look. Hoped you may have a quick idea of what's wrong :)
>

As Jonathan mentioned, the same thing is happening on msm.  I can
reproduce this by adding an assert in mesa (in this case, triggered
after 100 draws) and running an app under gdb.  After the assert is
hit, if I try to exit mesa, it hangs.

The problem is that we somehow call drm_sched_entity_kill() twice.
The first time completes, but now the entity_idle completion is no
longer done, so the second call hangs forever.
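
For reference, a minimal sketch (not from this series) of why the second call
blocks, assuming the usual <linux/completion.h> semantics where complete()
adds one "done" token and each wait_for_completion() consumes one; 'idle'
stands in for entity->entity_idle:

#include <linux/completion.h>

static void sketch_double_kill_wait(struct completion *idle)
{
        complete(idle);            /* scheduler signals idle exactly once */
        wait_for_completion(idle); /* first drm_sched_entity_kill(): returns */
        wait_for_completion(idle); /* second call: no token left, blocks forever */
}

Since the first kill already removed the entity from its run queue, the
scheduler thread never completes entity_idle again, so nothing wakes the
second waiter.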

BR,
-R


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Dmitry Osipenko
On 11/17/22 18:09, Christian König wrote:
> Am 17.11.22 um 15:41 schrieb Dmitry Osipenko:
>> [SNIP]
>>> drm_sched_entity_flush() should be called from the flush callback from
>>> the file_operations structure of panfrost. See amdgpu_flush() and
>>> amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all
>>> entities of the process/file descriptor to be flushed out.
>>>
>>> drm_sched_entity_fini() must be called before you free the memory of the
>>> entity structure, or otherwise we would run into a use-after-free.
>> Right, drm_sched_entity_destroy() invokes these two functions and
>> Panfrost uses drm_sched_entity_destroy().
> 
> Then I have no idea what's going wrong here.
> 
> The scheduler should trivially finish with the entity and call
> complete(&entity->entity_idle) in its main loop. No idea why this
> doesn't happen. Can you investigate?

I'll take a closer look. Hoped you may have a quick idea of what's wrong :)

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Christian König

Am 17.11.22 um 15:41 schrieb Dmitry Osipenko:

[SNIP]

drm_sched_entity_flush() should be called from the flush callback from
the file_operations structure of panfrost. See amdgpu_flush() and
amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all
entities of the process/file descriptor to be flushed out.

drm_sched_entity_fini() must be called before you free the memory of the
entity structure, or otherwise we would run into a use-after-free.

Right, drm_sched_entity_destroy() invokes these two functions and
Panfrost uses drm_sched_entity_destroy().


Then I have no idea what's going wrong here.

The scheduler should trivially finish with the entity and call 
complete(&entity->entity_idle) in its main loop. No idea why this 
doesn't happen. Can you investigate?


Regards,
Christian.


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Dmitry Osipenko
On 11/17/22 16:11, Christian König wrote:
> Am 17.11.22 um 14:00 schrieb Dmitry Osipenko:
>> On 11/17/22 15:59, Dmitry Osipenko wrote:
>>> On 11/17/22 15:55, Christian König wrote:
 Am 17.11.22 um 13:47 schrieb Dmitry Osipenko:
> On 11/17/22 12:53, Christian König wrote:
>> Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:
>>> Hi,
>>>
>>> On 10/14/22 11:46, Christian König wrote:
 +/* Remove the entity from the scheduler and kill all pending
 jobs */
 +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
 +{
 +    struct drm_sched_job *job;
 +    struct dma_fence *prev;
 +
 +    if (!entity->rq)
 +    return;
 +
 +    spin_lock(&entity->rq_lock);
 +    entity->stopped = true;
 +    drm_sched_rq_remove_entity(entity->rq, entity);
 +    spin_unlock(&entity->rq_lock);
 +
 +    /* Make sure this entity is not used by the scheduler at the
 moment */
 +    wait_for_completion(&entity->entity_idle);
>>> I'm always hitting a lockup here using the Panfrost driver when terminating
>>> Xorg. Reverting this patch helps. Any ideas how to fix it?
>>>
>> Well is the entity idle or are there some unsubmitted jobs left?
> Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.
>
> I see that there are 5-6 incomplete (in-flight) jobs when
> panfrost_job_close() is invoked.
>
> There are 1-2 jobs that are constantly scheduled and finished once
> in a
> few seconds after the lockup happens.
 Well what drm_sched_entity_kill() is supposed to do is to prevent
 pushing queued up stuff to the hw when the process which queued it is
 killed. Is the process really killed or is that just some incorrect
 handling?
>>> It's actually 5-6 incomplete jobs of Xorg that are hanging when Xorg
>>> process is closed.
>>>
>>> The two re-scheduled jobs are from sddm, so it's only the Xorg context
>>> that hangs.
>>>
 In other words I see two possibilities here, either we have a bug in
 the
 scheduler or panfrost isn't using it correctly.

 Does panfrost call drm_sched_entity_flush() before it calls
 drm_sched_entity_fini()? (I don't have the driver source at hand at the
 moment).
>>> Panfrost doesn't use drm_sched_entity_flush(), nor
>>> drm_sched_entity_flush().
>> *nor drm_sched_entity_fini()
> 
> Well that would mean that this is *really* buggy! How do you then end up
> in drm_sched_entity_kill()? From drm_sched_entity_destroy()?

Yes, from drm_sched_entity_destroy().

> drm_sched_entity_flush() should be called from the flush callback from
> the file_operations structure of panfrost. See amdgpu_flush() and
> amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all
> entities of the process/file descriptor to be flushed out.
> 
> drm_sched_entity_fini() must be called before you free the memory of the
> entity structure, or otherwise we would run into a use-after-free.

Right, drm_sched_entity_destroy() invokes these two functions and
Panfrost uses drm_sched_entity_destroy().
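
For reference, drm_sched_entity_destroy() is essentially a wrapper around
those two calls; roughly (sketch from memory of the scheduler code, not
quoted from this series):

void drm_sched_entity_destroy(struct drm_sched_entity *entity)
{
        /* Flush first (wait for the queue to drain), then tear down. */
        drm_sched_entity_flush(entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);
        drm_sched_entity_fini(entity);
}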

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Christian König

Am 17.11.22 um 14:00 schrieb Dmitry Osipenko:

On 11/17/22 15:59, Dmitry Osipenko wrote:

On 11/17/22 15:55, Christian König wrote:

Am 17.11.22 um 13:47 schrieb Dmitry Osipenko:

On 11/17/22 12:53, Christian König wrote:

Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:

Hi,

On 10/14/22 11:46, Christian König wrote:

+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+    struct drm_sched_job *job;
+    struct dma_fence *prev;
+
+    if (!entity->rq)
+    return;
+
+    spin_lock(&entity->rq_lock);
+    entity->stopped = true;
+    drm_sched_rq_remove_entity(entity->rq, entity);
+    spin_unlock(&entity->rq_lock);
+
+    /* Make sure this entity is not used by the scheduler at the
moment */
+    wait_for_completion(&entity->entity_idle);

I'm always hitting a lockup here using the Panfrost driver when terminating
Xorg. Reverting this patch helps. Any ideas how to fix it?


Well is the entity idle or are there some unsubmitted jobs left?

Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.

I see that there are 5-6 incomplete (in-flight) jobs when
panfrost_job_close() is invoked.

There are 1-2 jobs that are constantly scheduled and finished once in a
few seconds after the lockup happens.

Well what drm_sched_entity_kill() is supposed to do is to prevent
pushing queued up stuff to the hw when the process which queued it is
killed. Is the process really killed or is that just some incorrect
handling?

It's actually 5-6 incomplete jobs of Xorg that are hanging when Xorg
process is closed.

The two re-scheduled jobs are from sddm, so it's only the Xorg context
that hangs.


In other words I see two possibilities here, either we have a bug in the
scheduler or panfrost isn't using it correctly.

Does panfrost call drm_sched_entity_flush() before it calls
drm_sched_entity_fini()? (I don't have the driver source at hand at the
moment).

Panfrost doesn't use drm_sched_entity_flush(), nor drm_sched_entity_flush().

*nor drm_sched_entity_fini()


Well that would mean that this is *really* buggy! How do you then end up 
in drm_sched_entity_kill()? From drm_sched_entity_destroy()?


drm_sched_entity_flush() should be called from the flush callback from 
the file_operations structure of panfrost. See amdgpu_flush() and 
amdgpu_ctx_mgr_entity_flush(). This makes sure that we wait for all 
entities of the process/file descriptor to be flushed out.


drm_sched_entity_fini() must be called before you free the memory of the 
entity structure, or otherwise we would run into a use-after-free.
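
A rough illustration of that wiring for panfrost, modelled on amdgpu_flush();
the panfrost_* names, the driver_priv layout and the NUM_JOB_SLOTS loop are
assumptions for the sketch, not actual panfrost code:

#include <linux/fs.h>
#include <drm/drm_file.h>
#include <drm/gpu_scheduler.h>

/* Hypothetical .flush callback in the driver's file_operations. */
static int panfrost_drm_flush(struct file *filp, fl_owner_t id)
{
        struct drm_file *file = filp->private_data;
        struct panfrost_file_priv *priv = file->driver_priv;  /* assumed layout */
        long timeout = MAX_WAIT_SCHED_ENTITY_Q_EMPTY;
        int i;

        /* Wait for all entities of this file descriptor to drain. */
        for (i = 0; i < NUM_JOB_SLOTS; i++)
                timeout = drm_sched_entity_flush(&priv->sched_entity[i],
                                                 timeout);

        return 0;
}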


Regards,
Christian.


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Dmitry Osipenko
On 11/17/22 15:59, Dmitry Osipenko wrote:
> On 11/17/22 15:55, Christian König wrote:
>> Am 17.11.22 um 13:47 schrieb Dmitry Osipenko:
>>> On 11/17/22 12:53, Christian König wrote:
 Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:
> Hi,
>
> On 10/14/22 11:46, Christian König wrote:
>> +/* Remove the entity from the scheduler and kill all pending jobs */
>> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>> +{
>> +    struct drm_sched_job *job;
>> +    struct dma_fence *prev;
>> +
>> +    if (!entity->rq)
>> +    return;
>> +
>> +    spin_lock(&entity->rq_lock);
>> +    entity->stopped = true;
>> +    drm_sched_rq_remove_entity(entity->rq, entity);
>> +    spin_unlock(&entity->rq_lock);
>> +
>> +    /* Make sure this entity is not used by the scheduler at the
>> moment */
>> +    wait_for_completion(&entity->entity_idle);
> I'm always hitting a lockup here using the Panfrost driver when terminating
> Xorg. Reverting this patch helps. Any ideas how to fix it?
>
 Well is the entity idle or are there some unsubmitted jobs left?
>>> Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.
>>>
>>> I see that there are 5-6 incomplete (in-flight) jobs when
>>> panfrost_job_close() is invoked.
>>>
>>> There are 1-2 jobs that are constantly scheduled and finished once in a
>>> few seconds after the lockup happens.
>>
>> Well what drm_sched_entity_kill() is supposed to do is to prevent
>> pushing queued up stuff to the hw when the process which queued it is
>> killed. Is the process really killed or is that just some incorrect
>> handling?
> 
> It's actually 5-6 incomplete jobs of Xorg that are hanging when Xorg
> process is closed.
> 
> The two re-scheduled jobs are from sddm, so it's only the Xorg context
> that hangs.
> 
>> In other words I see two possibilities here, either we have a bug in the
>> scheduler or panfrost isn't using it correctly.
>>
>> Does panfrost call drm_sched_entity_flush() before it calls
>> drm_sched_entity_fini()? (I don't have the driver source at hand at the
>> moment).
> 
> Panfrost doesn't use drm_sched_entity_flush(), nor drm_sched_entity_flush().

*nor drm_sched_entity_fini()

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Dmitry Osipenko
On 11/17/22 15:55, Christian König wrote:
> Am 17.11.22 um 13:47 schrieb Dmitry Osipenko:
>> On 11/17/22 12:53, Christian König wrote:
>>> Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:
 Hi,

 On 10/14/22 11:46, Christian König wrote:
> +/* Remove the entity from the scheduler and kill all pending jobs */
> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
> +{
> +    struct drm_sched_job *job;
> +    struct dma_fence *prev;
> +
> +    if (!entity->rq)
> +    return;
> +
> +    spin_lock(&entity->rq_lock);
> +    entity->stopped = true;
> +    drm_sched_rq_remove_entity(entity->rq, entity);
> +    spin_unlock(&entity->rq_lock);
> +
> +    /* Make sure this entity is not used by the scheduler at the
> moment */
> +    wait_for_completion(&entity->entity_idle);
 I'm always hitting a lockup here using the Panfrost driver when terminating
 Xorg. Reverting this patch helps. Any ideas how to fix it?

>>> Well is the entity idle or are there some unsubmitted jobs left?
>> Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.
>>
>> I see that there are 5-6 incomplete (in-flight) jobs when
>> panfrost_job_close() is invoked.
>>
>> There are 1-2 jobs that are constantly scheduled and finished once in a
>> few seconds after the lockup happens.
> 
> Well what drm_sched_entity_kill() is supposed to do is to prevent
> pushing queued up stuff to the hw when the process which queued it is
> killed. Is the process really killed or is that just some incorrect
> handling?

It's actually 5-6 incomplete jobs of Xorg that are hanging when Xorg
process is closed.

The two re-scheduled jobs are from sddm, so it's only the Xorg context
that hangs.

> In other words I see two possibilities here, either we have a bug in the
> scheduler or panfrost isn't using it correctly.
> 
> Does panfrost call drm_sched_entity_flush() before it calls
> drm_sched_entity_fini()? (I don't have the driver source at hand at the
> moment).

Panfrost doesn't use drm_sched_entity_flush(), nor drm_sched_entity_flush().

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Christian König

Am 17.11.22 um 13:47 schrieb Dmitry Osipenko:

On 11/17/22 12:53, Christian König wrote:

Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:

Hi,

On 10/14/22 11:46, Christian König wrote:

+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+    struct drm_sched_job *job;
+    struct dma_fence *prev;
+
+    if (!entity->rq)
+    return;
+
+    spin_lock(&entity->rq_lock);
+    entity->stopped = true;
+    drm_sched_rq_remove_entity(entity->rq, entity);
+    spin_unlock(&entity->rq_lock);
+
+    /* Make sure this entity is not used by the scheduler at the
moment */
+    wait_for_completion(&entity->entity_idle);

I'm always hitting a lockup here using the Panfrost driver when terminating
Xorg. Reverting this patch helps. Any ideas how to fix it?


Well is the entity idle or are there some unsubmitted jobs left?

Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.

I see that there are 5-6 incomplete (in-flight) jobs when
panfrost_job_close() is invoked.

There are 1-2 jobs that are constantly scheduled and finished once in a
few seconds after the lockup happens.


Well what drm_sched_entity_kill() is supposed to do is to prevent 
pushing queued up stuff to the hw when the process which queued it is 
killed. Is the process really killed or is that just some incorrect 
handling?


In other words I see two possibilities here, either we have a bug in the 
scheduler or panfrost isn't using it correctly.


Does panfrost call drm_sched_entity_flush() before it calls 
drm_sched_entity_fini()? (I don't have the driver source at hand at the 
moment).


Regards,
Christian.


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Dmitry Osipenko
On 11/17/22 12:53, Christian König wrote:
> Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:
>> Hi,
>>
>> On 10/14/22 11:46, Christian König wrote:
>>> +/* Remove the entity from the scheduler and kill all pending jobs */
>>> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
>>> +{
>>> +    struct drm_sched_job *job;
>>> +    struct dma_fence *prev;
>>> +
>>> +    if (!entity->rq)
>>> +    return;
>>> +
>>> +    spin_lock(&entity->rq_lock);
>>> +    entity->stopped = true;
>>> +    drm_sched_rq_remove_entity(entity->rq, entity);
>>> +    spin_unlock(&entity->rq_lock);
>>> +
>>> +    /* Make sure this entity is not used by the scheduler at the
>>> moment */
>>> +    wait_for_completion(&entity->entity_idle);
>> I'm always hitting a lockup here using the Panfrost driver when terminating
>> Xorg. Reverting this patch helps. Any ideas how to fix it?
>>
> 
> Well is the entity idle or are there some unsubmitted jobs left?

Do you mean unsubmitted to h/w? IIUC, there are unsubmitted jobs left.

I see that there are 5-6 incomplete (in-flight) jobs when
panfrost_job_close() is invoked.

There are 1-2 jobs that are constantly scheduled and finished once in a
few seconds after the lockup happens.

-- 
Best regards,
Dmitry



Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-17 Thread Christian König

Am 17.11.22 um 03:36 schrieb Dmitry Osipenko:

Hi,

On 10/14/22 11:46, Christian König wrote:

+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+   struct drm_sched_job *job;
+   struct dma_fence *prev;
+
+   if (!entity->rq)
+   return;
+
+   spin_lock(&entity->rq_lock);
+   entity->stopped = true;
+   drm_sched_rq_remove_entity(entity->rq, entity);
+   spin_unlock(&entity->rq_lock);
+
+   /* Make sure this entity is not used by the scheduler at the moment */
+   wait_for_completion(&entity->entity_idle);

I'm always hitting a lockup here using the Panfrost driver when terminating
Xorg. Reverting this patch helps. Any ideas how to fix it?



Well is the entity idle or are there some unsubmitted jobs left?

Regards,
Christian.


Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-11-16 Thread Dmitry Osipenko
Hi,

On 10/14/22 11:46, Christian König wrote:
> +/* Remove the entity from the scheduler and kill all pending jobs */
> +static void drm_sched_entity_kill(struct drm_sched_entity *entity)
> +{
> + struct drm_sched_job *job;
> + struct dma_fence *prev;
> +
> + if (!entity->rq)
> + return;
> +
> + spin_lock(&entity->rq_lock);
> + entity->stopped = true;
> + drm_sched_rq_remove_entity(entity->rq, entity);
> + spin_unlock(&entity->rq_lock);
> +
> + /* Make sure this entity is not used by the scheduler at the moment */
> + wait_for_completion(&entity->entity_idle);

I'm always hitting a lockup here using the Panfrost driver when terminating
Xorg. Reverting this patch helps. Any ideas how to fix it?

-- 
Best regards,
Dmitry



[PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-10-14 Thread Christian König
This was buggy because when we had to wait for entities which were
killed as well, we would just deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/scheduler/sched_entity.c | 197 +++
 1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index 243ff384cde8..54ac37cd5017 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity 
*entity)
return true;
 }
 
+static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
+{
+   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+   drm_sched_fence_scheduled(job->s_fence);
+   drm_sched_fence_finished(job->s_fence);
+   WARN_ON(job->s_fence->parent);
+   job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question is killed. 
*/
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+ struct dma_fence_cb *cb)
+{
+   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+finish_cb);
+   int r;
+
+   dma_fence_put(f);
+
+   /* Wait for all dependencies to avoid data corruptions */
+   while (!xa_empty(&job->dependencies)) {
+   f = xa_erase(&job->dependencies, job->last_dependency++);
+   r = dma_fence_add_callback(f, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb);
+   if (!r)
+   return;
+
+   dma_fence_put(f);
+   }
+
+   init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+   irq_work_queue(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+   struct drm_sched_job *job;
+   struct dma_fence *prev;
+
+   if (!entity->rq)
+   return;
+
+   spin_lock(&entity->rq_lock);
+   entity->stopped = true;
+   drm_sched_rq_remove_entity(entity->rq, entity);
+   spin_unlock(&entity->rq_lock);
+
+   /* Make sure this entity is not used by the scheduler at the moment */
+   wait_for_completion(&entity->entity_idle);
+
+   prev = dma_fence_get(entity->last_scheduled);
+   while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
+   struct drm_sched_fence *s_fence = job->s_fence;
+
+   dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+   dma_fence_get(&s_fence->finished);
+   if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb))
+   drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+   prev = &s_fence->finished;
+   }
+   dma_fence_put(prev);
+}
+
 /**
  * drm_sched_entity_flush - Flush a context entity
  *
@@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity 
*entity, long timeout)
/* For killed process disable any more IBs enqueue right now */
last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
if ((!last_user || last_user == current->group_leader) &&
-   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
-   spin_lock(&entity->rq_lock);
-   entity->stopped = true;
-   drm_sched_rq_remove_entity(entity->rq, entity);
-   spin_unlock(&entity->rq_lock);
-   }
+   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+   drm_sched_entity_kill(entity);
 
return ret;
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
-static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
-{
-   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-   drm_sched_fence_finished(job->s_fence);
-   WARN_ON(job->s_fence->parent);
-   job->sched->ops->free_job(job);
-}
-
-
-/* Signal the scheduler finished fence when the entity in question is killed. 
*/
-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
-   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-finish_cb);
-
-   dma_fence_put(f);
-   init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
-   irq_work_queue(&job->work);
-}
-
-static struct dma_fence *
-drm_sched_job_dependency(struct drm_sched_job *job,
-struct drm_sched_entity *entity)
-{
-   if (!xa_empty(&job->dependencies))
-   return xa_erase(&job->dependencies, job->last_dependency++);
-
-   if (job->sched->ops->dependency)

Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-09-30 Thread Andrey Grodzovsky



On 2022-09-30 07:51, Christian König wrote:

Am 29.09.22 um 21:20 schrieb Andrey Grodzovsky:


On 2022-09-29 09:21, Christian König wrote:

This was buggy because when we had to wait for entities which were
killed as well, we would just deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/scheduler/sched_entity.c | 197 
+++

  1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index 1bb1437a8fed..1d448e376811 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct 
drm_sched_entity *entity)

  return true;
  }
  +static void drm_sched_entity_kill_jobs_irq_work(struct irq_work 
*wrk)

+{
+    struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+    drm_sched_fence_scheduled(job->s_fence);
+    drm_sched_fence_finished(job->s_fence);
+    WARN_ON(job->s_fence->parent);
+    job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question 
is killed. */

+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+  struct dma_fence_cb *cb)
+{
+    struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+ finish_cb);
+    int r;
+
+    dma_fence_put(f);
+
+    /* Wait for all dependencies to avoid data corruptions */
+    while (!xa_empty(&job->dependencies)) {
+    f = xa_erase(&job->dependencies, job->last_dependency++);
+    r = dma_fence_add_callback(f, &job->finish_cb,
+   drm_sched_entity_kill_jobs_cb);
+    if (!r)
+    return;
+
+    dma_fence_put(f);
+    }
+
+    init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+    irq_work_queue(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+    struct drm_sched_job *job;
+    struct dma_fence *prev;
+
+    if (!entity->rq)
+    return;
+
+    spin_lock(&entity->rq_lock);
+    entity->stopped = true;
+    drm_sched_rq_remove_entity(entity->rq, entity);
+    spin_unlock(&entity->rq_lock);
+
+    /* Make sure this entity is not used by the scheduler at the 
moment */

+    wait_for_completion(&entity->entity_idle);



Does it really stop processing in case more jobs are already pending in the
entity queue?
It probably makes sense to integrate drm_sched_entity_is_idle() into
drm_sched_entity_is_ready() to prevent the rq from selecting this entity in
such a case.


We make sure that the entity is not used for scheduling any more by 
calling drm_sched_rq_remove_entity() right above this check here.


The wait is only to prevent us from racing with the scheduler thread 
submitting one more job from the entity.



Ok, missed the entity remove in the code







+
+    prev = dma_fence_get(entity->last_scheduled);
+    while ((job = 
to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {

+    struct drm_sched_fence *s_fence = job->s_fence;
+
+    dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+    dma_fence_get(&s_fence->finished);
+    if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+   drm_sched_entity_kill_jobs_cb))
+    drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+    prev = &s_fence->finished;
+    }
+    dma_fence_put(prev);
+}
+
  /**
   * drm_sched_entity_flush - Flush a context entity
   *
@@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct 
drm_sched_entity *entity, long timeout)

  /* For killed process disable any more IBs enqueue right now */
  last_user = cmpxchg(&entity->last_user, current->group_leader, 
NULL);

  if ((!last_user || last_user == current->group_leader) &&
-    (current->flags & PF_EXITING) && (current->exit_code == 
SIGKILL)) {

-    spin_lock(&entity->rq_lock);
-    entity->stopped = true;
-    drm_sched_rq_remove_entity(entity->rq, entity);
-    spin_unlock(&entity->rq_lock);
-    }
+    (current->flags & PF_EXITING) && (current->exit_code == 
SIGKILL))

+    drm_sched_entity_kill(entity);



This reverts 'drm/scheduler: only kill entity if last user is killed v2' and
so might break this use case - when the entity gets passed through an FD into
a child process. Why does this need to be removed?


This patch isn't reverted. I keep the "last_user == 
current->group_leader" check in the line above.


Christian.



OK, second time I didn't look carefully... My bad

Another question - you call drm_sched_entity_kill() now both in
drm_sched_entity_flush() and in drm_sched_entity_fini() - so for
drm_sched_entity_destroy() it will be called twice, no? Because it will be
called from drm_sched_entity_flush() and from drm_sched_entity_fini(). Why do
we need to call drm_sched_entity_kill() from flush? With your new async
scheme of jobs 

Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-09-30 Thread Christian König

Am 29.09.22 um 21:20 schrieb Andrey Grodzovsky:


On 2022-09-29 09:21, Christian König wrote:

This was buggy because when we had to wait for entities which were
killed as well, we would just deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/scheduler/sched_entity.c | 197 +++
  1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index 1bb1437a8fed..1d448e376811 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct 
drm_sched_entity *entity)

  return true;
  }
  +static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
+{
+    struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+    drm_sched_fence_scheduled(job->s_fence);
+    drm_sched_fence_finished(job->s_fence);
+    WARN_ON(job->s_fence->parent);
+    job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question 
is killed. */

+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+  struct dma_fence_cb *cb)
+{
+    struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+ finish_cb);
+    int r;
+
+    dma_fence_put(f);
+
+    /* Wait for all dependencies to avoid data corruptions */
+    while (!xa_empty(&job->dependencies)) {
+    f = xa_erase(&job->dependencies, job->last_dependency++);
+    r = dma_fence_add_callback(f, &job->finish_cb,
+   drm_sched_entity_kill_jobs_cb);
+    if (!r)
+    return;
+
+    dma_fence_put(f);
+    }
+
+    init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+    irq_work_queue(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+    struct drm_sched_job *job;
+    struct dma_fence *prev;
+
+    if (!entity->rq)
+    return;
+
+    spin_lock(&entity->rq_lock);
+    entity->stopped = true;
+    drm_sched_rq_remove_entity(entity->rq, entity);
+    spin_unlock(&entity->rq_lock);
+
+    /* Make sure this entity is not used by the scheduler at the 
moment */

+    wait_for_completion(&entity->entity_idle);



Does it really stop processing in case more jobs are already pending in the
entity queue?
It probably makes sense to integrate drm_sched_entity_is_idle() into
drm_sched_entity_is_ready() to prevent the rq from selecting this entity in
such a case.


We make sure that the entity is not used for scheduling any more by 
calling drm_sched_rq_remove_entity() right above this check here.


The wait is only to prevent us from racing with the scheduler thread 
submitting one more job from the entity.
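
For context, a paraphrased sketch of the relevant part of the scheduler
thread's loop in drm_sched_main() (not an exact quote of the code):

        entity = drm_sched_select_entity(sched);
        if (!entity)
                continue;

        sched_job = drm_sched_entity_pop_job(entity);
        if (!sched_job) {
                complete(&entity->entity_idle);
                continue;
        }

        /* ... hand the job to run_job() ... */
        complete(&entity->entity_idle);

Once drm_sched_rq_remove_entity() has run, drm_sched_select_entity() can no
longer return this entity, so wait_for_completion(&entity->entity_idle) only
has to cover a job the thread may already have popped.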





+
+    prev = dma_fence_get(entity->last_scheduled);
+    while ((job = 
to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {

+    struct drm_sched_fence *s_fence = job->s_fence;
+
+    dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+    dma_fence_get(&s_fence->finished);
+    if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+   drm_sched_entity_kill_jobs_cb))
+    drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+    prev = &s_fence->finished;
+    }
+    dma_fence_put(prev);
+}
+
  /**
   * drm_sched_entity_flush - Flush a context entity
   *
@@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct 
drm_sched_entity *entity, long timeout)

  /* For killed process disable any more IBs enqueue right now */
  last_user = cmpxchg(&entity->last_user, current->group_leader, 
NULL);

  if ((!last_user || last_user == current->group_leader) &&
-    (current->flags & PF_EXITING) && (current->exit_code == 
SIGKILL)) {

-    spin_lock(&entity->rq_lock);
-    entity->stopped = true;
-    drm_sched_rq_remove_entity(entity->rq, entity);
-    spin_unlock(&entity->rq_lock);
-    }
+    (current->flags & PF_EXITING) && (current->exit_code == 
SIGKILL))

+    drm_sched_entity_kill(entity);



This reverts 'drm/scheduler: only kill entity if last user is killed v2' and
so might break this use case - when the entity gets passed through an FD into
a child process. Why does this need to be removed?


This patch isn't reverted. I keep the "last_user == 
current->group_leader" check in the line above.


Christian.



Andrey



    return ret;
  }
  EXPORT_SYMBOL(drm_sched_entity_flush);
  -static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
-{
-    struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-    drm_sched_fence_finished(job->s_fence);
-    WARN_ON(job->s_fence->parent);
-    job->sched->ops->free_job(job);
-}
-
-
-/* Signal the scheduler finished fence when the entity in question 
is killed. */

-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
-   

Re: [PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-09-29 Thread Andrey Grodzovsky



On 2022-09-29 09:21, Christian König wrote:

This was buggy because when we had to wait for entities which were
killed as well, we would just deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/scheduler/sched_entity.c | 197 +++
  1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index 1bb1437a8fed..1d448e376811 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity 
*entity)
return true;
  }
  
+static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)

+{
+   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+   drm_sched_fence_scheduled(job->s_fence);
+   drm_sched_fence_finished(job->s_fence);
+   WARN_ON(job->s_fence->parent);
+   job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question is killed. 
*/
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+ struct dma_fence_cb *cb)
+{
+   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+finish_cb);
+   int r;
+
+   dma_fence_put(f);
+
+   /* Wait for all dependencies to avoid data corruptions */
+   while (!xa_empty(&job->dependencies)) {
+   f = xa_erase(&job->dependencies, job->last_dependency++);
+   r = dma_fence_add_callback(f, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb);
+   if (!r)
+   return;
+
+   dma_fence_put(f);
+   }
+
+   init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+   irq_work_queue(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+   struct drm_sched_job *job;
+   struct dma_fence *prev;
+
+   if (!entity->rq)
+   return;
+
+   spin_lock(&entity->rq_lock);
+   entity->stopped = true;
+   drm_sched_rq_remove_entity(entity->rq, entity);
+   spin_unlock(&entity->rq_lock);
+
+   /* Make sure this entity is not used by the scheduler at the moment */
+   wait_for_completion(&entity->entity_idle);



Does it really stop processing in case more jobs are already pending in the
entity queue?
It probably makes sense to integrate drm_sched_entity_is_idle() into
drm_sched_entity_is_ready() to prevent the rq from selecting this entity in
such a case.


+
+   prev = dma_fence_get(entity->last_scheduled);
+   while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
+   struct drm_sched_fence *s_fence = job->s_fence;
+
+   dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+   dma_fence_get(&s_fence->finished);
+   if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb))
+   drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+   prev = &s_fence->finished;
+   }
+   dma_fence_put(prev);
+}
+
  /**
   * drm_sched_entity_flush - Flush a context entity
   *
@@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity 
*entity, long timeout)
/* For killed process disable any more IBs enqueue right now */
last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
if ((!last_user || last_user == current->group_leader) &&
-   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
-   spin_lock(&entity->rq_lock);
-   entity->stopped = true;
-   drm_sched_rq_remove_entity(entity->rq, entity);
-   spin_unlock(&entity->rq_lock);
-   }
+   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+   drm_sched_entity_kill(entity);



This reverts 'drm/scheduler: only kill entity if last user is killed v2' and
so might break this use case - when the entity gets passed through an FD into
a child process. Why does this need to be removed?

Andrey


  
  	return ret;

  }
  EXPORT_SYMBOL(drm_sched_entity_flush);
  
-static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)

-{
-   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-   drm_sched_fence_finished(job->s_fence);
-   WARN_ON(job->s_fence->parent);
-   job->sched->ops->free_job(job);
-}
-
-
-/* Signal the scheduler finished fence when the entity in question is killed. 
*/
-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
-   struct drm_sched_job *job = container_of(cb, struct 

[PATCH 12/13] drm/scheduler: rework entity flush, kill and fini

2022-09-29 Thread Christian König
This was buggy because when we had to wait for entities which were
killed as well, we would just deadlock.

Instead, move all the dependency handling into the callbacks so that
it all happens asynchronously.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/scheduler/sched_entity.c | 197 +++
 1 file changed, 92 insertions(+), 105 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index 1bb1437a8fed..1d448e376811 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -139,6 +139,74 @@ bool drm_sched_entity_is_ready(struct drm_sched_entity 
*entity)
return true;
 }
 
+static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
+{
+   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
+
+   drm_sched_fence_scheduled(job->s_fence);
+   drm_sched_fence_finished(job->s_fence);
+   WARN_ON(job->s_fence->parent);
+   job->sched->ops->free_job(job);
+}
+
+/* Signal the scheduler finished fence when the entity in question is killed. 
*/
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+ struct dma_fence_cb *cb)
+{
+   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+finish_cb);
+   int r;
+
+   dma_fence_put(f);
+
+   /* Wait for all dependencies to avoid data corruptions */
+   while (!xa_empty(&job->dependencies)) {
+   f = xa_erase(&job->dependencies, job->last_dependency++);
+   r = dma_fence_add_callback(f, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb);
+   if (!r)
+   return;
+
+   dma_fence_put(f);
+   }
+
+   init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
+   irq_work_queue(&job->work);
+}
+
+/* Remove the entity from the scheduler and kill all pending jobs */
+static void drm_sched_entity_kill(struct drm_sched_entity *entity)
+{
+   struct drm_sched_job *job;
+   struct dma_fence *prev;
+
+   if (!entity->rq)
+   return;
+
+   spin_lock(&entity->rq_lock);
+   entity->stopped = true;
+   drm_sched_rq_remove_entity(entity->rq, entity);
+   spin_unlock(&entity->rq_lock);
+
+   /* Make sure this entity is not used by the scheduler at the moment */
+   wait_for_completion(&entity->entity_idle);
+
+   prev = dma_fence_get(entity->last_scheduled);
+   while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
+   struct drm_sched_fence *s_fence = job->s_fence;
+
+   dma_fence_set_error(&s_fence->finished, -ESRCH);
+
+   dma_fence_get(&s_fence->finished);
+   if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
+  drm_sched_entity_kill_jobs_cb))
+   drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+
+   prev = &s_fence->finished;
+   }
+   dma_fence_put(prev);
+}
+
 /**
  * drm_sched_entity_flush - Flush a context entity
  *
@@ -179,91 +247,13 @@ long drm_sched_entity_flush(struct drm_sched_entity 
*entity, long timeout)
/* For killed process disable any more IBs enqueue right now */
last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
if ((!last_user || last_user == current->group_leader) &&
-   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) {
-   spin_lock(&entity->rq_lock);
-   entity->stopped = true;
-   drm_sched_rq_remove_entity(entity->rq, entity);
-   spin_unlock(&entity->rq_lock);
-   }
+   (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+   drm_sched_entity_kill(entity);
 
return ret;
 }
 EXPORT_SYMBOL(drm_sched_entity_flush);
 
-static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
-{
-   struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
-
-   drm_sched_fence_finished(job->s_fence);
-   WARN_ON(job->s_fence->parent);
-   job->sched->ops->free_job(job);
-}
-
-
-/* Signal the scheduler finished fence when the entity in question is killed. 
*/
-static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
-   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-finish_cb);
-
-   dma_fence_put(f);
-   init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
-   irq_work_queue(&job->work);
-}
-
-static struct dma_fence *
-drm_sched_job_dependency(struct drm_sched_job *job,
-struct drm_sched_entity *entity)
-{
-   if (!xa_empty(&job->dependencies))
-   return xa_erase(&job->dependencies, job->last_dependency++);
-
-   if (job->sched->ops->dependency)