On Fri, 2025-07-04 at 11:53 +0200, Philipp Stanner wrote:
> On Wed, 2025-07-02 at 12:25 +0100, Tvrtko Ursulin wrote:
> > 
> > On 02/07/2025 11:56, Philipp Stanner wrote:
> > > On Wed, 2025-07-02 at 11:36 +0100, Tvrtko Ursulin wrote:
> > > > 
> > > > On 01/07/2025 14:21, Philipp Stanner wrote:
> > > > > The GPU Scheduler now supports a new callback, cancel_job(),
> > > > > which lets the scheduler cancel all jobs which might not yet
> > > > > be freed when drm_sched_fini() runs. Using this callback
> > > > > allows for significantly simplifying the mock scheduler
> > > > > teardown code.
> > > > > 
> > > > > Implement the cancel_job() callback and adjust the code where
> > > > > necessary.
> > > > 
> > > > Cross-referencing against my version, I think you missed this
> > > > hunk:
> > > > 
> > > > --- a/drivers/gpu/drm/scheduler/tests/sched_tests.h
> > > > +++ b/drivers/gpu/drm/scheduler/tests/sched_tests.h
> > > > @@ -49,7 +49,6 @@ struct drm_mock_scheduler {
> > > >  
> > > >  	spinlock_t		lock;
> > > >  	struct list_head	job_list;
> > > > -	struct list_head	done_list;
> > > >  
> > > >  	struct {
> > > >  		u64		context;
> > > 
> > > Right, overlooked that one.
> > > 
> > > > I also had this:
> > > > 
> > > > @@ -97,7 +96,8 @@ struct drm_mock_sched_job {
> > > >  	struct completion	done;
> > > >  
> > > >  #define DRM_MOCK_SCHED_JOB_DONE	0x1
> > > > -#define DRM_MOCK_SCHED_JOB_TIMEDOUT	0x2
> > > > +#define DRM_MOCK_SCHED_JOB_CANCELED	0x2
> > > > +#define DRM_MOCK_SCHED_JOB_TIMEDOUT	0x4
> > > > 
> > > > And was setting it in the callback. And since we should add a
> > > > test to explicitly cover the new callback, and just the
> > > > callback, that could make it very easy to do it.
> > > 
> > > What do you imagine that would look like? The scheduler only
> > > invokes the callback on teardown.
> > > 
> > > We also don't have tests that only test free_job() and the like,
> > > do we?
> > > 
> > > You cannot test a callback for the scheduler, because the
> > > callback is implemented in the driver.
> > > 
> > > Callbacks are tested by using the scheduler. In this case, it's
> > > tested the intended way by the unit tests invoking
> > > drm_sched_fini().
> > 
> > Something like (untested):
> > 
> > static void drm_sched_test_cleanup(struct kunit *test)
> > {
> > 	struct drm_mock_sched_entity *entity;
> > 	struct drm_mock_scheduler *sched;
> > 	struct drm_mock_sched_job *job;
> > 	bool done;
> > 
> > 	/*
> > 	 * Check that the job cancel callback gets invoked by the
> > 	 * scheduler.
> > 	 */
> > 
> > 	sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT);
> > 	entity = drm_mock_sched_entity_new(test,
> > 					   DRM_SCHED_PRIORITY_NORMAL,
> > 					   sched);
> > 	job = drm_mock_sched_job_new(test, entity);
> > 
> > 	drm_mock_sched_job_submit(job);
> > 	done = drm_mock_sched_job_wait_scheduled(job, HZ);
> > 	KUNIT_ASSERT_TRUE(test, done);
> > 
> > 	drm_mock_sched_entity_free(entity);
> > 	drm_mock_sched_fini(sched);
> > 
> > 	KUNIT_ASSERT_TRUE(test, job->flags & DRM_MOCK_SCHED_JOB_CANCELED);
> > }
> 
> That could work – but it's racy.
> 
> I wonder if we want to introduce a mechanism into the mock scheduler
> through which we can enforce a simulated GPU hang. Then it would
> never race, and that might be useful for more advanced tests for
> reset recovery and those things.
> 
> Opinions?

Ah wait, we can do that with job_duration = 0.

Very good, I'll include something like that in v2.
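For the record, roughly this (untested, essentially your sketch; the
test name is a placeholder, and it assumes the
DRM_MOCK_SCHED_JOB_CANCELED flag from your diff gets set by the new
cancel_job() callback, and that a job which never had a duration set
never completes on its own):

static void drm_sched_basic_cancel(struct kunit *test)
{
	struct drm_mock_sched_entity *entity;
	struct drm_mock_scheduler *sched;
	struct drm_mock_sched_job *job;
	bool done;

	/*
	 * No job duration is set, so the job can never complete on its
	 * own; only drm_mock_sched_fini() can end it, via the
	 * cancel_job() callback. Hence no race with regular completion.
	 */

	sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT);
	entity = drm_mock_sched_entity_new(test,
					   DRM_SCHED_PRIORITY_NORMAL,
					   sched);
	job = drm_mock_sched_job_new(test, entity);

	drm_mock_sched_job_submit(job);
	done = drm_mock_sched_job_wait_scheduled(job, HZ);
	KUNIT_ASSERT_TRUE(test, done);

	drm_mock_sched_entity_free(entity);
	drm_mock_sched_fini(sched);

	KUNIT_ASSERT_TRUE(test, job->flags & DRM_MOCK_SCHED_JOB_CANCELED);
}

The MAX_SCHEDULE_TIMEOUT also makes sure the timedout_job() path
cannot fire first.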
P.

> > > P.
> > 
> > Or via the hw fence status.
> > 
> > Regards,
> > 
> > Tvrtko
> > 
> > > > > Signed-off-by: Philipp Stanner <pha...@kernel.org>
> > > > > ---
> > > > >  .../gpu/drm/scheduler/tests/mock_scheduler.c | 66 +++++++------------
> > > > >  1 file changed, 23 insertions(+), 43 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > index 49d067fecd67..2d3169d95200 100644
> > > > > --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
> > > > > @@ -63,7 +63,7 @@ static void drm_mock_sched_job_complete(struct drm_mock_sched_job *job)
> > > > >  	lockdep_assert_held(&sched->lock);
> > > > >  
> > > > >  	job->flags |= DRM_MOCK_SCHED_JOB_DONE;
> > > > > -	list_move_tail(&job->link, &sched->done_list);
> > > > > +	list_del(&job->link);
> > > > >  	dma_fence_signal_locked(&job->hw_fence);
> > > > >  	complete(&job->done);
> > > > >  }
> > > > > @@ -236,26 +236,39 @@ mock_sched_timedout_job(struct drm_sched_job *sched_job)
> > > > >  
> > > > >  static void mock_sched_free_job(struct drm_sched_job *sched_job)
> > > > >  {
> > > > > -	struct drm_mock_scheduler *sched =
> > > > > -			drm_sched_to_mock_sched(sched_job->sched);
> > > > >  	struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
> > > > > -	unsigned long flags;
> > > > >  
> > > > > -	/* Remove from the scheduler done list. */
> > > > > -	spin_lock_irqsave(&sched->lock, flags);
> > > > > -	list_del(&job->link);
> > > > > -	spin_unlock_irqrestore(&sched->lock, flags);
> > > > >  	dma_fence_put(&job->hw_fence);
> > > > > -
> > > > >  	drm_sched_job_cleanup(sched_job);
> > > > >  
> > > > >  	/* Mock job itself is freed by the kunit framework. */
> > > > >  }
> > > > >  
> > > > > +static void mock_sched_cancel_job(struct drm_sched_job *sched_job)
> > > > > +{
> > > > > +	struct drm_mock_scheduler *sched = drm_sched_to_mock_sched(sched_job->sched);
> > > > > +	struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
> > > > > +	unsigned long flags;
> > > > > +
> > > > > +	hrtimer_cancel(&job->timer);
> > > > > +
> > > > > +	spin_lock_irqsave(&sched->lock, flags);
> > > > > +	if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
> > > > > +		list_del(&job->link);
> > > > > +		dma_fence_set_error(&job->hw_fence, -ECANCELED);
> > > > > +		dma_fence_signal_locked(&job->hw_fence);
> > > > > +	}
> > > > > +	spin_unlock_irqrestore(&sched->lock, flags);
> > > > > +
> > > > > +	/* The GPU Scheduler will call drm_sched_backend_ops.free_job(), still.
> > > > > +	 * Mock job itself is freed by the kunit framework. */
> > > > 
> > > > /*
> > > >  * Multiline comment style to stay consistent, at least in this
> > > >  * file.
> > > >  */
> > > > 
> > > > The rest looks good, but I need to revisit the timeout/free
> > > > handling since it has been a while and you changed it recently.
> > > > 
> > > > Regards,
> > > > 
> > > > Tvrtko
> > > > 
> > > > > +}
> > > > > +
> > > > >  static const struct drm_sched_backend_ops drm_mock_scheduler_ops = {
> > > > >  	.run_job = mock_sched_run_job,
> > > > >  	.timedout_job = mock_sched_timedout_job,
> > > > > -	.free_job = mock_sched_free_job
> > > > > +	.free_job = mock_sched_free_job,
> > > > > +	.cancel_job = mock_sched_cancel_job,
> > > > >  };
> > > > >  
> > > > >  /**
> > > > > @@ -289,7 +302,6 @@ struct drm_mock_scheduler *drm_mock_sched_new(struct kunit *test, long timeout)
> > > > >  	sched->hw_timeline.context = dma_fence_context_alloc(1);
> > > > >  	atomic_set(&sched->hw_timeline.next_seqno, 0);
> > > > >  	INIT_LIST_HEAD(&sched->job_list);
> > > > > -	INIT_LIST_HEAD(&sched->done_list);
> > > > >  	spin_lock_init(&sched->lock);
> > > > >  
> > > > >  	return sched;
> > > > > @@ -304,38 +316,6 @@ struct drm_mock_scheduler *drm_mock_sched_new(struct kunit *test, long timeout)
> > > > >   */
> > > > >  void drm_mock_sched_fini(struct drm_mock_scheduler *sched)
> > > > >  {
> > > > > -	struct drm_mock_sched_job *job, *next;
> > > > > -	unsigned long flags;
> > > > > -	LIST_HEAD(list);
> > > > > -
> > > > > -	drm_sched_wqueue_stop(&sched->base);
> > > > > -
> > > > > -	/* Force complete all unfinished jobs. */
> > > > > -	spin_lock_irqsave(&sched->lock, flags);
> > > > > -	list_for_each_entry_safe(job, next, &sched->job_list, link)
> > > > > -		list_move_tail(&job->link, &list);
> > > > > -	spin_unlock_irqrestore(&sched->lock, flags);
> > > > > -
> > > > > -	list_for_each_entry(job, &list, link)
> > > > > -		hrtimer_cancel(&job->timer);
> > > > > -
> > > > > -	spin_lock_irqsave(&sched->lock, flags);
> > > > > -	list_for_each_entry_safe(job, next, &list, link)
> > > > > -		drm_mock_sched_job_complete(job);
> > > > > -	spin_unlock_irqrestore(&sched->lock, flags);
> > > > > -
> > > > > -	/*
> > > > > -	 * Free completed jobs and jobs not yet processed by the DRM scheduler
> > > > > -	 * free worker.
> > > > > -	 */
> > > > > -	spin_lock_irqsave(&sched->lock, flags);
> > > > > -	list_for_each_entry_safe(job, next, &sched->done_list, link)
> > > > > -		list_move_tail(&job->link, &list);
> > > > > -	spin_unlock_irqrestore(&sched->lock, flags);
> > > > > -
> > > > > -	list_for_each_entry_safe(job, next, &list, link)
> > > > > -		mock_sched_free_job(&job->base);
> > > > > -
> > > > >  	drm_sched_fini(&sched->base);
> > > > >  }
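P.S.: For the flag itself, I imagine that's just one line on top of
the mock_sched_cancel_job() hunk above (sketch; putting it inside the
signaled-check is my choice, so it is only set when the job really was
still pending):

 	spin_lock_irqsave(&sched->lock, flags);
 	if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
 		list_del(&job->link);
 		dma_fence_set_error(&job->hw_fence, -ECANCELED);
 		dma_fence_signal_locked(&job->hw_fence);
+		job->flags |= DRM_MOCK_SCHED_JOB_CANCELED;
 	}
 	spin_unlock_irqrestore(&sched->lock, flags);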