Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-16 Thread Tvrtko Ursulin



On 16/09/2024 13:20, Tvrtko Ursulin wrote:


On 16/09/2024 13:11, Christian König wrote:

Am 13.09.24 um 18:05 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

Finally, to align drm_sched_rq_update_fifo_locked(),
drm_sched_rq_add_entity_locked() and
drm_sched_rq_remove_fifo_locked() function signatures, we add rq as a
parameter to the latter.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  8 --
  drivers/gpu/drm/scheduler/sched_main.c   | 34 +++-
  include/drm/gpu_scheduler.h  |  7 ++---
  3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index d982cebc6bee..c48f17faef41 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -517,6 +517,7 @@ struct drm_sched_job 
*drm_sched_entity_pop_job(struct drm_sched_entity *entity)

  if (next) {
  spin_lock(&entity->lock);
  drm_sched_rq_update_fifo_locked(entity,
+    entity->rq,
  next->submit_ts);
  spin_unlock(&entity->lock);
  }
@@ -618,11 +619,14 @@ void drm_sched_entity_push_job(struct 
drm_sched_job *sched_job)

  sched = rq->sched;
  atomic_inc(sched->score);
-    drm_sched_rq_add_entity(rq, entity);
+
+    spin_lock(&rq->lock);
+    drm_sched_rq_add_entity_locked(rq, entity);
  if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-    drm_sched_rq_update_fifo_locked(entity, submit_ts);
+    drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
+    spin_unlock(&rq->lock);
  spin_unlock(&entity->lock);
  drm_sched_wakeup(sched, entity);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index 18a952f73ecb..c0d3f6ac3ae3 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,17 +153,18 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
  return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);

  }
-static inline void drm_sched_rq_remove_fifo_locked(struct 
drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity 
*entity,

+    struct drm_sched_rq *rq)
  {
-    struct drm_sched_rq *rq = entity->rq;
-
  if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
  rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
  RB_CLEAR_NODE(&entity->rb_tree_node);
  }
  }
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity 
*entity, ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+ struct drm_sched_rq *rq,
+ ktime_t ts)
  {
  /*
   * Both locks need to be grabbed, one to protect from 
entity->rq change
@@ -171,17 +172,14 @@ void drm_sched_rq_update_fifo_locked(struct 
drm_sched_entity *entity, ktime_t ts

   * other to update the rb tree structure.
   */
  lockdep_assert_held(&entity->lock);
+    lockdep_assert_held(&rq->lock);
-    spin_lock(&entity->rq->lock);
-
-    drm_sched_rq_remove_fifo_locked(entity);
+    drm_sched_rq_remove_fifo_locked(entity, rq);
  entity->oldest_job_waiting = ts;
-    rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
+    rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
    drm_sched_entity_compare_before);
-
-    spin_unlock(&entity->rq->lock);
  }
  /**
@@ -203,25 +201,23 @@ static void drm_sched_rq_init(struct 
drm_gpu_scheduler *sched,

  }
  /**
- * drm_sched_rq_add_entity - add an entity
+ * drm_sched_rq_add_entity_locked - add an entity
   *
   * @rq: scheduler run queue
   * @entity: scheduler entity
   *
   * Adds a scheduler entity to the run queue.
   */
-void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
- struct drm_sched_entity *entity)
+void drm_sched_rq_add_entity_locked(struct drm_sched_rq *rq,
+    struct drm_sched_entity *entity)
  {
+    lockdep_assert_held(&rq->lock);
+
  if (!list_empty(&entity->list))
  return;
-    spin_lock(&rq->lock);
-
  atomic_inc(rq->sched->score);
  list_add_tail(&entity->list, &rq-

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-16 Thread Tvrtko Ursulin



On 16/09/2024 13:11, Christian König wrote:

Am 13.09.24 um 18:05 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

Finally, to align drm_sched_rq_update_fifo_locked(),
drm_sched_rq_add_entity_locked() and
drm_sched_rq_remove_fifo_locked() function signatures, we add rq as a
parameter to the latter.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  8 --
  drivers/gpu/drm/scheduler/sched_main.c   | 34 +++-
  include/drm/gpu_scheduler.h  |  7 ++---
  3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index d982cebc6bee..c48f17faef41 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -517,6 +517,7 @@ struct drm_sched_job 
*drm_sched_entity_pop_job(struct drm_sched_entity *entity)

  if (next) {
  spin_lock(&entity->lock);
  drm_sched_rq_update_fifo_locked(entity,
+    entity->rq,
  next->submit_ts);
  spin_unlock(&entity->lock);
  }
@@ -618,11 +619,14 @@ void drm_sched_entity_push_job(struct 
drm_sched_job *sched_job)

  sched = rq->sched;
  atomic_inc(sched->score);
-    drm_sched_rq_add_entity(rq, entity);
+
+    spin_lock(&rq->lock);
+    drm_sched_rq_add_entity_locked(rq, entity);
  if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-    drm_sched_rq_update_fifo_locked(entity, submit_ts);
+    drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
+    spin_unlock(&rq->lock);
  spin_unlock(&entity->lock);
  drm_sched_wakeup(sched, entity);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index 18a952f73ecb..c0d3f6ac3ae3 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,17 +153,18 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
  return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);

  }
-static inline void drm_sched_rq_remove_fifo_locked(struct 
drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity 
*entity,

+    struct drm_sched_rq *rq)
  {
-    struct drm_sched_rq *rq = entity->rq;
-
  if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
  rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
  RB_CLEAR_NODE(&entity->rb_tree_node);
  }
  }
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity, 
ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+ struct drm_sched_rq *rq,
+ ktime_t ts)
  {
  /*
   * Both locks need to be grabbed, one to protect from entity->rq 
change
@@ -171,17 +172,14 @@ void drm_sched_rq_update_fifo_locked(struct 
drm_sched_entity *entity, ktime_t ts

   * other to update the rb tree structure.
   */
  lockdep_assert_held(&entity->lock);
+    lockdep_assert_held(&rq->lock);
-    spin_lock(&entity->rq->lock);
-
-    drm_sched_rq_remove_fifo_locked(entity);
+    drm_sched_rq_remove_fifo_locked(entity, rq);
  entity->oldest_job_waiting = ts;
-    rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
+    rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
    drm_sched_entity_compare_before);
-
-    spin_unlock(&entity->rq->lock);
  }
  /**
@@ -203,25 +201,23 @@ static void drm_sched_rq_init(struct 
drm_gpu_scheduler *sched,

  }
  /**
- * drm_sched_rq_add_entity - add an entity
+ * drm_sched_rq_add_entity_locked - add an entity
   *
   * @rq: scheduler run queue
   * @entity: scheduler entity
   *
   * Adds a scheduler entity to the run queue.
   */
-void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
- struct drm_sched_entity *entity)
+void drm_sched_rq_add_entity_locked(struct drm_sched_rq *rq,
+    struct drm_sched_entity *entity)
  {
+    lockdep_assert_held(&rq->lock);
+
  if (!list_empty(&entity->list))
  return;
-    spin_lock(&rq->lock);
-
  atomic_inc(rq->sched->score);
  list_add_tail(&entity->list, &rq->entities);
-
-    spin_unlock(&rq->lock);
  

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-16 Thread Christian König

Am 13.09.24 um 18:05 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

Finally, to align drm_sched_rq_update_fifo_locked(),
drm_sched_rq_add_entity_locked() and
drm_sched_rq_remove_fifo_locked() function signatures, we add rq as a
parameter to the latter.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  8 --
  drivers/gpu/drm/scheduler/sched_main.c   | 34 +++-
  include/drm/gpu_scheduler.h  |  7 ++---
  3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index d982cebc6bee..c48f17faef41 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -517,6 +517,7 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct 
drm_sched_entity *entity)
if (next) {
spin_lock(&entity->lock);
drm_sched_rq_update_fifo_locked(entity,
+   entity->rq,
next->submit_ts);
spin_unlock(&entity->lock);
}
@@ -618,11 +619,14 @@ void drm_sched_entity_push_job(struct drm_sched_job 
*sched_job)
sched = rq->sched;
  
  		atomic_inc(sched->score);

-   drm_sched_rq_add_entity(rq, entity);
+
+   spin_lock(&rq->lock);
+   drm_sched_rq_add_entity_locked(rq, entity);
  
  		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)

-   drm_sched_rq_update_fifo_locked(entity, submit_ts);
+   drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
  
+		spin_unlock(&rq->lock);

spin_unlock(&entity->lock);
  
  		drm_sched_wakeup(sched, entity);

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index 18a952f73ecb..c0d3f6ac3ae3 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,17 +153,18 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);
  }
  
-static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)

+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
+   struct drm_sched_rq *rq)
  {
-   struct drm_sched_rq *rq = entity->rq;
-
if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
RB_CLEAR_NODE(&entity->rb_tree_node);
}
  }
  
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity, ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+struct drm_sched_rq *rq,
+ktime_t ts)
  {
/*
 * Both locks need to be grabbed, one to protect from entity->rq change
@@ -171,17 +172,14 @@ void drm_sched_rq_update_fifo_locked(struct 
drm_sched_entity *entity, ktime_t ts
 * other to update the rb tree structure.
 */
lockdep_assert_held(&entity->lock);
+   lockdep_assert_held(&rq->lock);
  
-	spin_lock(&entity->rq->lock);

-
-   drm_sched_rq_remove_fifo_locked(entity);
+   drm_sched_rq_remove_fifo_locked(entity, rq);
  
  	entity->oldest_job_waiting = ts;
  
-	rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,

+   rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
  drm_sched_entity_compare_before);
-
-   spin_unlock(&entity->rq->lock);
  }
  
  /**

@@ -203,25 +201,23 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler 
*sched,
  }
  
  /**

- * drm_sched_rq_add_entity - add an entity
+ * drm_sched_rq_add_entity_locked - add an entity
   *
   * @rq: scheduler run queue
   * @entity: scheduler entity
   *
   * Adds a scheduler entity to the run queue.
   */
-void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
-struct drm_sched_entity *entity)
+void drm_sched_rq_add_entity_locked(struct drm_sched_rq *rq,
+   struct drm_sched_entity *entity)
  {
+   lockdep_a

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-13 Thread Tvrtko Ursulin



On 13/09/2024 13:19, Philipp Stanner wrote:

On Wed, 2024-09-11 at 13:22 +0100, Tvrtko Ursulin wrote:


On 10/09/2024 11:25, Philipp Stanner wrote:

On Mon, 2024-09-09 at 18:19 +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch
titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny
bit
larger refactoring we can do the same optimisation on the rq-

lock

(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same
lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock
to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit
parameter
to drm_sched_rq_remove_fifo_locked() to avoid both callers and
callee
having to dereference entity->rq.


Why is dereferencing it a problem?


As you have noticed below the API is a bit unsightly. Consider for
example this call chain:

drm_sched_entity_kill(entity)
  drm_sched_rq_remove_entity(entity->rq, entity);
  drm_sched_rq_remove_fifo_locked(entity);
  struct drm_sched_rq *rq = entity->rq;

A bit confused, no?

I thought adding rq to remove_fifo_locked at least removes one back
and
forth between the entity->rq and rq.

And then if we cache the rq in a local variable, after having
explicitly
taken the correct lock, we have this other call chain example:

drm_sched_entity_push_job()
...
  rq = entity->rq;
  spin_lock(rq->lock);

  drm_sched_rq_add_entity_locked(rq, entity);
  drm_sched_rq_update_fifo_locked(rq, entity, submit_ts);

  spin_unlock(rq->lock);

To me at least this reads more streamlined.


Alright, doesn't sound too bad, but




Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
   drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
   drivers/gpu/drm/scheduler/sched_main.c   | 41 +
-
--
   include/drm/gpu_scheduler.h  |  7 ++--
   3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
b/drivers/gpu/drm/scheduler/sched_entity.c
index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct
drm_sched_job *sched_job)
    sched = rq->sched;
   
   		atomic_inc(sched->score);

-   drm_sched_rq_add_entity(rq, entity);
+
+   spin_lock(&rq->lock);
+   drm_sched_rq_add_entity_locked(rq, entity);
   
   		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)

-   drm_sched_rq_update_fifo_locked(entity,
submit_ts);
+   drm_sched_rq_update_fifo_locked(entity,
rq,
submit_ts);
   
+		spin_unlock(&rq->lock);

    spin_unlock(&entity->lock);
   
   		drm_sched_wakeup(sched, entity);

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool
drm_sched_entity_compare_before(struct rb_node *a,
    return ktime_before(ent_a->oldest_job_waiting, ent_b-

oldest_job_waiting);

   }
   
-static inline void drm_sched_rq_remove_fifo_locked(struct

drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct
drm_sched_entity
*entity,
+       struct drm_sched_rq
*rq)


I would then at least like to see a comment somewhere telling the
reader why rq is taken as a separate variable. One might otherwise
easily wonder why it's not obtained through the entity and what the
difference is.


I failed to find a nice place to put it. I'll send v3 of the series with 
some changes soon and then please have another look at this patch and see 
if you can think of where it would look good.


Regards,

Tvrtko





So here we'd add a new function parameter that still doesn't allow
for
getting rid of 'entity' as a parameter.


We can't get rid of the entity.

Maaaybe instead we could get rid of the rq in the whole chain, I mean
from drm_sched_rq_add_entity and drm_sched_rq_remove_entity to start
with.


Let's postpone that.



But then to remove double re-lock we still (like in this patch) need
to
make the callers take the locks and rename the helpers with _locked
suffix. Otherwise it would be inconsistent that a lock is taken
outside
the helpers with no _locked suffix.

I am not sure if that is better. All it achieves is remove the rq as
explicit parameter by making the callees dereference it from the
entity.


OK, as I see it now it would actually be desirable to have suffix
_locked indicate that the caller must hold all necess

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-13 Thread Tvrtko Ursulin



On 10/09/2024 16:03, Christian König wrote:

Am 10.09.24 um 11:46 schrieb Tvrtko Ursulin:


On 10/09/2024 10:08, Christian König wrote:

Am 09.09.24 um 19:19 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)


I think that goes into the wrong direction.

Probably better to move this here into drm_sched_rq_add_entity():

       if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
          drm_sched_rq_update_fifo_locked(entity, submit_ts);

We can then also drop adding the entity to the rr list when FIFO is 
in use.


Unfortunately there are a few other places which appear to rely on the 
list. Like drm_sched_fini,


That should be only a warning.


Warning as in?

drm_sched_increase_karma and 


The karma handling was another bad idea from AMD how to populate back 
errors to userspace and I've just recently documented together with Sima 
that we should use dma-fence errors instead.


Just didn't had time to tackle cleaning that up yet.


even amdgpu_job_stop_all_jobs_on_sched.


Uff, seeing that for the first time just now. Another bad idea how to 
handle things which doesn't take the appropriate locks and looks racy to 
me.



Latter could perhaps be solved by adding an iterator helper to the 
scheduler, which would perhaps be a good move for component isolation. 
And first two could be handled by implementing a complete and mutually 
exclusive duality of how entities are walked depending on scheduling 
mode. Plus making the scheduling mode only be configurable at boot. It 
feels doable but significant work and in the meantime removing the 
double re-lock maybe acceptable?


I don't think we should optimize for something we want to remove in the 
long term.


I knew using the term optimise would just making things more difficult 
for myself. :) Lets view this as cleaning up the API to avoid the 
inelegance of taking the same lock twice right next to each other.


If we can achieve this while not making the API worse then there is 
nothing to lose either short, med or long term.


If possible I would rather say that we should completely drop the RR 
approach and only use FIFO or even something more sophisticated.


No complaints from me, but I don't know how that would work other than 
putting a depreciation warning if someone selected RR. And keeping that 
for a good number of kernel releases. Any other ideas?


Regards,

Tvrtko


Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-13 Thread Philipp Stanner
On Wed, 2024-09-11 at 13:22 +0100, Tvrtko Ursulin wrote:
> 
> On 10/09/2024 11:25, Philipp Stanner wrote:
> > On Mon, 2024-09-09 at 18:19 +0100, Tvrtko Ursulin wrote:
> > > From: Tvrtko Ursulin 
> > > 
> > > Having removed one re-lock cycle on the entity->lock in a patch
> > > titled
> > > "drm/sched: Optimise drm_sched_entity_push_job", with only a tiny
> > > bit
> > > larger refactoring we can do the same optimisation on the rq-
> > > >lock
> > > (Currently both drm_sched_rq_add_entity() and
> > > drm_sched_rq_update_fifo_locked() take and release the same
> > > lock.)
> > > 
> > > To achieve this we rename drm_sched_rq_add_entity() to
> > > drm_sched_rq_add_entity_locked(), making it expect the rq->lock
> > > to be
> > > held, and also add the same expectation to
> > > drm_sched_rq_update_fifo_locked().
> > > 
> > > For more stream-lining we also add the run-queue as an explicit
> > > parameter
> > > to drm_sched_rq_remove_fifo_locked() to avoid both callers and
> > > callee
> > > having to dereference entity->rq.
> > 
> > Why is dereferencing it a problem?
> 
> As you have noticed below the API is a bit unsightly. Consider for 
> example this call chain:
> 
> drm_sched_entity_kill(entity)
>  drm_sched_rq_remove_entity(entity->rq, entity);
>  drm_sched_rq_remove_fifo_locked(entity);
>  struct drm_sched_rq *rq = entity->rq;
> 
> A bit confused, no?
> 
> I thought adding rq to remove_fifo_locked at least removes one back
> and 
> forth between the entity->rq and rq.
> 
> And then if we cache the rq in a local variable, after having
> explicitly 
> taken the correct lock, we have this other call chain example:
> 
> drm_sched_entity_push_job()
> ...
>  rq = entity->rq;
>  spin_lock(rq->lock);
> 
>  drm_sched_rq_add_entity_locked(rq, entity);
>  drm_sched_rq_update_fifo_locked(rq, entity, submit_ts);
> 
>  spin_unlock(rq->lock);
> 
> To me at least this reads more streamlined.

Alright, doesn't sound too bad, but

> 
> > > Signed-off-by: Tvrtko Ursulin 
> > > Cc: Christian König 
> > > Cc: Alex Deucher 
> > > Cc: Luben Tuikov 
> > > Cc: Matthew Brost 
> > > Cc: Philipp Stanner 
> > > ---
> > >   drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
> > >   drivers/gpu/drm/scheduler/sched_main.c   | 41 +
> > > -
> > > --
> > >   include/drm/gpu_scheduler.h  |  7 ++--
> > >   3 files changed, 31 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
> > > b/drivers/gpu/drm/scheduler/sched_entity.c
> > > index b4c4f9923e0b..2102c726d275 100644
> > > --- a/drivers/gpu/drm/scheduler/sched_entity.c
> > > +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> > > @@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct
> > > drm_sched_job *sched_job)
> > >   sched = rq->sched;
> > >   
> > >   atomic_inc(sched->score);
> > > - drm_sched_rq_add_entity(rq, entity);
> > > +
> > > + spin_lock(&rq->lock);
> > > + drm_sched_rq_add_entity_locked(rq, entity);
> > >   
> > >   if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> > > - drm_sched_rq_update_fifo_locked(entity,
> > > submit_ts);
> > > + drm_sched_rq_update_fifo_locked(entity,
> > > rq,
> > > submit_ts);
> > >   
> > > + spin_unlock(&rq->lock);
> > >   spin_unlock(&entity->lock);
> > >   
> > >   drm_sched_wakeup(sched, entity);
> > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> > > b/drivers/gpu/drm/scheduler/sched_main.c
> > > index 937e7d1cfc49..1ccd2aed2d32 100644
> > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > @@ -153,41 +153,44 @@ static __always_inline bool
> > > drm_sched_entity_compare_before(struct rb_node *a,
> > >   return ktime_before(ent_a->oldest_job_waiting, ent_b-
> > > > oldest_job_waiting);
> > >   }
> > >   
> > > -static inline void drm_sched_rq_remove_fifo_locked(struct
> > > drm_sched_entity *entity)
> > > +static void drm_sched_rq_remove_fifo_locked(struct
> > > drm_sched_entity
> > > *entity,
> > > +     struct drm_sched_rq
> > > *rq)

I would then at least like to see a comment somewhere telling the
reader why rq is taken as a separate variable. One might otherwise
easily wonder why it's not obtained through the entity and what the
difference is.

> > 
> > So here we'd add a new function parameter that still doesn't allow
> > for
> > getting rid of 'entity' as a parameter.
> 
> We can't get rid of the entity.
> 
> Maaaybe instead we could get rid of the rq in the whole chain, I mean
> from drm_sched_rq_add_entity and drm_sched_rq_remove_entity to start
> with.

Let's postpone that.

> 
> But then to remove double re-lock we still (like in this patch) need
> to 
> make the callers take the locks and rename the helpers with _locked 
> suffix. Otherwise it would be inconsistent that a l

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-11 Thread Tvrtko Ursulin



On 10/09/2024 11:25, Philipp Stanner wrote:

On Mon, 2024-09-09 at 18:19 +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch
titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit
parameter
to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
having to dereference entity->rq.


Why is dereferencing it a problem?


As you have noticed below the API is a bit unsightly. Consider for 
example this call chain:


drm_sched_entity_kill(entity)
drm_sched_rq_remove_entity(entity->rq, entity);
drm_sched_rq_remove_fifo_locked(entity);
struct drm_sched_rq *rq = entity->rq;

A bit confused, no?

I thought adding rq to remove_fifo_locked at least removes one back and 
forth between the entity->rq and rq.


And then if we cache the rq in a local variable, after having explicitly 
taken the correct lock, we have this other call chain example:


drm_sched_entity_push_job()
...
rq = entity->rq;
spin_lock(rq->lock);

drm_sched_rq_add_entity_locked(rq, entity);
drm_sched_rq_update_fifo_locked(rq, entity, submit_ts);

spin_unlock(rq->lock);

To me at least this reads more streamlined.


Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
  drivers/gpu/drm/scheduler/sched_main.c   | 41 +-
--
  include/drm/gpu_scheduler.h  |  7 ++--
  3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
b/drivers/gpu/drm/scheduler/sched_entity.c
index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct
drm_sched_job *sched_job)
    sched = rq->sched;
  
  		atomic_inc(sched->score);

-   drm_sched_rq_add_entity(rq, entity);
+
+   spin_lock(&rq->lock);
+   drm_sched_rq_add_entity_locked(rq, entity);
  
  		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)

-   drm_sched_rq_update_fifo_locked(entity,
submit_ts);
+   drm_sched_rq_update_fifo_locked(entity, rq,
submit_ts);
  
+		spin_unlock(&rq->lock);

    spin_unlock(&entity->lock);
  
  		drm_sched_wakeup(sched, entity);

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool
drm_sched_entity_compare_before(struct rb_node *a,
    return ktime_before(ent_a->oldest_job_waiting, ent_b-

oldest_job_waiting);

  }
  
-static inline void drm_sched_rq_remove_fifo_locked(struct

drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity
*entity,
+       struct drm_sched_rq *rq)


So here we'd add a new function parameter that still doesn't allow for
getting rid of 'entity' as a parameter.


We can't get rid of the entity.

Maaaybe instead we could get rid of the rq in the whole chain, I mean 
from drm_sched_rq_add_entity and drm_sched_rq_remove_entity to start with.


But then to remove double re-lock we still (like in this patch) need to 
make the callers take the locks and rename the helpers with _locked 
suffix. Otherwise it would be inconsistent that a lock is taken outside 
the helpers with no _locked suffix.


I am not sure if that is better. All it achieves is remove the rq as 
explicit parameter by making the callees dereference it from the entity.


Worst part is all these helpers have drm_sched_rq_ prefix.. which to me 
reads as "we operate on rq". So not passing in rq is confusing to start 
with.


Granted, some confusion still remains with my approach since ideally, to 
those helpers, I wanted to add some asserts that rq == entity->rq...



The API gets larger that way and readers will immediately wonder why
sth is passed as a separate variable that could also be obtained
through the pointer.


  {
-   struct drm_sched_rq *rq = entity->rq;
-
    if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
    rb_erase_cached(&entity->rb_tree_node, &rq-

rb_tree_root);

    RB_CLEAR_NODE(&entity->rb_tree_node);
    }
  }
  
-void drm_sched_rq_update_fifo_l

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-10 Thread Christian König

Am 10.09.24 um 12:25 schrieb Philipp Stanner:

On Mon, 2024-09-09 at 18:19 +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch
titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)

To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit
parameter
to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
having to dereference entity->rq.

Why is dereferencing it a problem?


Maybe add "without holding the appropriate lock".

Christian.




Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
  drivers/gpu/drm/scheduler/sched_main.c   | 41 +-
--
  include/drm/gpu_scheduler.h  |  7 ++--
  3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
b/drivers/gpu/drm/scheduler/sched_entity.c
index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct
drm_sched_job *sched_job)
    sched = rq->sched;
  
  		atomic_inc(sched->score);

-   drm_sched_rq_add_entity(rq, entity);
+
+   spin_lock(&rq->lock);
+   drm_sched_rq_add_entity_locked(rq, entity);
  
  		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)

-   drm_sched_rq_update_fifo_locked(entity,
submit_ts);
+   drm_sched_rq_update_fifo_locked(entity, rq,
submit_ts);
  
+		spin_unlock(&rq->lock);

    spin_unlock(&entity->lock);
  
  		drm_sched_wakeup(sched, entity);

diff --git a/drivers/gpu/drm/scheduler/sched_main.c
b/drivers/gpu/drm/scheduler/sched_main.c
index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool
drm_sched_entity_compare_before(struct rb_node *a,
    return ktime_before(ent_a->oldest_job_waiting, ent_b-

oldest_job_waiting);

  }
  
-static inline void drm_sched_rq_remove_fifo_locked(struct

drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity
*entity,
+       struct drm_sched_rq *rq)

So here we'd add a new function parameter that still doesn't allow for
getting rid of 'entity' as a parameter.

The API gets larger that way and readers will immediately wonder why
sth is passed as a separate variable that could also be obtained
through the pointer.


  {
-   struct drm_sched_rq *rq = entity->rq;
-
    if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
    rb_erase_cached(&entity->rb_tree_node, &rq-

rb_tree_root);

    RB_CLEAR_NODE(&entity->rb_tree_node);
    }
  }
  
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity

*entity, ktime_t ts)
+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity
*entity,
+    struct drm_sched_rq *rq,
+    ktime_t ts)

The function is still called _locked. That implies to the reader that
this function takes care of locking. But it doesn't anymore. Instead,


  {
    lockdep_assert_held(&entity->lock);
+   lockdep_assert_held(&rq->lock);
  
-	spin_lock(&entity->rq->lock);

-
-   drm_sched_rq_remove_fifo_locked(entity);
+   drm_sched_rq_remove_fifo_locked(entity, rq);
  
  	entity->oldest_job_waiting = ts;
  
-	rb_add_cached(&entity->rb_tree_node, &entity->rq-

rb_tree_root,

+   rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
      drm_sched_entity_compare_before);
-
-   spin_unlock(&entity->rq->lock);
  }
  
  void drm_sched_rq_update_fifo(struct drm_sched_entity *entity,

ktime_t ts)
  {
+   struct drm_sched_rq *rq;
+
    /*
     * Both locks need to be grabbed, one to protect from
entity->rq change
     * for entity from within concurrent
drm_sched_entity_select_rq and the
     * other to update the rb tree structure.
     */
    spin_lock(&entity->lock);
-   drm_sched_rq_update_fifo_locked(entity, ts);
+   rq = entity->rq;
+   spin_lock(&rq->lock);
+   drm_sched_rq_update_fifo_locked(entity, rq, ts);
+   spin_unlock(&rq->lock);

its caller, drm_sched_rq_update_fifo(), now takes care of the locking.
So if it all drm_sched_rq_update_fifo_locked() should be called
drm_sched_rq_update_fifo_

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-10 Thread Christian König

Am 10.09.24 um 11:46 schrieb Tvrtko Ursulin:


On 10/09/2024 10:08, Christian König wrote:

Am 09.09.24 um 19:19 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)


I think that goes into the wrong direction.

Probably better to move this here into drm_sched_rq_add_entity():

       if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
          drm_sched_rq_update_fifo_locked(entity, submit_ts);

We can then also drop adding the entity to the rr list when FIFO is 
in use.


Unfortunately there are a few other places which appear to rely on the 
list, like drm_sched_fini,


That should be only a warning.

drm_sched_increase_karma and 


The karma handling was another bad idea from AMD for propagating errors 
back to userspace, and I've just recently documented together with Sima 
that we should use dma-fence errors instead.


Just didn't had time to tackle cleaning that up yet.


even amdgpu_job_stop_all_jobs_on_sched.


Uff, seeing that for the first time just now. Another bad idea how to 
handle things which doesn't take the appropriate locks and looks racy to me.



Latter could perhaps be solved by adding an iterator helper to the 
scheduler, which would perhaps be a good move for component isolation. 
And first two could be handled by implementing a complete and mutually 
exclusive duality of how entities are walked depending on scheduling 
mode. Plus making the scheduling mode only be configurable at boot. It 
feels doable but significant work and in the meantime removing the 
double re-lock maybe acceptable?


I don't think we should optimize for something we want to remove in the 
long term.


If possible I would rather say that we should completely drop the RR 
approach and only use FIFO or even something more sophisticated.


Regards,
Christian.



Regards,

Tvrtko


To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit 
parameter

to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
having to dereference entity->rq.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
  drivers/gpu/drm/scheduler/sched_main.c   | 41 
+---

  include/drm/gpu_scheduler.h  |  7 ++--
  3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct 
drm_sched_job *sched_job)

  sched = rq->sched;
  atomic_inc(sched->score);
-    drm_sched_rq_add_entity(rq, entity);
+
+    spin_lock(&rq->lock);
+    drm_sched_rq_add_entity_locked(rq, entity);
  if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-    drm_sched_rq_update_fifo_locked(entity, submit_ts);
+    drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
+    spin_unlock(&rq->lock);
  spin_unlock(&entity->lock);
  drm_sched_wakeup(sched, entity);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
  return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);

  }
-static inline void drm_sched_rq_remove_fifo_locked(struct 
drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity 
*entity,

+    struct drm_sched_rq *rq)
  {
-    struct drm_sched_rq *rq = entity->rq;
-
  if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
  rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
  RB_CLEAR_NODE(&entity->rb_tree_node);
  }
  }
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity 
*entity, ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+ struct drm_sched_rq *rq,
+ ktime_t ts)
  {
  lockdep_assert_held(&entity->lock);
+    lockdep_assert_held(&rq->lock);
-    spin_lock(&entity->rq->lock);
-
-    drm_sched_rq_remove_fifo_locked(entity);
+    drm_sched_rq_remove_fifo_locke

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-10 Thread Philipp Stanner
On Mon, 2024-09-09 at 18:19 +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin 
> 
> Having removed one re-lock cycle on the entity->lock in a patch
> titled
> "drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
> larger refactoring we can do the same optimisation on the rq->lock
> (Currently both drm_sched_rq_add_entity() and
> drm_sched_rq_update_fifo_locked() take and release the same lock.)
> 
> To achieve this we rename drm_sched_rq_add_entity() to
> drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
> held, and also add the same expectation to
> drm_sched_rq_update_fifo_locked().
> 
> For more stream-lining we also add the run-queue as an explicit
> parameter
> to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
> having to dereference entity->rq.

Why is dereferencing it a problem?

> 
> Signed-off-by: Tvrtko Ursulin 
> Cc: Christian König 
> Cc: Alex Deucher 
> Cc: Luben Tuikov 
> Cc: Matthew Brost 
> Cc: Philipp Stanner 
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
>  drivers/gpu/drm/scheduler/sched_main.c   | 41 +-
> --
>  include/drm/gpu_scheduler.h  |  7 ++--
>  3 files changed, 31 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c
> b/drivers/gpu/drm/scheduler/sched_entity.c
> index b4c4f9923e0b..2102c726d275 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct
> drm_sched_job *sched_job)
>   sched = rq->sched;
>  
>   atomic_inc(sched->score);
> - drm_sched_rq_add_entity(rq, entity);
> +
> + spin_lock(&rq->lock);
> + drm_sched_rq_add_entity_locked(rq, entity);
>  
>   if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> - drm_sched_rq_update_fifo_locked(entity,
> submit_ts);
> + drm_sched_rq_update_fifo_locked(entity, rq,
> submit_ts);
>  
> + spin_unlock(&rq->lock);
>   spin_unlock(&entity->lock);
>  
>   drm_sched_wakeup(sched, entity);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> b/drivers/gpu/drm/scheduler/sched_main.c
> index 937e7d1cfc49..1ccd2aed2d32 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -153,41 +153,44 @@ static __always_inline bool
> drm_sched_entity_compare_before(struct rb_node *a,
>   return ktime_before(ent_a->oldest_job_waiting, ent_b-
> >oldest_job_waiting);
>  }
>  
> -static inline void drm_sched_rq_remove_fifo_locked(struct
> drm_sched_entity *entity)
> +static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity
> *entity,
> +     struct drm_sched_rq *rq)

So here we'd add a new function parameter that still doesn't allow for
getting rid of 'entity' as a parameter.

The API gets larger that way and readers will immediately wonder why
sth is passed as a separate variable that could also be obtained
through the pointer.

>  {
> - struct drm_sched_rq *rq = entity->rq;
> -
>   if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
>   rb_erase_cached(&entity->rb_tree_node, &rq-
> >rb_tree_root);
>   RB_CLEAR_NODE(&entity->rb_tree_node);
>   }
>  }
>  
> -void drm_sched_rq_update_fifo_locked(struct drm_sched_entity
> *entity, ktime_t ts)
> +void drm_sched_rq_update_fifo_locked(struct drm_sched_entity
> *entity,
> +  struct drm_sched_rq *rq,
> +  ktime_t ts)

The function is still called _locked. That implies to the reader that
this function takes care of locking. But it doesn't anymore. Instead,

>  {
>   lockdep_assert_held(&entity->lock);
> + lockdep_assert_held(&rq->lock);
>  
> - spin_lock(&entity->rq->lock);
> -
> - drm_sched_rq_remove_fifo_locked(entity);
> + drm_sched_rq_remove_fifo_locked(entity, rq);
>  
>   entity->oldest_job_waiting = ts;
>  
> - rb_add_cached(&entity->rb_tree_node, &entity->rq-
> >rb_tree_root,
> + rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
>     drm_sched_entity_compare_before);
> -
> - spin_unlock(&entity->rq->lock);
>  }
>  
>  void drm_sched_rq_update_fifo(struct drm_sched_entity *entity,
> ktime_t ts)
>  {
> + struct drm_sched_rq *rq;
> +
>   /*
>    * Both locks need to be grabbed, one to protect from
> entity->rq change
>    * for entity from within concurrent
> drm_sched_entity_select_rq and the
>    * other to update the rb tree structure.
>    */
>   spin_lock(&entity->lock);
> - drm_sched_rq_update_fifo_locked(entity, ts);
> + rq = entity->rq;
> + spin_lock(&rq->lock);
> + drm_sched_rq_update_fifo_locked(entity, rq, ts);
> + spin_unlock(&rq->lock);

its caller, drm_sched_rq_update_fifo(), now takes care of the locking.
So i

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-10 Thread Tvrtko Ursulin



On 10/09/2024 10:08, Christian König wrote:

Am 09.09.24 um 19:19 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)


I think that goes into the wrong direction.

Probably better to move this here into drm_sched_rq_add_entity():

       if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
          drm_sched_rq_update_fifo_locked(entity, submit_ts);

We can then also drop adding the entity to the rr list when FIFO is in use.


Unfortunately there are a few other places which appear to rely on the 
list, like drm_sched_fini, drm_sched_increase_karma and even 
amdgpu_job_stop_all_jobs_on_sched. Latter could perhaps be solved by 
adding an iterator helper to the scheduler, which would perhaps be a 
good move for component isolation. And first two could be handled by 
implementing a complete and mutually exclusive duality of how entities 
are walked depending on scheduling mode. Plus making the scheduling mode 
only be configurable at boot. It feels doable but significant work and 
in the meantime removing the double re-lock maybe acceptable?


Regards,

Tvrtko


To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit parameter
to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
having to dereference entity->rq.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
  drivers/gpu/drm/scheduler/sched_main.c   | 41 +---
  include/drm/gpu_scheduler.h  |  7 ++--
  3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c

index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct 
drm_sched_job *sched_job)

  sched = rq->sched;
  atomic_inc(sched->score);
-    drm_sched_rq_add_entity(rq, entity);
+
+    spin_lock(&rq->lock);
+    drm_sched_rq_add_entity_locked(rq, entity);
  if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
-    drm_sched_rq_update_fifo_locked(entity, submit_ts);
+    drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
+    spin_unlock(&rq->lock);
  spin_unlock(&entity->lock);
  drm_sched_wakeup(sched, entity);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
  return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);

  }
-static inline void drm_sched_rq_remove_fifo_locked(struct 
drm_sched_entity *entity)
+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity 
*entity,

+    struct drm_sched_rq *rq)
  {
-    struct drm_sched_rq *rq = entity->rq;
-
  if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
  rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
  RB_CLEAR_NODE(&entity->rb_tree_node);
  }
  }
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity, 
ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+ struct drm_sched_rq *rq,
+ ktime_t ts)
  {
  lockdep_assert_held(&entity->lock);
+    lockdep_assert_held(&rq->lock);
-    spin_lock(&entity->rq->lock);
-
-    drm_sched_rq_remove_fifo_locked(entity);
+    drm_sched_rq_remove_fifo_locked(entity, rq);
  entity->oldest_job_waiting = ts;
-    rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,
+    rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
    drm_sched_entity_compare_before);
-
-    spin_unlock(&entity->rq->lock);
  }
  void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, 
ktime_t ts)

  {
+    struct drm_sched_rq *rq;
+
  /*
   * Both locks need to be grabbed, one to protect from entity->rq 
change
   * for entity from within concurrent drm_sched_entity_select_rq 
and the

   * other to update the rb tree structure.
   */
  spin_lock(&entity->lock);
-    drm_sched_rq_update_fifo_locked(entity, ts);
+    rq = entity->rq;
+    spin_

Re: [PATCH 8/8] drm/sched: Further optimise drm_sched_entity_push_job

2024-09-10 Thread Christian König

Am 09.09.24 um 19:19 schrieb Tvrtko Ursulin:

From: Tvrtko Ursulin 

Having removed one re-lock cycle on the entity->lock in a patch titled
"drm/sched: Optimise drm_sched_entity_push_job", with only a tiny bit
larger refactoring we can do the same optimisation on the rq->lock.
(Currently both drm_sched_rq_add_entity() and
drm_sched_rq_update_fifo_locked() take and release the same lock.)


I think that goes into the wrong direction.

Probably better to move this here into drm_sched_rq_add_entity():

      if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
         drm_sched_rq_update_fifo_locked(entity, submit_ts);

We can then also drop adding the entity to the rr list when FIFO is in use.

Regards,
Christian.




To achieve this we rename drm_sched_rq_add_entity() to
drm_sched_rq_add_entity_locked(), making it expect the rq->lock to be
held, and also add the same expectation to
drm_sched_rq_update_fifo_locked().

For more stream-lining we also add the run-queue as an explicit parameter
to drm_sched_rq_remove_fifo_locked() to avoid both callers and callee
having to dereference entity->rq.

Signed-off-by: Tvrtko Ursulin 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Luben Tuikov 
Cc: Matthew Brost 
Cc: Philipp Stanner 
---
  drivers/gpu/drm/scheduler/sched_entity.c |  7 ++--
  drivers/gpu/drm/scheduler/sched_main.c   | 41 +---
  include/drm/gpu_scheduler.h  |  7 ++--
  3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_entity.c 
b/drivers/gpu/drm/scheduler/sched_entity.c
index b4c4f9923e0b..2102c726d275 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -614,11 +614,14 @@ void drm_sched_entity_push_job(struct drm_sched_job 
*sched_job)
sched = rq->sched;
  
  		atomic_inc(sched->score);

-   drm_sched_rq_add_entity(rq, entity);
+
+   spin_lock(&rq->lock);
+   drm_sched_rq_add_entity_locked(rq, entity);
  
  		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)

-   drm_sched_rq_update_fifo_locked(entity, submit_ts);
+   drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
  
+		spin_unlock(&rq->lock);

spin_unlock(&entity->lock);
  
  		drm_sched_wakeup(sched, entity);

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index 937e7d1cfc49..1ccd2aed2d32 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -153,41 +153,44 @@ static __always_inline bool 
drm_sched_entity_compare_before(struct rb_node *a,
return ktime_before(ent_a->oldest_job_waiting, 
ent_b->oldest_job_waiting);
  }
  
-static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)

+static void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity,
+   struct drm_sched_rq *rq)
  {
-   struct drm_sched_rq *rq = entity->rq;
-
if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
RB_CLEAR_NODE(&entity->rb_tree_node);
}
  }
  
-void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity, ktime_t ts)

+void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
+struct drm_sched_rq *rq,
+ktime_t ts)
  {
lockdep_assert_held(&entity->lock);
+   lockdep_assert_held(&rq->lock);
  
-	spin_lock(&entity->rq->lock);

-
-   drm_sched_rq_remove_fifo_locked(entity);
+   drm_sched_rq_remove_fifo_locked(entity, rq);
  
  	entity->oldest_job_waiting = ts;
  
-	rb_add_cached(&entity->rb_tree_node, &entity->rq->rb_tree_root,

+   rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
  drm_sched_entity_compare_before);
-
-   spin_unlock(&entity->rq->lock);
  }
  
  void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)

  {
+   struct drm_sched_rq *rq;
+
/*
 * Both locks need to be grabbed, one to protect from entity->rq change
 * for entity from within concurrent drm_sched_entity_select_rq and the
 * other to update the rb tree structure.
 */
spin_lock(&entity->lock);
-   drm_sched_rq_update_fifo_locked(entity, ts);
+   rq = entity->rq;
+   spin_lock(&rq->lock);
+   drm_sched_rq_update_fifo_locked(entity, rq, ts);
+   spin_unlock(&rq->lock);
spin_unlock(&entity->lock);
  }
  
@@ -210,25 +213,23 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,

  }
  
  /**

- * drm_sched_rq_add_entity - add an entity
+ * drm_sched_rq_add_entity_locked - add an entity
   *
   * @rq: scheduler run queue
   * @entity: scheduler entity
   *
   * Adds a scheduler entity to the run queue.
   */
-void drm_sched_rq_add_entity(struct