Re: [PATCH v3 5/6] drm/xe: Add helper to accumulate exec queue runtime

2024-05-08 Thread Lucas De Marchi

On Wed, May 08, 2024 at 09:34:59AM GMT, Tvrtko Ursulin wrote:


On 07/05/2024 23:45, Lucas De Marchi wrote:

From: Umesh Nerlige Ramappa 

Add a helper to accumulate per-client runtime of all its
exec queues. This is called every time a sched job is finished.

v2:
  - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate
runtime when job is finished since xe_sched_job_completed() is not a
notification that job finished.
  - Stop trying to update runtime from xe_exec_queue_fini() - that is
redundant and may happen after xef is closed, leading to a
use-after-free
  - Do not special case the first timestamp read: the default LRC sets
CTX_TIMESTAMP to zero, so even the first sample should be a valid
one.
  - Handle the parallel submission case by multiplying the runtime by
width.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/xe/xe_device_types.h |  9 +++
 drivers/gpu/drm/xe/xe_exec_queue.c   | 35 
 drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
 drivers/gpu/drm/xe/xe_execlist.c |  1 +
 drivers/gpu/drm/xe/xe_guc_submit.c   |  2 ++
 5 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 906b98fb973b..de078bdf0ab9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -560,6 +560,15 @@ struct xe_file {
struct mutex lock;
} exec_queue;
+   /**
+* @runtime: hw engine class runtime in ticks for this drm client
+*
+* Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+* case, since all jobs run in parallel on the engines, only the stats
+* from lrc[0] are sufficient.
+*/
+   u64 runtime[XE_ENGINE_CLASS_MAX];
+
/** @client: drm client */
struct xe_drm_client *client;
 };
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c 
b/drivers/gpu/drm/xe/xe_exec_queue.c
index 395de93579fa..86eb22e22c95 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -769,6 +769,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
q->lrc[0].fence_ctx.next_seqno - 1;
 }
+/**
+ * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
+ * @q: The exec queue
+ *
+ * Update the timestamp saved by HW for this exec queue and save runtime
+ * calculated by using the delta from last update. On multi-lrc case, only the
+ * first is considered.
+ */
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
+{
+   struct xe_file *xef;
+   struct xe_lrc *lrc;
+   u32 old_ts, new_ts;
+
+   /*
+* Jobs that are run during driver load may use an exec_queue, but are
+* not associated with a user xe file, so avoid accumulating busyness
+* for kernel specific work.
+*/
+   if (!q->vm || !q->vm->xef)
+   return;
+
+   xef = q->vm->xef;
+
+   /*
+* Only sample the first LRC. For parallel submission, all of them are
+* scheduled together and we compensate that below by multiplying by
+* width
+*/
+   lrc = >lrc[0];
+
+   new_ts = xe_lrc_update_timestamp(lrc, _ts);
+   xef->runtime[q->class] += (new_ts - old_ts) * q->width;


I think in theory this could be introducing a systematic error 
depending on how firmware handles things and tick resolution. Or even 
regardless of the firmware, if the timestamps are saved on context 
exit by the GPU hw itself and parallel contexts do not exit 100% 
aligned. Undershoot would be I think fine, but systematic overshoot 
under constant 100% parallel load from mutlitple client could 
constantly show >100% class utilisation. Probably not a concern in 
practice but worthy a comment?


Ok... just extend the comment above? I could have looped through the
LRCs to read the timestamp, but I didn't like reading them in slightly
different times, which could give slight different results if they are
in a running state.  This way here it doesn't have this problem at the
expense of "we have to trust the hw/firmware". We can switch to the loop
eventually if our level of trust decreases after getting more data/test :)

thanks
Lucas De Marchi



Regards,

Tvrtko


+}
+
 void xe_exec_queue_kill(struct xe_exec_queue *q)
 {
struct xe_exec_queue *eq = q, *next;
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h 
b/drivers/gpu/drm/xe/xe_exec_queue.h
index 48f6da53a292..e0f07d28ee1a 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -75,5 +75,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct 
xe_exec_queue *e,
   struct xe_vm *vm);
 void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
  struct dma_fence *fence);
+void xe_exec_queue_update_runtime(struct 

Re: [PATCH v3 5/6] drm/xe: Add helper to accumulate exec queue runtime

2024-05-08 Thread Tvrtko Ursulin



On 07/05/2024 23:45, Lucas De Marchi wrote:

From: Umesh Nerlige Ramappa 

Add a helper to accumulate per-client runtime of all its
exec queues. This is called every time a sched job is finished.

v2:
   - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate
 runtime when job is finished since xe_sched_job_completed() is not a
 notification that job finished.
   - Stop trying to update runtime from xe_exec_queue_fini() - that is
 redundant and may happen after xef is closed, leading to a
 use-after-free
   - Do not special case the first timestamp read: the default LRC sets
 CTX_TIMESTAMP to zero, so even the first sample should be a valid
 one.
   - Handle the parallel submission case by multiplying the runtime by
 width.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Lucas De Marchi 
---
  drivers/gpu/drm/xe/xe_device_types.h |  9 +++
  drivers/gpu/drm/xe/xe_exec_queue.c   | 35 
  drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
  drivers/gpu/drm/xe/xe_execlist.c |  1 +
  drivers/gpu/drm/xe/xe_guc_submit.c   |  2 ++
  5 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 906b98fb973b..de078bdf0ab9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -560,6 +560,15 @@ struct xe_file {
struct mutex lock;
} exec_queue;
  
+	/**

+* @runtime: hw engine class runtime in ticks for this drm client
+*
+* Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+* case, since all jobs run in parallel on the engines, only the stats
+* from lrc[0] are sufficient.
+*/
+   u64 runtime[XE_ENGINE_CLASS_MAX];
+
/** @client: drm client */
struct xe_drm_client *client;
  };
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c 
b/drivers/gpu/drm/xe/xe_exec_queue.c
index 395de93579fa..86eb22e22c95 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -769,6 +769,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
q->lrc[0].fence_ctx.next_seqno - 1;
  }
  
+/**

+ * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
+ * @q: The exec queue
+ *
+ * Update the timestamp saved by HW for this exec queue and save runtime
+ * calculated by using the delta from last update. On multi-lrc case, only the
+ * first is considered.
+ */
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
+{
+   struct xe_file *xef;
+   struct xe_lrc *lrc;
+   u32 old_ts, new_ts;
+
+   /*
+* Jobs that are run during driver load may use an exec_queue, but are
+* not associated with a user xe file, so avoid accumulating busyness
+* for kernel specific work.
+*/
+   if (!q->vm || !q->vm->xef)
+   return;
+
+   xef = q->vm->xef;
+
+   /*
+* Only sample the first LRC. For parallel submission, all of them are
+* scheduled together and we compensate that below by multiplying by
+* width
+*/
+   lrc = >lrc[0];
+
+   new_ts = xe_lrc_update_timestamp(lrc, _ts);
+   xef->runtime[q->class] += (new_ts - old_ts) * q->width;


I think in theory this could be introducing a systematic error depending 
on how firmware handles things and tick resolution. Or even regardless 
of the firmware, if the timestamps are saved on context exit by the GPU 
hw itself and parallel contexts do not exit 100% aligned. Undershoot 
would be I think fine, but systematic overshoot under constant 100% 
parallel load from mutlitple client could constantly show >100% class 
utilisation. Probably not a concern in practice but worthy a comment?


Regards,

Tvrtko


+}
+
  void xe_exec_queue_kill(struct xe_exec_queue *q)
  {
struct xe_exec_queue *eq = q, *next;
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h 
b/drivers/gpu/drm/xe/xe_exec_queue.h
index 48f6da53a292..e0f07d28ee1a 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -75,5 +75,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct 
xe_exec_queue *e,
   struct xe_vm *vm);
  void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
  struct dma_fence *fence);
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
  
  #endif

diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
index dece2785933c..a316431025c7 100644
--- a/drivers/gpu/drm/xe/xe_execlist.c
+++ b/drivers/gpu/drm/xe/xe_execlist.c
@@ -307,6 +307,7 @@ static void execlist_job_free(struct drm_sched_job *drm_job)
  {
struct xe_sched_job *job = to_xe_sched_job(drm_job);
  
+	xe_exec_queue_update_runtime(job->q);

xe_sched_job_put(job);
  }
  
diff --git 

Re: [PATCH v3 5/6] drm/xe: Add helper to accumulate exec queue runtime

2024-05-07 Thread Lucas De Marchi

On Tue, May 07, 2024 at 04:45:36PM GMT, Umesh Nerlige Ramappa wrote:

On Tue, May 07, 2024 at 03:45:09PM -0700, Lucas De Marchi wrote:

From: Umesh Nerlige Ramappa 

Add a helper to accumulate per-client runtime of all its
exec queues. This is called every time a sched job is finished.

v2:
- Use guc_exec_queue_free_job() and execlist_job_free() to accumulate
  runtime when job is finished since xe_sched_job_completed() is not a
  notification that job finished.
- Stop trying to update runtime from xe_exec_queue_fini() - that is
  redundant and may happen after xef is closed, leading to a
  use-after-free
- Do not special case the first timestamp read: the default LRC sets
  CTX_TIMESTAMP to zero, so even the first sample should be a valid
  one.
- Handle the parallel submission case by multiplying the runtime by
  width.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Lucas De Marchi 
---
drivers/gpu/drm/xe/xe_device_types.h |  9 +++
drivers/gpu/drm/xe/xe_exec_queue.c   | 35 
drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
drivers/gpu/drm/xe/xe_execlist.c |  1 +
drivers/gpu/drm/xe/xe_guc_submit.c   |  2 ++
5 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 906b98fb973b..de078bdf0ab9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -560,6 +560,15 @@ struct xe_file {
struct mutex lock;
} exec_queue;

+   /**
+* @runtime: hw engine class runtime in ticks for this drm client
+*
+* Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+* case, since all jobs run in parallel on the engines, only the stats
+* from lrc[0] are sufficient.


Maybe we can drop the above comment altogether after the multi-lrc 
update.


right... I removed it from the middle of the function and this was a
leftover. I will remove it on next version.

thanks
Lucas De Marchi




Umesh


Re: [PATCH v3 5/6] drm/xe: Add helper to accumulate exec queue runtime

2024-05-07 Thread Umesh Nerlige Ramappa

On Tue, May 07, 2024 at 03:45:09PM -0700, Lucas De Marchi wrote:

From: Umesh Nerlige Ramappa 

Add a helper to accumulate per-client runtime of all its
exec queues. This is called every time a sched job is finished.

v2:
 - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate
   runtime when job is finished since xe_sched_job_completed() is not a
   notification that job finished.
 - Stop trying to update runtime from xe_exec_queue_fini() - that is
   redundant and may happen after xef is closed, leading to a
   use-after-free
 - Do not special case the first timestamp read: the default LRC sets
   CTX_TIMESTAMP to zero, so even the first sample should be a valid
   one.
 - Handle the parallel submission case by multiplying the runtime by
   width.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Lucas De Marchi 
---
drivers/gpu/drm/xe/xe_device_types.h |  9 +++
drivers/gpu/drm/xe/xe_exec_queue.c   | 35 
drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
drivers/gpu/drm/xe/xe_execlist.c |  1 +
drivers/gpu/drm/xe/xe_guc_submit.c   |  2 ++
5 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 906b98fb973b..de078bdf0ab9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -560,6 +560,15 @@ struct xe_file {
struct mutex lock;
} exec_queue;

+   /**
+* @runtime: hw engine class runtime in ticks for this drm client
+*
+* Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+* case, since all jobs run in parallel on the engines, only the stats
+* from lrc[0] are sufficient.


Maybe we can drop the above comment altogether after the multi-lrc 
update.


Umesh


[PATCH v3 5/6] drm/xe: Add helper to accumulate exec queue runtime

2024-05-07 Thread Lucas De Marchi
From: Umesh Nerlige Ramappa 

Add a helper to accumulate per-client runtime of all its
exec queues. This is called every time a sched job is finished.

v2:
  - Use guc_exec_queue_free_job() and execlist_job_free() to accumulate
runtime when job is finished since xe_sched_job_completed() is not a
notification that job finished.
  - Stop trying to update runtime from xe_exec_queue_fini() - that is
redundant and may happen after xef is closed, leading to a
use-after-free
  - Do not special case the first timestamp read: the default LRC sets
CTX_TIMESTAMP to zero, so even the first sample should be a valid
one.
  - Handle the parallel submission case by multiplying the runtime by
width.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/xe/xe_device_types.h |  9 +++
 drivers/gpu/drm/xe/xe_exec_queue.c   | 35 
 drivers/gpu/drm/xe/xe_exec_queue.h   |  1 +
 drivers/gpu/drm/xe/xe_execlist.c |  1 +
 drivers/gpu/drm/xe/xe_guc_submit.c   |  2 ++
 5 files changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
b/drivers/gpu/drm/xe/xe_device_types.h
index 906b98fb973b..de078bdf0ab9 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -560,6 +560,15 @@ struct xe_file {
struct mutex lock;
} exec_queue;
 
+   /**
+* @runtime: hw engine class runtime in ticks for this drm client
+*
+* Only stats from xe_exec_queue->lrc[0] are accumulated. For multi-lrc
+* case, since all jobs run in parallel on the engines, only the stats
+* from lrc[0] are sufficient.
+*/
+   u64 runtime[XE_ENGINE_CLASS_MAX];
+
/** @client: drm client */
struct xe_drm_client *client;
 };
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c 
b/drivers/gpu/drm/xe/xe_exec_queue.c
index 395de93579fa..86eb22e22c95 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -769,6 +769,41 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
q->lrc[0].fence_ctx.next_seqno - 1;
 }
 
+/**
+ * xe_exec_queue_update_runtime() - Update runtime for this exec queue from hw
+ * @q: The exec queue
+ *
+ * Update the timestamp saved by HW for this exec queue and save runtime
+ * calculated by using the delta from last update. On multi-lrc case, only the
+ * first is considered.
+ */
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q)
+{
+   struct xe_file *xef;
+   struct xe_lrc *lrc;
+   u32 old_ts, new_ts;
+
+   /*
+* Jobs that are run during driver load may use an exec_queue, but are
+* not associated with a user xe file, so avoid accumulating busyness
+* for kernel specific work.
+*/
+   if (!q->vm || !q->vm->xef)
+   return;
+
+   xef = q->vm->xef;
+
+   /*
+* Only sample the first LRC. For parallel submission, all of them are
+* scheduled together and we compensate that below by multiplying by
+* width
+*/
+   lrc = >lrc[0];
+
+   new_ts = xe_lrc_update_timestamp(lrc, _ts);
+   xef->runtime[q->class] += (new_ts - old_ts) * q->width;
+}
+
 void xe_exec_queue_kill(struct xe_exec_queue *q)
 {
struct xe_exec_queue *eq = q, *next;
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h 
b/drivers/gpu/drm/xe/xe_exec_queue.h
index 48f6da53a292..e0f07d28ee1a 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
@@ -75,5 +75,6 @@ struct dma_fence *xe_exec_queue_last_fence_get(struct 
xe_exec_queue *e,
   struct xe_vm *vm);
 void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm,
  struct dma_fence *fence);
+void xe_exec_queue_update_runtime(struct xe_exec_queue *q);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
index dece2785933c..a316431025c7 100644
--- a/drivers/gpu/drm/xe/xe_execlist.c
+++ b/drivers/gpu/drm/xe/xe_execlist.c
@@ -307,6 +307,7 @@ static void execlist_job_free(struct drm_sched_job *drm_job)
 {
struct xe_sched_job *job = to_xe_sched_job(drm_job);
 
+   xe_exec_queue_update_runtime(job->q);
xe_sched_job_put(job);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c 
b/drivers/gpu/drm/xe/xe_guc_submit.c
index d274a139010b..e0ebfe83dfe8 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -749,6 +749,8 @@ static void guc_exec_queue_free_job(struct drm_sched_job 
*drm_job)
 {
struct xe_sched_job *job = to_xe_sched_job(drm_job);
 
+   xe_exec_queue_update_runtime(job->q);
+
trace_xe_sched_job_free(job);
xe_sched_job_put(job);
 }
-- 
2.43.0