Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2022-10-21 Thread Umesh Nerlige Ramappa

On Fri, Oct 21, 2022 at 09:42:53AM +0100, Tvrtko Ursulin wrote:


On 27/10/2021 01:48, Umesh Nerlige Ramappa wrote:

[snip]


+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+   struct intel_guc *guc = container_of(wrk, typeof(*guc),
+timestamp.work.work);
+   struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+   struct intel_gt *gt = guc_to_gt(guc);
+   intel_wakeref_t wakeref;
+   unsigned long flags;
+   int srcu, ret;
+
+   /*
+* Synchronize with gt reset to make sure the worker does not
+* corrupt the engine/guc stats.
+*/
+   ret = intel_gt_reset_trylock(gt, &srcu);
+   if (ret)
+   return;
+
+   spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+   __update_guc_busyness_stats(guc);


Spotted one splat today: 
https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12268/bat-adlp-4/igt@i915_pm_...@basic-pci-d3-state.html

Could be that reset lock needs to be inside the rpm get. Haven't really thought 
about it much, could you please check?

<4> [300.214744]
<4> [300.214753] ==
<4> [300.214755] WARNING: possible circular locking dependency detected
<4> [300.214758] 6.1.0-rc1-CI_DRM_12268-g86e8558e3283+ #1 Not tainted
<4> [300.214761] --
<4> [300.214762] kworker/10:1H/265 is trying to acquire lock:
<4> [300.214765] 8275e560 (fs_reclaim){+.+.}-{0:0}, at: 
__kmem_cache_alloc_node+0x27/0x170
<4> [300.214780]
but task is already holding lock:
<4> [300.214782] c900013e7e78 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
process_one_work+0x1eb/0x5b0
<4> [300.214793]
which lock already depends on the new lock.
<4> [300.214794]
the existing dependency chain (in reverse order) is:
<4> [300.214796]
-> #2 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
<4> [300.214801]lock_acquire+0xd3/0x310
<4> [300.214806]__flush_work+0x77/0x4e0
<4> [300.214811]__cancel_work_timer+0x14e/0x1f0
<4> [300.214815]intel_guc_submission_reset_prepare+0x7a/0x420 [i915]
<4> [300.215119]intel_uc_reset_prepare+0x44/0x50 [i915]
<4> [300.215360]reset_prepare+0x21/0x80 [i915]
<4> [300.215561]intel_gt_reset+0x143/0x340 [i915]
<4> [300.215757]intel_gt_reset_global+0xeb/0x160 [i915]
<4> [300.215946]intel_gt_handle_error+0x2c2/0x410 [i915]
<4> [300.216137]intel_gt_debugfs_reset_store+0x59/0xc0 [i915]
<4> [300.216333]i915_wedged_set+0xc/0x20 [i915]
<4> [300.216513]simple_attr_write+0xda/0x100
<4> [300.216520]full_proxy_write+0x4e/0x80
<4> [300.216525]vfs_write+0xe3/0x4e0
<4> [300.216531]ksys_write+0x57/0xd0
<4> [300.216535]do_syscall_64+0x37/0x90
<4> [300.216542]entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.216549]
-> #1 (&gt->reset.mutex){+.+.}-{3:3}:
<4> [300.216556]lock_acquire+0xd3/0x310
<4> [300.216559]i915_gem_shrinker_taints_mutex+0x2d/0x50 [i915]


i915_gem_shrinker_taints_mutex seems to have something to do with 
fs_reclaim and so does the stack #0. Any idea what this early init is 
doing? Can this code also result in a gt_wedged case because that might 
explain the stack #2 which is a reset.



<4> [300.216799]intel_gt_init_reset+0x61/0x80 [i915]
<4> [300.217018]intel_gt_common_init_early+0x10c/0x190 [i915]
<4> [300.217227]intel_root_gt_init_early+0x44/0x60 [i915]
<4> [300.217434]i915_driver_probe+0x9ab/0xf30 [i915]
<4> [300.217615]i915_pci_probe+0xa5/0x240 [i915]
<4> [300.217796]pci_device_probe+0x95/0x110
<4> [300.217803]really_probe+0xd6/0x350
<4> [300.217811]__driver_probe_device+0x73/0x170
<4> [300.217816]driver_probe_device+0x1a/0x90
<4> [300.217821]__driver_attach+0xbc/0x190
<4> [300.217826]bus_for_each_dev+0x72/0xc0
<4> [300.217831]bus_add_driver+0x1bb/0x210
<4> [300.217835]driver_register+0x66/0xc0
<4> [300.217841]0xa093001f
<4> [300.217844]do_one_initcall+0x53/0x2f0
<4> [300.217849]do_init_module+0x45/0x1c0
<4> [300.217855]load_module+0x1d5e/0x1e90
<4> [300.217859]__do_sys_finit_module+0xaf/0x120
<4> [300.217864]do_syscall_64+0x37/0x90
<4> [300.217869]entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.217875]
-> #0 (fs_reclaim){+.+.}-{0:0}:
<4> [300.217880]validate_chain+0xb3d/0x2000
<4> [300.217884]__lock_acquire+0x5a4/0xb70
<4> [300.217888]lock_acquire+0xd3/0x310
<4> [300.217891]fs_reclaim_acquire+0xa1/0xd0


fs_reclaim ^


<4> [300.217896]__kmem_cache_alloc_node+0x27/0x170
<4> [300.217899]__kmalloc+0x43/0x1a0
<4> [300.217903]acpi_ns_internalize_name+0x44/0x9f
<4> [300.217909]acpi_ns_get_node_unlocked+0x6b/0xd7
<4> 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2022-10-21 Thread Tvrtko Ursulin



On 27/10/2021 01:48, Umesh Nerlige Ramappa wrote:

[snip]


+static void guc_timestamp_ping(struct work_struct *wrk)
+{
+   struct intel_guc *guc = container_of(wrk, typeof(*guc),
+timestamp.work.work);
+   struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
+   struct intel_gt *gt = guc_to_gt(guc);
+   intel_wakeref_t wakeref;
+   unsigned long flags;
+   int srcu, ret;
+
+   /*
+* Synchronize with gt reset to make sure the worker does not
+* corrupt the engine/guc stats.
+*/
+   ret = intel_gt_reset_trylock(gt, &srcu);
+   if (ret)
+   return;
+
+   spin_lock_irqsave(&guc->timestamp.lock, flags);
+
+   with_intel_runtime_pm(&gt->i915->runtime_pm, wakeref)
+   __update_guc_busyness_stats(guc);


Spotted one splat today: 
https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12268/bat-adlp-4/igt@i915_pm_...@basic-pci-d3-state.html

Could be that reset lock needs to be inside the rpm get. Haven't really thought 
about it much, could you please check?

<4> [300.214744]
<4> [300.214753] ==
<4> [300.214755] WARNING: possible circular locking dependency detected
<4> [300.214758] 6.1.0-rc1-CI_DRM_12268-g86e8558e3283+ #1 Not tainted
<4> [300.214761] --
<4> [300.214762] kworker/10:1H/265 is trying to acquire lock:
<4> [300.214765] 8275e560 (fs_reclaim){+.+.}-{0:0}, at: 
__kmem_cache_alloc_node+0x27/0x170
<4> [300.214780]
but task is already holding lock:
<4> [300.214782] c900013e7e78 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
process_one_work+0x1eb/0x5b0
<4> [300.214793]
which lock already depends on the new lock.
<4> [300.214794]
the existing dependency chain (in reverse order) is:
<4> [300.214796]
-> #2 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
<4> [300.214801]lock_acquire+0xd3/0x310
<4> [300.214806]__flush_work+0x77/0x4e0
<4> [300.214811]__cancel_work_timer+0x14e/0x1f0
<4> [300.214815]intel_guc_submission_reset_prepare+0x7a/0x420 [i915]
<4> [300.215119]intel_uc_reset_prepare+0x44/0x50 [i915]
<4> [300.215360]reset_prepare+0x21/0x80 [i915]
<4> [300.215561]intel_gt_reset+0x143/0x340 [i915]
<4> [300.215757]intel_gt_reset_global+0xeb/0x160 [i915]
<4> [300.215946]intel_gt_handle_error+0x2c2/0x410 [i915]
<4> [300.216137]intel_gt_debugfs_reset_store+0x59/0xc0 [i915]
<4> [300.216333]i915_wedged_set+0xc/0x20 [i915]
<4> [300.216513]simple_attr_write+0xda/0x100
<4> [300.216520]full_proxy_write+0x4e/0x80
<4> [300.216525]vfs_write+0xe3/0x4e0
<4> [300.216531]ksys_write+0x57/0xd0
<4> [300.216535]do_syscall_64+0x37/0x90
<4> [300.216542]entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.216549]
-> #1 (&gt->reset.mutex){+.+.}-{3:3}:
<4> [300.216556]lock_acquire+0xd3/0x310
<4> [300.216559]i915_gem_shrinker_taints_mutex+0x2d/0x50 [i915]
<4> [300.216799]intel_gt_init_reset+0x61/0x80 [i915]
<4> [300.217018]intel_gt_common_init_early+0x10c/0x190 [i915]
<4> [300.217227]intel_root_gt_init_early+0x44/0x60 [i915]
<4> [300.217434]i915_driver_probe+0x9ab/0xf30 [i915]
<4> [300.217615]i915_pci_probe+0xa5/0x240 [i915]
<4> [300.217796]pci_device_probe+0x95/0x110
<4> [300.217803]really_probe+0xd6/0x350
<4> [300.217811]__driver_probe_device+0x73/0x170
<4> [300.217816]driver_probe_device+0x1a/0x90
<4> [300.217821]__driver_attach+0xbc/0x190
<4> [300.217826]bus_for_each_dev+0x72/0xc0
<4> [300.217831]bus_add_driver+0x1bb/0x210
<4> [300.217835]driver_register+0x66/0xc0
<4> [300.217841]0xa093001f
<4> [300.217844]do_one_initcall+0x53/0x2f0
<4> [300.217849]do_init_module+0x45/0x1c0
<4> [300.217855]load_module+0x1d5e/0x1e90
<4> [300.217859]__do_sys_finit_module+0xaf/0x120
<4> [300.217864]do_syscall_64+0x37/0x90
<4> [300.217869]entry_SYSCALL_64_after_hwframe+0x63/0xcd
<4> [300.217875]
-> #0 (fs_reclaim){+.+.}-{0:0}:
<4> [300.217880]validate_chain+0xb3d/0x2000
<4> [300.217884]__lock_acquire+0x5a4/0xb70
<4> [300.217888]lock_acquire+0xd3/0x310
<4> [300.217891]fs_reclaim_acquire+0xa1/0xd0
<4> [300.217896]__kmem_cache_alloc_node+0x27/0x170
<4> [300.217899]__kmalloc+0x43/0x1a0
<4> [300.217903]acpi_ns_internalize_name+0x44/0x9f
<4> [300.217909]acpi_ns_get_node_unlocked+0x6b/0xd7
<4> [300.217914]acpi_ns_get_node+0x3b/0x54
<4> [300.217918]acpi_get_handle+0x89/0xb7
<4> [300.217922]acpi_has_method+0x1c/0x40
<4> [300.217928]acpi_pci_set_power_state+0x42/0xf0
<4> [300.217935]pci_power_up+0x20/0x1a0
<4> [300.217940]pci_pm_default_resume_early+0x9/0x30
<4> [300.217945]

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-27 Thread Matthew Brost
On Tue, Oct 26, 2021 at 05:48:21PM -0700, Umesh Nerlige Ramappa wrote:
> With GuC handling scheduling, i915 is not aware of the time that a
> context is scheduled in and out of the engine. Since i915 pmu relies on
> this info to provide engine busyness to the user, GuC shares this info
> with i915 for all engines using shared memory. For each engine, this
> info contains:
> 
> - total busyness: total time that the context was running (total)
> - id: id of the running context (id)
> - start timestamp: timestamp when the context started running (start)
> 
> At the time (now) of sampling the engine busyness, if the id is valid
> (!= ~0), and start is non-zero, then the context is considered to be
> active and the engine busyness is calculated using the below equation
> 
>   engine busyness = total + (now - start)
> 
> All times are obtained from the gt clock base. For inactive contexts,
> engine busyness is just equal to the total.
> 
> The start and total values provided by GuC are 32 bits and wrap around
> in a few minutes. Since perf pmu provides busyness as 64 bit
> monotonically increasing values, there is a need for this implementation
> to account for overflows and extend the time to 64 bits before returning
> busyness to the user. In order to do that, a worker runs periodically at
> frequency = 1/8th the time it takes for the timestamp to wrap. As an
> example, that would be once in 27 seconds for a gt clock frequency of
> 19.2 MHz.
> 
> Note:
> There might be an over-accounting of busyness due to the fact that GuC
> may be updating the total and start values while kmd is reading them.
> (i.e kmd may read the updated total and the stale start). In such a
> case, user may see higher busyness value followed by smaller ones which
> would eventually catch up to the higher value.
> 
> v2: (Tvrtko)
> - Include details in commit message
> - Move intel engine busyness function into execlist code
> - Use union inside engine->stats
> - Use natural type for ping delay jiffies
> - Drop active_work condition checks
> - Use for_each_engine if iterating all engines
> - Drop seq locking, use spinlock at GuC level to update engine stats
> - Document worker specific details
> 
> v3: (Tvrtko/Umesh)
> - Demarcate GuC and execlist stat objects with comments
> - Document known over-accounting issue in commit
> - Provide a consistent view of GuC state
> - Add hooks to gt park/unpark for GuC busyness
> - Stop/start worker in gt park/unpark path
> - Drop inline
> - Move spinlock and worker inits to GuC initialization
> - Drop helpers that are called only once
> 
> v4: (Tvrtko/Matt/Umesh)
> - Drop addressed opens from commit message
> - Get runtime pm in ping, remove from the park path
> - Use cancel_delayed_work_sync in disable_submission path
> - Update stats during reset prepare
> - Skip ping if reset in progress
> - Explicitly name execlists and GuC stats objects
> - Since disable_submission is called from many places, move resetting
>   stats to intel_guc_submission_reset_prepare
> 
> v5: (Tvrtko)
> - Add a trylock helper that does not sleep and synchronize PMU event
>   callbacks and worker with gt reset
> 
> v6: (CI BAT failures)
> - DUTs using execlist submission failed to boot since __gt_unpark is
>   called during i915 load. This ends up calling the GuC busyness unpark
>   hook and results in kick-starting an uninitialized worker. Let
>   park/unpark hooks check if GuC submission has been initialized.
> - drop cant_sleep() from trylock helper since rcu_read_lock takes care
>   of that.
> 
> v7: (CI) Fix igt@i915_selftest@live@gt_engines
> - For GuC mode of submission the engine busyness is derived from gt time
>   domain. Use gt time elapsed as reference in the selftest.
> - Increase busyness calculation to 10ms duration to ensure batch runs
>   longer and falls within the busyness tolerances in selftest.
> 
> v8:
> - Use ktime_get in selftest as before
> - intel_reset_trylock_no_wait results in a lockdep splat that is not
>   trivial to fix since the PMU callback runs in irq context and the
>   reset paths are tightly knit into the driver. The test that uncovers
>   this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
>   instead use the reset_count to synchronize with gt reset during pmu
>   callback. For the ping, continue to use intel_reset_trylock since ping
>   is not run in irq context.
> 
> - GuC PM timestamp does not tick when GuC is idle. This can potentially
>   result in wrong busyness values when a context is active on the
>   engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
>   process the GuC busyness stats. This works since both GuC timestamp and
>   RING timestamp are synced with the same clock.
> 
> - The busyness stats may get updated after the batch starts running.
>   This delay causes the busyness reported for 100us duration to fall
>   below 95% in the selftest. The only option at this time is to wait for
>   GuC busyness to change from idle to active 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-19 Thread Umesh Nerlige Ramappa

On Tue, Oct 19, 2021 at 09:32:07AM +0100, Tvrtko Ursulin wrote:


On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kick-starting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.


[snip]

diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c

index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
 struct i915_request *rq;
 ktime_t de, dt;
 ktime_t t[2];
+    u32 gt_stamp;
 if (!intel_engine_supports_stats(engine))
 continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
 ENGINE_TRACE(engine, "measuring idle time\n");
 preempt_disable();
de = intel_engine_get_busy_time(engine, &t[0]);
-    udelay(100);
+    gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+    udelay(1);
de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+    gt_stamp = intel_uncore_read(gt->uncore, 
GUCPMTIMESTAMP) - gt_stamp;

 preempt_enable();
-    dt = ktime_sub(t[1], t[0]);
+
+    dt = intel_engine_uses_guc(engine) ?
+ 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-19 Thread Tvrtko Ursulin



On 18/10/2021 19:35, Umesh Nerlige Ramappa wrote:

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kick-starting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.


[snip]

diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c

index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
 struct i915_request *rq;
 ktime_t de, dt;
 ktime_t t[2];
+    u32 gt_stamp;
 if (!intel_engine_supports_stats(engine))
 continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
 ENGINE_TRACE(engine, "measuring idle time\n");
 preempt_disable();
 de = intel_engine_get_busy_time(engine, &t[0]);
-    udelay(100);
+    gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+    udelay(1);
 de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+    gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
gt_stamp;

 preempt_enable();
-    dt = ktime_sub(t[1], t[0]);
+
+    dt = intel_engine_uses_guc(engine) ?
+ intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+ ktime_sub(t[1], t[0]);


But 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-18 Thread Umesh Nerlige Ramappa

On Mon, Oct 18, 2021 at 11:35:44AM -0700, Umesh Nerlige Ramappa wrote:

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
 stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
 callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
 called during i915 load. This ends up calling the guc busyness unpark
 hook and results in kick-starting an uninitialized worker. Let
 park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
 of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
 domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
 longer and falls within the busyness tolerances in selftest.


[snip]


diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
struct i915_request *rq;
ktime_t de, dt;
ktime_t t[2];
+   u32 gt_stamp;
if (!intel_engine_supports_stats(engine))
continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
ENGINE_TRACE(engine, "measuring idle time\n");
preempt_disable();
de = intel_engine_get_busy_time(engine, &t[0]);
-   udelay(100);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+   udelay(1);
de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
gt_stamp;
preempt_enable();
-   dt = ktime_sub(t[1], t[0]);
+
+   dt = 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-18 Thread Umesh Nerlige Ramappa

On Mon, Oct 18, 2021 at 08:58:01AM +0100, Tvrtko Ursulin wrote:



On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
  called during i915 load. This ends up calling the guc busyness unpark
  hook and results in kick-starting an uninitialized worker. Let
  park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
  of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
  domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
  longer and falls within the busyness tolerances in selftest.


[snip]


diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c 
b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
struct i915_request *rq;
ktime_t de, dt;
ktime_t t[2];
+   u32 gt_stamp;
if (!intel_engine_supports_stats(engine))
continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
ENGINE_TRACE(engine, "measuring idle time\n");
preempt_disable();
de = intel_engine_get_busy_time(engine, [0]);
-   udelay(100);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+   udelay(1);
de = ktime_sub(intel_engine_get_busy_time(engine, [1]), de);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
gt_stamp;
preempt_enable();
-   dt = ktime_sub(t[1], t[0]);
+
+   dt = intel_engine_uses_guc(engine) ?
+

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-18 Thread Tvrtko Ursulin




On 16/10/2021 00:47, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
   stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
   callbacks and worker with gt reset

v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
   called during i915 load. This ends up calling the guc busyness unpark
   hook and results in kickstarting an uninitialized worker. Let
   park/unpark hooks check if guc submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
   of that.

v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For guc mode of submission the engine busyness is derived from gt time
   domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
   longer and falls within the busyness tolerances in selftest.


[snip]

  
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c

index 75569666105d..24358bef6691 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -234,6 +234,7 @@ static int live_engine_busy_stats(void *arg)
struct i915_request *rq;
ktime_t de, dt;
ktime_t t[2];
+   u32 gt_stamp;
  
  		if (!intel_engine_supports_stats(engine))

continue;
@@ -251,10 +252,16 @@ static int live_engine_busy_stats(void *arg)
ENGINE_TRACE(engine, "measuring idle time\n");
preempt_disable();
de = intel_engine_get_busy_time(engine, [0]);
-   udelay(100);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP);
+   udelay(1);
de = ktime_sub(intel_engine_get_busy_time(engine, [1]), de);
+   gt_stamp = intel_uncore_read(gt->uncore, GUCPMTIMESTAMP) - 
gt_stamp;
preempt_enable();
-   dt = ktime_sub(t[1], t[0]);
+
+   dt = intel_engine_uses_guc(engine) ?
+intel_gt_clock_interval_to_ns(engine->gt, gt_stamp) :
+

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-14 Thread Umesh Nerlige Ramappa

On Thu, Oct 14, 2021 at 09:21:28AM +0100, Tvrtko Ursulin wrote:


On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset

Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c |  16 ++
 drivers/gpu/drm/i915/gt/intel_reset.h |   1 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h   |   2 +
 14 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
 }
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = >stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is executing something at the moment
-* add it to the total.
-*/
-   *now = ktime_get();
-   if (READ_ONCE(stats->active))
-   total = ktime_add(total, ktime_sub(*now, stats->start));
-
-   return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-14 Thread Tvrtko Ursulin



On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
   stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
   callbacks and worker with gt reset

Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +-
  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
  drivers/gpu/drm/i915/gt/intel_reset.c |  16 ++
  drivers/gpu/drm/i915/gt/intel_reset.h |   1 +
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  30 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
  drivers/gpu/drm/i915/i915_reg.h   |   2 +
  14 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
  }
  
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,

-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = >stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is executing something at the moment
-* add it to the total.
-*/
-   *now = ktime_get();
-   if (READ_ONCE(stats->active))
-   total = ktime_add(total, ktime_sub(*now, stats->start));
-
-   return total;
-}
-
  /**
   * intel_engine_get_busy_time() - Return current accumulated engine busyness
   * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-13 Thread Umesh Nerlige Ramappa

On Wed, Oct 13, 2021 at 05:06:26PM +0100, Tvrtko Ursulin wrote:


On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
  callbacks and worker with gt reset


Looks good to me now, for some combination of high level and 
incomplete low level review (I did not check the overflow handling or 
the GuC page layout and flow.). Both patches:


Acked-by: Tvrtko Ursulin 


Thanks



Do you have someone available to check the parts I did not and r-b?


I will check with Matt/John.

Regards,
Umesh


Regards,

Tvrtko



Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c |  16 ++
 drivers/gpu/drm/i915/gt/intel_reset.h |   1 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|  30 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h   |   2 +
 14 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
 }
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = >stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-13 Thread Tvrtko Ursulin



On 13/10/2021 01:56, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
   stats to intel_guc_submission_reset_prepare

v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
   callbacks and worker with gt reset


Looks good to me now, for some combination of high level and incomplete 
low level review (I did not check the overflow handling or the GuC page 
layout and flow.). Both patches:


Acked-by: Tvrtko Ursulin 

Do you have someone available to check the parts I did not and r-b?

Regards,

Tvrtko



Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +-
  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
  drivers/gpu/drm/i915/gt/intel_reset.c |  16 ++
  drivers/gpu/drm/i915/gt/intel_reset.h |   1 +
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  30 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 267 ++
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
  drivers/gpu/drm/i915/i915_reg.h   |   2 +
  14 files changed, 427 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
  }
  
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,

-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = >stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is executing something at the moment
-* add it to the total.
-*/
-   *now = 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-12 Thread Tvrtko Ursulin



On 11/10/2021 21:08, Umesh Nerlige Ramappa wrote:

On Mon, Oct 11, 2021 at 12:41:19PM +0100, Tvrtko Ursulin wrote:


On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +--
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  26 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h   |   2 +
 12 files changed, 377 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c

index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs 
*engine,

 intel_engine_print_breadcrumbs(engine, m);
 }
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs 
*engine,

-    ktime_t *now)
-{
-    struct intel_engine_execlists_stats *stats = 
>stats.execlists;

-    ktime_t total = stats->total;
-
-    /*
- * If the engine is executing something at the moment
- * add it to the total.
- */
-    *now = ktime_get();
-    if (READ_ONCE(stats->active))
-    total = ktime_add(total, ktime_sub(*now, stats->start));
-
-    return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine 
busyness

  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t 
__intel_engine_get_busy_time(struct intel_engine_cs *engine,

  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, 
ktime_t *now)

 {
-    struct 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-11 Thread Umesh Nerlige Ramappa

On Mon, Oct 11, 2021 at 12:41:19PM +0100, Tvrtko Ursulin wrote:


On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
  stats to intel_guc_submission_reset_prepare

Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +--
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
 .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|  26 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
 drivers/gpu/drm/i915/i915_reg.h   |   2 +
 12 files changed, 377 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
 }
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,
-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is executing something at the moment
-* add it to the total.
-*/
-   *now = ktime_get();
-   if (READ_ONCE(stats->active))
-   total = ktime_add(total, ktime_sub(*now, stats->start));
-
-   return total;
-}
-
 /**
  * intel_engine_get_busy_time() - Return current accumulated engine busyness
  * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct 
intel_engine_cs *engine,
  */
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t 
*now)
 {
-   struct 

Re: [PATCH 2/2] drm/i915/pmu: Connect engine busyness stats from GuC to pmu

2021-10-11 Thread Tvrtko Ursulin



On 07/10/2021 23:55, Umesh Nerlige Ramappa wrote:

With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:

- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)

At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation

engine busyness = total + (now - start)

All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.

The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.

Note:
There might be an overaccounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.

v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at guc level to update engine stats
- Document worker specific details

v3: (Tvrtko/Umesh)
- Demarcate guc and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of guc state
- Add hooks to gt park/unpark for guc busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to guc initialization
- Drop helpers that are called only once

v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and guc stats objects
- Since disable_submission is called from many places, move resetting
   stats to intel_guc_submission_reset_prepare

Signed-off-by: John Harrison 
Signed-off-by: Umesh Nerlige Ramappa 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  28 +--
  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  33 ++-
  .../drm/i915/gt/intel_execlists_submission.c  |  34 +++
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |   2 +
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  26 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c|  21 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h|   5 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  13 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 238 ++
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |   2 +
  drivers/gpu/drm/i915/i915_reg.h   |   2 +
  12 files changed, 377 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 38436f4b5706..6b783fdcba2a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1873,23 +1873,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
intel_engine_print_breadcrumbs(engine, m);
  }
  
-static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine,

-   ktime_t *now)
-{
-   struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-   ktime_t total = stats->total;
-
-   /*
-* If the engine is executing something at the moment
-* add it to the total.
-*/
-   *now = ktime_get();
-   if (READ_ONCE(stats->active))
-   total = ktime_add(total, ktime_sub(*now, stats->start));
-
-   return total;
-}
-
  /**
   * intel_engine_get_busy_time() - Return current accumulated engine busyness
   * @engine: engine to report on
@@ -1899,16 +1882,7 @@ static ktime_t __intel_engine_get_busy_time(struct 
intel_engine_cs *engine,
   */
  ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t 
*now)
  {
-   struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
-