Re: [PATCH v2 2/2] drm/i915: Fix gt reset with GuC submission is disabled

2024-04-22 Thread John Harrison

On 4/22/2024 13:19, Nirmoy Das wrote:

Currently intel_gt_reset() kills the GuC and then resets the requested
engines. This is problematic because there is a dedicated CSB FIFO
which only GuC can access, and if that FIFO fills up, the hardware
will block on the next context switch until there is space, which means
the system is effectively hung. If an engine is reset whilst actively
executing a context, a CSB entry will be sent to say that the context
has gone idle. Thus, if a reset happens on a very busy system, then
killing GuC before killing the engines will lead to a deadlock because
the CSB FIFO fills up.

To address this issue, the GuC should be killed only after resetting
the requested engines and before calling intel_gt_init_hw().

v2: Improve commit message(John)

Cc: John Harrison 
Signed-off-by: Nirmoy Das 

Reviewed-by: John Harrison 


---
  drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index b1393863ca9b..6161f7a3ff70 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
  
-	/* For GuC mode, ensure submission is disabled before stopping ring */

-   intel_uc_reset_prepare(>uc);
+   /**
+* For GuC mode with submission enabled, ensure submission
+* is disabled before stopping ring.
+*
+* For GuC mode with submission disabled, ensure that GuC is not
+* sanitized, do that after engine reset. reset_prepare()
+* is followed by engine reset which in this mode requires GuC to
+* process any CSB FIFO entries generated by the resets.
+*/
+   if (intel_uc_uses_guc_submission(>uc))
+   intel_uc_reset_prepare(>uc);
  
  	for_each_engine(engine, gt, id) {

if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1236,9 @@ void intel_gt_reset(struct intel_gt *gt,
  
  	intel_overlay_reset(gt->i915);
  
+	/* sanitize uC after engine reset */

+   if (!intel_uc_uses_guc_submission(>uc))
+   intel_uc_reset_prepare(>uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...




Re: [PATCH v2 1/2] drm/i915: Refactor confusing __intel_gt_reset()

2024-04-22 Thread John Harrison

On 4/22/2024 13:19, Nirmoy Das wrote:

__intel_gt_reset() is really for resetting engines, though
the name might suggest something else. So add a helper function
to remove the confusion, with no functional changes.

v2: Move intel_gt_reset_all_engines() next to
 intel_gt_reset_engine() to make diff simple(John)

Cc: John Harrison 
Signed-off-by: Nirmoy Das 

Reviewed-by: John Harrison 


---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
  .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
  drivers/gpu/drm/i915/gt/intel_gt.c|  2 +-
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |  2 +-
  drivers/gpu/drm/i915/gt/intel_reset.c | 35 +++
  drivers/gpu/drm/i915/gt/intel_reset.h |  3 +-
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
  drivers/gpu/drm/i915/i915_driver.c|  2 +-
  8 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 8c44af1c3451..5c8e9ee3b008 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt)
 */
GEM_BUG_ON(intel_gt_pm_is_awake(gt));
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
  
  	/* Decouple the backend; but keep the layout for late GPU resets */

for_each_engine(engine, gt, id) {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 355aab5b38ba..21829439e686 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs 
*engine)
drm_err(>i915->drm,
"engine '%s' resumed still in error: %08x\n",
engine->name, status);
-   __intel_gt_reset(engine->gt, engine->mask);
+   intel_gt_reset_engine(engine);
}
  
  	/*

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 580b5141ce1e..626b166e67ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt)
  
  	/* Scrub all HW state upon release */

with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
  }
  
  void intel_gt_driver_release(struct intel_gt *gt)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 220ac4f92edf..c08fdb65cc69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt)
if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
return false;
  
-	return __intel_gt_reset(gt, ALL_ENGINES) == 0;

+   return intel_gt_reset_all_engines(gt) == 0;
  }
  
  static void gt_sanitize(struct intel_gt *gt, bool force)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..b1393863ca9b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 HECI_H_GS1_ER_PREP, 0);
  }
  
-int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)

+static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
  {
const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
reset_func reset;
@@ -978,7 +978,7 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
  
  	/* Even if the GPU reset fails, it should still stop the engines */

if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
  
  	for_each_engine(engine, gt, id)

engine->submit_request = nop_submit_request;
@@ -1089,7 +1089,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
/* We must reset pending GPU events before restoring our submission */
ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
+   ok = intel_gt_reset_all_engines(gt) == 0;
if (!ok) {
/*
 * Warn CI about the unrecoverable wedged condition.
@@ -1133,10 +1133,10 @@ static int do_reset(struct intel_gt *

Re: [PATCH 3/3] drm/i915: Fix gt reset with GuC submission disabled

2024-04-18 Thread John Harrison

On 4/18/2024 10:10, Nirmoy Das wrote:

Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
   intel_gt_reset_all_engines()
 *_engine_reset_prepare() -->RESET_CTL expects running GuC
Not technically correct. There is no direct connection between RESET_CTL 
and GuC.



 *_reset_engines()
intel_gt_init_hw() --> GuC comes out of GS_MIA_IN_RESET with FW loaded.

Fix the issue by sanitizing the GuC only after resetting requested
engines and before intel_gt_init_hw().

You never actually state what the issue is.

The problem is that there is a dedicated CSB FIFO going to GuC (and 
nothing else has access to it). If that FIFO fills up, the hardware will 
block on the next context switch until there is space. If no-one (i.e. 
GuC) is draining it, that means the system is effectively hung. If an 
engine is reset whilst actively executing a context, a CSB entry will be 
sent to say that the context has gone idle. Thus if you reset a very 
busy system and start with killing GuC before killing the engines and 
only then re-enabling GuC, you run the risk of generating more CSB 
entries than will fit in the FIFO and deadlocking. Whereas, if the 
system is idle then you can reset the engines as much as you like while 
GuC is dead and it won't be a problem.




Note intel_uc_reset_finish() and intel_uc_reset() are nop when
guc submission is disabled.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 6504e8ba9c58..bd166f5aca4b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -907,8 +907,17 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
  
-	/* For GuC mode, ensure submission is disabled before stopping ring */

-   intel_uc_reset_prepare(>uc);
+   /**
+* For GuC mode with submission enabled, ensure submission
+* is disabled before stopping ring.
+*
+* For GuC mode with submission disabled, ensure that GuC is not
+* sanitized, do that at the end in reset_finish(). reset_prepare()
+* is followed by engine reset which in this mode requires GuC to
+* be functional to process engine reset events.

-> to process any CSB FIFO entries generated by the resets.

John.


+*/
+   if (intel_uc_uses_guc_submission(>uc))
+   intel_uc_reset_prepare(>uc);
  
  	for_each_engine(engine, gt, id) {

if (intel_engine_pm_get_if_awake(engine))
@@ -1255,6 +1264,9 @@ void intel_gt_reset(struct intel_gt *gt,
  
  	intel_overlay_reset(gt->i915);
  
+	/* sanitize uC after engine reset */

+   if (!intel_uc_uses_guc_submission(>uc))
+   intel_uc_reset_prepare(>uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...




Re: [PATCH 2/3] drm/i915 Rename intel_engine_reset to intel_gt_engine_recover

2024-04-18 Thread John Harrison

On 4/18/2024 10:10, Nirmoy Das wrote:

intel_engine_reset() not only resets an engine but also
tries to recover it, so give it a proper name without
any functional changes.
Not seeing what the difference is. If this was a super low level 
function (with an __ prefix for example) then one might expect it to 
literally just poke the reset register and leave the engine in a dead 
state. But as a high level function, I think it is reasonable to expect 
a reset function to 'recover' the entity being reset.


Also, many of the callers are tests that are explicitly testing reset. 
So now the tests all talk about attempting resets, resets failing, etc. 
but around a call to 'recover' instead of 'reset', which seems confusing.


John.



Signed-off-by: Nirmoy Das 
---
  .../drm/i915/gem/selftests/i915_gem_context.c |  2 +-
  .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
  drivers/gpu/drm/i915/gt/intel_reset.c |  4 ++--
  drivers/gpu/drm/i915/gt/intel_reset.h |  4 ++--
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  | 20 +--
  drivers/gpu/drm/i915/gt/selftest_mocs.c   |  4 ++--
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
  .../gpu/drm/i915/gt/selftest_workarounds.c|  6 +++---
  8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
index 89d4dc8b60c6..4f4cde55f621 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
@@ -1171,7 +1171,7 @@ __sseu_finish(const char *name,
int ret = 0;
  
  	if (flags & TEST_RESET) {

-   ret = intel_engine_reset(ce->engine, "sseu");
+   ret = intel_gt_engine_recover(ce->engine, "sseu");
if (ret)
goto out;
}
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 21829439e686..9485a622a704 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2404,7 +2404,7 @@ static void execlists_reset(struct intel_engine_cs 
*engine, const char *msg)
  
  	ring_set_paused(engine, 1); /* Freeze the current request in place */

execlists_capture(engine);
-   intel_engine_reset(engine, msg);
+   intel_gt_engine_recover(engine, msg);
  
  	tasklet_enable(>sched_engine->tasklet);

clear_and_wake_up_bit(bit, lock);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index b825daace58e..6504e8ba9c58 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1348,7 +1348,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs 
*engine, const char *msg)
  }
  
  /**

- * intel_engine_reset - reset GPU engine to recover from a hang
+ * intel_gt_engine_recover - reset GPU engine to recover from a hang
   * @engine: engine to reset
   * @msg: reason for GPU reset; or NULL for no drm_notice()
   *
@@ -1360,7 +1360,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs 
*engine, const char *msg)
   *  - reset engine (which will force the engine to idle)
   *  - re-init/configure engine
   */
-int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
+int intel_gt_engine_recover(struct intel_engine_cs *engine, const char *msg)
  {
int err;
  
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h

index c00de353075c..be984357bf27 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.h
+++ b/drivers/gpu/drm/i915/gt/intel_reset.h
@@ -31,8 +31,8 @@ void intel_gt_handle_error(struct intel_gt *gt,
  void intel_gt_reset(struct intel_gt *gt,
intel_engine_mask_t stalled_mask,
const char *reason);
-int intel_engine_reset(struct intel_engine_cs *engine,
-  const char *reason);
+int intel_gt_engine_recover(struct intel_engine_cs *engine,
+   const char *reason);
  int __intel_engine_reset_bh(struct intel_engine_cs *engine,
const char *reason);
  
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c

index 9ce8ff1c04fe..9bfda3f2bd24 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -495,9 +495,9 @@ static int igt_reset_nop_engine(void *arg)
  
  i915_request_add(rq);

}
-   err = intel_engine_reset(engine, NULL);
+   err = intel_gt_engine_recover(engine, NULL);
if (err) {
-   pr_err("intel_engine_reset(%s) failed, 
err:%d\n",
+   pr_err("intel_gt_engine_recover(%s) failed, 
err:%d\n",
   

Re: [PATCH 1/3] drm/i915: Refactor confusing __intel_gt_reset()

2024-04-18 Thread John Harrison

On 4/18/2024 10:10, Nirmoy Das wrote:

__intel_gt_reset() is really for resetting engines, though
the name might suggest something else. So add two helper functions
to remove the confusion, with no functional changes.
Technically you only added one and just moved the other :). It already 
existed, it just wasn't being used everywhere that it could be!




Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_engine_cs.c |  2 +-
  .../drm/i915/gt/intel_execlists_submission.c  |  2 +-
  drivers/gpu/drm/i915/gt/intel_gt.c|  2 +-
  drivers/gpu/drm/i915/gt/intel_gt_pm.c |  2 +-
  drivers/gpu/drm/i915/gt/intel_reset.c | 43 ++-
  drivers/gpu/drm/i915/gt/intel_reset.h |  3 +-
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  2 +-
  drivers/gpu/drm/i915/i915_driver.c|  2 +-
  8 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 8c44af1c3451..5c8e9ee3b008 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -678,7 +678,7 @@ void intel_engines_release(struct intel_gt *gt)
 */
GEM_BUG_ON(intel_gt_pm_is_awake(gt));
if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
  
  	/* Decouple the backend; but keep the layout for late GPU resets */

for_each_engine(engine, gt, id) {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 355aab5b38ba..21829439e686 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2898,7 +2898,7 @@ static void enable_error_interrupt(struct intel_engine_cs 
*engine)
drm_err(>i915->drm,
"engine '%s' resumed still in error: %08x\n",
engine->name, status);
-   __intel_gt_reset(engine->gt, engine->mask);
+   intel_gt_reset_engine(engine);
}
  
  	/*

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 580b5141ce1e..626b166e67ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -832,7 +832,7 @@ void intel_gt_driver_unregister(struct intel_gt *gt)
  
  	/* Scrub all HW state upon release */

with_intel_runtime_pm(gt->uncore->rpm, wakeref)
-   __intel_gt_reset(gt, ALL_ENGINES);
+   intel_gt_reset_all_engines(gt);
  }
  
  void intel_gt_driver_release(struct intel_gt *gt)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c 
b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 220ac4f92edf..c08fdb65cc69 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -159,7 +159,7 @@ static bool reset_engines(struct intel_gt *gt)
if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
return false;
  
-	return __intel_gt_reset(gt, ALL_ENGINES) == 0;

+   return intel_gt_reset_all_engines(gt) == 0;
  }
  
  static void gt_sanitize(struct intel_gt *gt, bool force)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..b825daace58e 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -764,7 +764,7 @@ wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
 HECI_H_GS1_ER_PREP, 0);
  }
  
-int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)

+static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t 
engine_mask)
  {
const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
reset_func reset;
@@ -795,6 +795,34 @@ int __intel_gt_reset(struct intel_gt *gt, 
intel_engine_mask_t engine_mask)
return ret;
  }
  
+/**

+ * intel_gt_reset_all_engines() - Reset all engines in the given gt.
+ * @gt: the GT to reset all engines for.
+ *
+ * This function resets all engines within the given gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_all_engines(struct intel_gt *gt)
+{
+   return __intel_gt_reset(gt, ALL_ENGINES);
+}
+
+/**
+ * intel_gt_reset_engine() - Reset a specific engine within a gt.
+ * @engine: engine to be reset.
+ *
+ * This function resets the specified engine within a gt.
+ *
+ * Returns:
+ * Zero on success, negative error code on failure.
+ */
+int intel_gt_reset_engine(struct intel_engine_cs *engine)
+{
+   return __intel_gt_reset(engine->gt, engine->mask);
+}
+
You could have just dropped the 'static' from the existing copy of this 
function and added the new version next to it. That would make the diff 
simpler and therefore clearer. Unless you think there is a good 

Re: [RFC PATCH] drm/i915: Don't reset GuC before engine reset on full GT reset

2024-04-16 Thread John Harrison

On 4/15/2024 09:44, Nirmoy Das wrote:

Currently intel_gt_reset() happens as follows:

reset_prepare() ---> Sends GDRST to GuC, GuC is in GS_MIA_IN_RESET
do_reset()
__intel_gt_reset()
*_engine_reset_prepare() -->RESET_CTL expects running
GuC
*_reset_engines()
intel_gt_init_hw() --> GuC FW loading happens, GuC comes out of
GS_MIA_IN_RESET.

Fix the above flow so that the GuC reset happens after all the
engine resets are done.

Cc: John Harrison 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_reset.c |  9 --
  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 42 +--
  drivers/gpu/drm/i915/gt/uc/intel_uc.h |  1 +
  3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index c8e9aa41fdea..9ebd68ce0c22 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -879,8 +879,11 @@ static intel_engine_mask_t reset_prepare(struct intel_gt 
*gt)
intel_engine_mask_t awake = 0;
enum intel_engine_id id;
  
-	/* For GuC mode, ensure submission is disabled before stopping ring */

-   intel_uc_reset_prepare(>uc);
+   /*
+* For GuC mode, ensure submission is disabled before stopping ring.
+* Don't reset the GuC a engine reset requires GuC to be running.
These two lines appear to be mutually exclusive unless there is a test 
for GuC submission being enabled, which I am not seeing. Note that 
"ensure submission is disabled" means "reset the GuC".



+*/
+   intel_uc_reset_prepare_without_guc_reset(>uc);
  
  	for_each_engine(engine, gt, id) {

if (intel_engine_pm_get_if_awake(engine))
@@ -1227,6 +1230,8 @@ void intel_gt_reset(struct intel_gt *gt,
  
  	intel_overlay_reset(gt->i915);
  
+	/* Now that all engines are clean, Reset the GuC */

+   intel_uc_reset_prepare(>uc);
/*
 * Next we need to restore the context, but we don't use those
 * yet either...
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 7a63abf8f644..5feee4db2ccc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -345,7 +345,7 @@ static void __uc_fini(struct intel_uc *uc)
intel_guc_fini(>guc);
  }
  
-static int __uc_sanitize(struct intel_uc *uc)

+static void __uc_sanitize_without_guc_reset(struct intel_uc *uc)
  {
struct intel_guc *guc = >guc;
struct intel_huc *huc = >huc;
@@ -354,7 +354,11 @@ static int __uc_sanitize(struct intel_uc *uc)
  
  	intel_huc_sanitize(huc);

intel_guc_sanitize(guc);
+}
This seems like an extremely bad idea. You are wiping out all the GuC 
communication structures on the host side while the GuC itself is still 
executing and using those same structures.


Is the failure when doing individual engine resets or when doing a full 
GT reset?


If the former, I think a better approach would be to just not reset GuC 
at all (or indeed any UC) if not using GuC submission. Although, looking 
at the code, I'm not seeing an engine only reset path that does nuke the 
UC layers?


If it is the latter, then how/why are individual engine resets happening 
in the middle of a full GT reset? Don't we just splat everything all at 
once? Either way, it would be safer to split at the GT reset code layer 
rather than inside the UC layer. That is, when not using GuC submission, 
do the entire prepare/reset/init sequence of the UC layers as one 
'atomic' operation either before the GT/engine reset or after it (or 
potentially both before and after?).


John.


  
+static int __uc_sanitize(struct intel_uc *uc)

+{
+   __uc_sanitize_without_guc_reset(uc);
return __intel_uc_reset_hw(uc);
  }
  
@@ -593,13 +597,7 @@ static void __uc_fini_hw(struct intel_uc *uc)

__uc_sanitize(uc);
  }
  
-/**

- * intel_uc_reset_prepare - Prepare for reset
- * @uc: the intel_uc structure
- *
- * Preparing for full gpu reset.
- */
-void intel_uc_reset_prepare(struct intel_uc *uc)
+static void __intel_uc_reset_prepare(struct intel_uc *uc, bool reset_guc)
  {
struct intel_guc *guc = >guc;
  
@@ -617,9 +615,35 @@ void intel_uc_reset_prepare(struct intel_uc *uc)

intel_guc_submission_reset_prepare(guc);
  
  sanitize:

-   __uc_sanitize(uc);
+   if (reset_guc)
+   __uc_sanitize(uc);
+   else
+   __uc_sanitize_without_guc_reset(uc);
  }
  
+/**

+ * intel_uc_reset_prepare - Prepare for reset
+ * @uc: the intel_uc structure
+ *
+ * Preparing for full gpu reset.
+ */
+void intel_uc_reset_prepare(struct intel_uc *uc)
+{
+   __intel_uc_reset_prepare(uc, true);
+}
+/**
+ * intel_uc_reset_prepare_without_guc_reset - Prepare for reset but don't reset
+ * the GuC
+ * @uc: the intel_uc struc

Re: [PATCH] drm/i915/dg2: wait for HuC load completion before running selftests

2024-04-16 Thread John Harrison

On 4/10/2024 13:15, Daniele Ceraolo Spurio wrote:

On DG2, submissions to VCS engines tied to a gem context are blocked
until the HuC is loaded. Since some selftests do use a gem context,
wait for the HuC load to complete before running the tests to avoid
contamination.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10564
Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 

Reviewed-by: John Harrison 


---
  .../gpu/drm/i915/selftests/i915_selftest.c| 36 ---
  1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/i915_selftest.c 
b/drivers/gpu/drm/i915/selftests/i915_selftest.c
index ee79e0809a6d..fee76c1d2f45 100644
--- a/drivers/gpu/drm/i915/selftests/i915_selftest.c
+++ b/drivers/gpu/drm/i915/selftests/i915_selftest.c
@@ -154,6 +154,30 @@ __wait_gsc_proxy_completed(struct drm_i915_private *i915)
pr_warn(DRIVER_NAME "Timed out waiting for 
gsc_proxy_completion!\n");
  }
  
+static void

+__wait_gsc_huc_load_completed(struct drm_i915_private *i915)
+{
+   /* this only applies to DG2, so we only care about GT0 */
+   struct intel_huc *huc = &to_gt(i915)->uc.huc;
+   bool need_to_wait = (IS_ENABLED(CONFIG_INTEL_MEI_PXP) &&
+intel_huc_wait_required(huc));
+   /*
+* The GSC and PXP mei bringup depends on the kernel boot ordering, so
+* to account for the worst case scenario the HuC code waits for up to
+* 10s for the GSC driver to load and then another 5s for the PXP
+* component to bind before giving up, even though those steps normally
+* complete in less than a second from the i915 load. We match that
+* timeout here, but we expect to bail early due to the fence being
+* signalled even in a failure case, as it is extremely unlikely that
+* both components will use their full timeout.
+*/
+   unsigned long timeout_ms = 15000;
+
+   if (need_to_wait &&
+   wait_for(i915_sw_fence_done(&huc->delayed_load.fence), timeout_ms))
+   pr_warn(DRIVER_NAME "Timed out waiting for huc load via 
GSC!\n");
+}
+
  static int __run_selftests(const char *name,
   struct selftest *st,
   unsigned int count,
@@ -228,14 +252,16 @@ int i915_mock_selftests(void)
  
  int i915_live_selftests(struct pci_dev *pdev)

  {
+   struct drm_i915_private *i915 = pdev_to_i915(pdev);
int err;
  
  	if (!i915_selftest.live)

return 0;
  
-	__wait_gsc_proxy_completed(pdev_to_i915(pdev));

+   __wait_gsc_proxy_completed(i915);
+   __wait_gsc_huc_load_completed(i915);
  
-	err = run_selftests(live, pdev_to_i915(pdev));

+   err = run_selftests(live, i915);
if (err) {
i915_selftest.live = err;
return err;
@@ -251,14 +277,16 @@ int i915_live_selftests(struct pci_dev *pdev)
  
  int i915_perf_selftests(struct pci_dev *pdev)

  {
+   struct drm_i915_private *i915 = pdev_to_i915(pdev);
int err;
  
  	if (!i915_selftest.perf)

return 0;
  
-	__wait_gsc_proxy_completed(pdev_to_i915(pdev));

+   __wait_gsc_proxy_completed(i915);
+   __wait_gsc_huc_load_completed(i915);
  
-	err = run_selftests(perf, pdev_to_i915(pdev));

+   err = run_selftests(perf, i915);
if (err) {
i915_selftest.perf = err;
return err;




Re: [PATCH] drm/i915/guc: Update w/a 14019159160

2024-03-12 Thread John Harrison

On 3/12/2024 09:24, Matt Roper wrote:

On Thu, Mar 07, 2024 at 06:01:29PM -0800, john.c.harri...@intel.com wrote:

From: John Harrison 

An existing workaround has been extended, both in the platforms affected
and in implementation complexity.

Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h |  3 ++-
  drivers/gpu/drm/i915/gt/uc/intel_guc.c|  3 ++-
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c| 21 ++-
  3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h
index bebf28e3c4794..3e7060e859794 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h
@@ -105,7 +105,8 @@ enum {
   * Workaround keys:
   */
  enum {
-   GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE   = 
0x9001,
+   GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE   = 
0x9001,   /* Wa_14019159160 */
+   GUC_WORKAROUND_KLV_AVOID_GFX_CLEAR_WHILE_ACTIVE = 
0x9006,   /* Wa_14019159160 */
  };
  
  #endif /* _ABI_GUC_KLVS_ABI_H */

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 0c67d674c94de..4c3dae98656af 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -296,7 +296,8 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc)
  
  	/* Wa_16019325821 */

/* Wa_14019159160 */
-   if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)))
+   if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||

 From what I can see, this workaround is also needed on Xe_LPG+ (12.74)

Isn't that an Xe platform? Or is 12.74 just ARL?

John.


now.


Matt


+   IS_DG2(gt->i915))
flags |= GUC_WA_RCS_CCS_SWITCHOUT;
  
  	/*

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 5c9908b56616e..00fe3c21a9b1c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -815,23 +815,23 @@ guc_capture_prep_lists(struct intel_guc *guc)
return PAGE_ALIGN(total_size);
  }
  
-/* Wa_14019159160 */

-static u32 guc_waklv_ra_mode(struct intel_guc *guc, u32 offset, u32 remain)
+static void guc_waklv_enable_simple(struct intel_guc *guc, u32 *offset, u32 
*remain, u32 klv_id)
  {
u32 size;
u32 klv_entry[] = {
/* 16:16 key/length */
-   FIELD_PREP(GUC_KLV_0_KEY, 
GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE) |
+   FIELD_PREP(GUC_KLV_0_KEY, klv_id) |
FIELD_PREP(GUC_KLV_0_LEN, 0),
/* 0 dwords data */
};
  
  	size = sizeof(klv_entry);

-   GEM_BUG_ON(remain < size);
+   GEM_BUG_ON(*remain < size);
  
-	iosys_map_memcpy_to(&guc->ads_map, offset, klv_entry, size);

+   iosys_map_memcpy_to(&guc->ads_map, *offset, klv_entry, size);
  
-	return size;

+   *offset += size;
+   *remain -= size;
  }
  
  static void guc_waklv_init(struct intel_guc *guc)

@@ -850,10 +850,11 @@ static void guc_waklv_init(struct intel_guc *guc)
remain = guc_ads_waklv_size(guc);
  
  	/* Wa_14019159160 */

-   if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) {
-   size = guc_waklv_ra_mode(guc, offset, remain);
-   offset += size;
-   remain -= size;
+   if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || 
IS_DG2(gt->i915)) {
+   guc_waklv_enable_simple(guc, &offset, &remain,
+   GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE);
+   guc_waklv_enable_simple(guc, &offset, &remain,
+   
GUC_WORKAROUND_KLV_AVOID_GFX_CLEAR_WHILE_ACTIVE);
}
  
  	size = guc_ads_waklv_size(guc) - remain;

--
2.43.0





Re: [PATCH v4 1/3] drm/i915/gt: Disable HW load balancing for CCS

2024-03-07 Thread John Harrison

On 3/7/2024 12:02, Andi Shyti wrote:

Hi Matt,

On Wed, Mar 06, 2024 at 03:46:09PM -0800, Matt Roper wrote:

On Wed, Mar 06, 2024 at 02:22:45AM +0100, Andi Shyti wrote:

The hardware should not dynamically balance the load between CCS
engines. Wa_14019159160 recommends disabling it across all
platforms.

Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Matt Roper 
Cc:  # v6.2+
---
  drivers/gpu/drm/i915/gt/intel_gt_regs.h | 1 +
  drivers/gpu/drm/i915/gt/intel_workarounds.c | 5 +
  2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h 
b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index 50962cfd1353..cf709f6c05ae 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1478,6 +1478,7 @@
  
  #define GEN12_RCU_MODE_MMIO(0x14800)

  #define   GEN12_RCU_MODE_CCS_ENABLE   REG_BIT(0)
+#define   XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE   REG_BIT(1)
  
  #define CHV_FUSE_GT_MMIO(VLV_GUNIT_BASE + 0x2168)

  #define   CHV_FGT_DISABLE_SS0 (1 << 10)
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c 
b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index d67d44611c28..a2e78cf0b5f5 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -2945,6 +2945,11 @@ general_render_compute_wa_init(struct intel_engine_cs 
*engine, struct i915_wa_li
  
  		/* Wa_18028616096 */

wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, 
UGM_FRAGMENT_THRESHOLD_TO_3);
+
+   /*
+* Wa_14019159160: disable the automatic CCS load balancing

I'm still a bit concerned that this doesn't really match what this
specific workaround is asking us to do.  There seems to be an agreement
on various internal email threads that we need to disable load
balancing, but there's no single specific workaround that officially
documents that decision.

This specific workaround asks us to do a bunch of different things, and
the third item it asks for is to disable load balancing in very specific
cases (i.e., while the RCS is active at the same time as one or more CCS
engines).  Taking this workaround in isolation, it would be valid to
keep load balancing active if you were just using the CCS engines and
leaving the RCS idle, or if balancing was turned on/off by the GuC
scheduler according to engine use at the moment, as the documented
workaround seems to assume will be the case.

So in general I think we do need to disable load balancing based on
other offline discussion, but blaming that entire change on
Wa_14019159160 seems a bit questionable since it's not really what this
specific workaround is asking us to do and someone may come back and try
to "correct" the implementation of this workaround in the future without
realizing there are other factors too.  It would be great if we could
get hardware teams to properly document this expectation somewhere
(either in a separate dedicated workaround, or in the MMIO tuning guide)
so that we'll have a more direct and authoritative source for such a
large behavioral change.

On one hand I think you are right, on the other hand I think this
workaround has not properly developed in what we have been
describing later.
I think it is not so much that the w/a is 'not properly developed'. It's 
more that this w/a plus others when taken in combination plus knowledge 
of future directions has led to an architectural decision that is beyond 
the scope of the w/a.


As such, I think Matt is definitely correct. Tagging a code change with 
a w/a number when that change does something very different to what is 
described in the w/a is wrong and a maintenance issue waiting to happen.


At the very least, you should just put in a comment explaining the 
situation. E.g.:


 /*
 * Wa_14019159160: This w/a plus others cause significant issues with the use of
 * load balancing. Hence an architectural level decision was taken to simply
 * disable automatic CCS load balancing completely.
 */

Ideally yes, we would get an officially trackable software only 
workaround number or something created and just use that. But in the 
meantime, just clearly explaining the situation seems reasonable to me.


John.




Perhaps, one solution would be to create a new generic workaround
for all platforms with more than one CCS and put everyone at
peace. But I don't know the process.

Are you able to help here? Or Joonas?

Thanks, Matt!
Andi




Re: GuC issue

2024-02-27 Thread John Harrison

On 2/26/2024 08:30, mak...@wezdecki.pl wrote:

Hello,

Thank you for your help.

Is there a possibility to load GuC, then "unload" it and load it again without 
cold reset?
You need to reset the GuC at least - bit 3 of GDRST. The GuC cannot be 
reloaded 'live'. It must be put into reset first. The last line of 
__uc_sanitize() is a call to reset the GuC, so yes that would be an 
option. Note that fini is more about cleaning up to unload the driver. 
Whereas the sanitise functions are about resets with the potential to 
restart again.


John.



By loading I mean HuC firmware upload, GuC ADS/log init, GuC firmware upload, 
CT init, HuC authentication by GuC.

I'm asking because I need to perform severe testing on the target for safety 
purposes without GPU cold reset.
What should be done in order to "unload" the GuC? Is it __uc_sanitize() and 
__uc_fini()?

Maksym

czwartek, 22 lutego 2024 20:31, Harrison, John C  
napisał(a):



Hello,

That worked better. The complaint is that the engine mapping table is invalid. 
See the i915 code in guc_mapping_table_init () in gt/uc/intel_guc_ads.c for an 
example of how to initialise the table.

John.


-Original Message-
From: mak...@wezdecki.pl mak...@wezdecki.pl

Sent: Wednesday, February 21, 2024 07:15
To: Harrison, John C john.c.harri...@intel.com

Cc: mak...@wezdecki.pl; Wajdeczko, Michal michal.wajdec...@intel.com; 
intel-gfx@lists.freedesktop.org

Subject: Re: GuC issue

Ah, I dumped them with Windows new line characters.

Here is a new log binary dump.

I moved to the newest TGL GuC firmware from linux-firmware repo.


środa, 21 lutego 2024 12:16 AM, John Harrison john.c.harri...@intel.com 
napisał(a):


Hello,

Something is very corrupted with that GuC log. The log consists of a
header page and then a stream of log entry structures. The structure
is supposed to be 20 bytes long and starts with a four byte time
stamp. But I am seeing what is conceivably a 32bit timestamp appearing
at 21 byte increments through the log. Even more curiously, the time
stamp seems to have an 0x0D, 0x0A after it. Are you doing any printf
type operation in order to write the log out from memory to disk?

INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID means that the GuC did not
like the initialisation data passed in. Most likely, something in the
ADS structure is not valid. If you try with the latest GuC version,
that might give you more information as to what is incorrect. More
status codes have been added since 70.1.1.

John.

On 2/20/2024 05:03, mak...@wezdecki.pl wrote:


Hi,

Please see GuC log attached to this email.

Log size is "PAGE_SIZE+Debug Log(64KB) + Crash Log (8KB) + Capture Log (1M)"

Can anybody from Intel decode this log buffer? Thanks.

What am I doing wrong?

Maksym

poniedziałek, 19 lutego 2024 09:44, mak...@wezdecki.pl mak...@wezdecki.pl 
napisał(a):


Hi,

I fixed one issue in my driver. Log address was set incorrectly.

Right now, after GuC uploading, GUC_STATUS changed.
Right now, intel_guc_load_status is INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID = 
0x71.

What does it mean?
Could you please help me with this?

Thanks,
Maksym

piątek, 9 lutego 2024 08:42, natur.prod...@pm.me natur.prod...@pm.me napisał(a):


Hello,

Please see my comments below.

piątek, 9 lutego 2024 2:45 AM, John Harrison john.c.harri...@intel.com 
napisał(a):


Hello,

What platform is this on? And which GuC firmware version are you using?
It's TGL. I'm using tgl_guc_70.1.1.bin firmware blob.
One thing you may need to do is force maximum GT frequency
during GuC load. That is something the i915 driver does. If
the system decides the GPU is idle and drops the frequency to
minimum then it can take multiple seconds for the GuC initialisation to 
complete.
Thanks for the hint. I'm not doing that at all in my code. How am I supposed to 
do this? Is there a specific register for that?
Did the status change at all during that second of waiting? Or
was it still reading LAPIC_DONE?
It's always LAPIC_DONE.
For ADS documentation, I'm afraid that the best we currently
have publicly available is the i915 driver code. If you are
not intending to use GuC submission then most of the ADS can be ignored.
Ok, that's great. Which part of ADS is must-have then?
If you can share the GuC log, that might provide some clues as
to what is happening. For just logging the boot process, you
shouldn't need to allocate a large log. The default size of
i915 for release builds is 64KB. That should be plenty.
I'll collect GuC log as soon as possible. Is it something that can be 
understood without a knowledge of GuC internals? Or is it simply hex dumps?
John.

On 2/6/2024 23:59, natur.prod...@pm.me wrote:


Hi,

I'm currently implementing GuC/HuC firmware support in one Safety Critical OS.
I'm following i915 code and I implemented all paths (I don't want GuC 
submission or SLPC features). I need GuC to authenticate HuC firmware blob.

I mirrored GuC implementation in my code.

After 

Re: [PATCH v3 3/3] drm/i915/guc: Enable Wa_14019159160

2024-02-26 Thread John Harrison

On 2/26/2024 05:25, Nilawar, Badal wrote:

Hi John,

On 04-01-2024 23:35, john.c.harri...@intel.com wrote:

From: John Harrison 

Use the new w/a KLV support to enable a MTL w/a. Note, this w/a is a
super-set of Wa_16019325821, so requires turning that one on as well as
setting the new flag for Wa_14019159160 itself.

Signed-off-by: John Harrison 
Reviewed-by: Vinay Belgaumkar 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c  |  3 ++
  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  1 +
  drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h |  7 
  drivers/gpu/drm/i915/gt/uc/intel_guc.c    |  1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    | 34 ++-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  1 +
  6 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c

index 9cccd60a5c41d..359b21fb02ab2 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -744,6 +744,7 @@ static u32 *gen12_emit_preempt_busywait(struct 
i915_request *rq, u32 *cs)

    /* Wa_14014475959:dg2 */
  /* Wa_16019325821 */
+/* Wa_14019159160 */
  #define HOLD_SWITCHOUT_SEMAPHORE_PPHWSP_OFFSET    0x540
  static u32 hold_switchout_semaphore_offset(struct i915_request *rq)
  {
@@ -753,6 +754,7 @@ static u32 hold_switchout_semaphore_offset(struct 
i915_request *rq)

    /* Wa_14014475959:dg2 */
  /* Wa_16019325821 */
+/* Wa_14019159160 */
  static u32 *hold_switchout_emit_wa_busywait(struct i915_request 
*rq, u32 *cs)

  {
  int i;
@@ -793,6 +795,7 @@ gen12_emit_fini_breadcrumb_tail(struct 
i915_request *rq, u32 *cs)

    /* Wa_14014475959:dg2 */
  /* Wa_16019325821 */
+    /* Wa_14019159160 */
  if (intel_engine_uses_wa_hold_switchout(rq->engine))
  cs = hold_switchout_emit_wa_busywait(rq, cs);
  diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h

index b519812ba120d..ba55c059063db 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -697,6 +697,7 @@ intel_engine_has_relative_mmio(const struct 
intel_engine_cs * const engine)

    /* Wa_14014475959:dg2 */
  /* Wa_16019325821 */
+/* Wa_14019159160 */
  static inline bool
  intel_engine_uses_wa_hold_switchout(struct intel_engine_cs *engine)
  {
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h

index 58012edd4eb0e..bebf28e3c4794 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_klvs_abi.h
@@ -101,4 +101,11 @@ enum {
  GUC_CONTEXT_POLICIES_KLV_NUM_IDS = 5,
  };
  +/*
+ * Workaround keys:
+ */
+enum {
+    GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE    = 0x9001,
+};
+
  #endif /* _ABI_GUC_KLVS_ABI_H */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c

index d5c856be31491..db3cb628f40dc 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -295,6 +295,7 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc)
  flags |= GUC_WA_HOLD_CCS_SWITCHOUT;
    /* Wa_16019325821 */
+    /* Wa_14019159160 */
  if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)))
  flags |= GUC_WA_RCS_CCS_SWITCHOUT;
  diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c

index 6af3fa8b92e34..68d9e277eca8b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -815,6 +815,25 @@ guc_capture_prep_lists(struct intel_guc *guc)
  return PAGE_ALIGN(total_size);
  }
  +/* Wa_14019159160 */
+static u32 guc_waklv_ra_mode(struct intel_guc *guc, u32 offset, u32 
remain)

+{

How about making this function generic by passing KLV id as arg?
At this point, there is only one KLV supported. So there is no advantage 
to making the code more complex.


The next patch in the series (not yet posted because this one was not 
supposed to be taking so long to get through CI and merged!) adds 
support for another KLV which is similarly zero length. At that point, 
the helper function is updated to become more generic.


John.


+    u32 size;
+    u32 klv_entry[] = {
+    /* 16:16 key/length */
+    FIELD_PREP(GUC_KLV_0_KEY, 
GUC_WORKAROUND_KLV_SERIALIZED_RA_MODE) |

+    FIELD_PREP(GUC_KLV_0_LEN, 0),
+    /* 0 dwords data */
+    };
+
+    size = sizeof(klv_entry);
+    GEM_BUG_ON(remain < size);
+
+    iosys_map_memcpy_to(>ads_map, offset, klv_entry, size);
Otherwise preparing and adding klv entry can be wrapped in generic 
function.


Regards,
Badal

+
+    return size;
+}
+
  static void guc_waklv_init(struct intel_guc *guc)
  {
  struct intel_gt *gt = guc_to_gt(guc);
@@ -830,15 +849,12 @@ static void guc_waklv_init(struct intel_guc *guc)
  offset = guc_ads_waklv_offset(guc);
   

Re: GuC issue

2024-02-20 Thread John Harrison

Hello,

Something is very corrupted with that GuC log. The log consists of a 
header page and then a stream of log entry structures. The structure is 
supposed to be 20 bytes long and starts with a four byte time stamp. But 
I am seeing what is conceivably a 32bit timestamp appearing at 21 byte 
increments through the log. Even more curiously, the time stamp seems to 
have an 0x0D, 0x0A after it. Are you doing any printf type operation in 
order to write the log out from memory to disk?


INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID means that the GuC did not like 
the initialisation data passed in. Most likely, something in the ADS 
structure is not valid. If you try with the latest GuC version, that 
might give you more information as to what is incorrect. More status 
codes have been added since 70.1.1.


John.


On 2/20/2024 05:03, mak...@wezdecki.pl wrote:

Hi,

Please see GuC log attached to this email.

Log size is "PAGE_SIZE+Debug Log(64KB) + Crash Log (8KB) + Capture Log (1M)"

Can anybody from Intel decode this log buffer? Thanks.

What am I doing wrong?

Maksym


poniedziałek, 19 lutego 2024 09:44, mak...@wezdecki.pl  
napisał(a):



Hi,

I fixed one issue in my driver. Log address was set incorrectly.

Right now, after GuC uploading, GUC_STATUS changed.
Right now, intel_guc_load_status is INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID = 
0x71.

What does it mean?
Could you please help me with this?

Thanks,
Maksym



piątek, 9 lutego 2024 08:42, natur.prod...@pm.me natur.prod...@pm.me napisał(a):


Hello,

Please see my comments below.

piątek, 9 lutego 2024 2:45 AM, John Harrison john.c.harri...@intel.com 
napisał(a):


Hello,

What platform is this on? And which GuC firmware version are you using?

It's TGL. I'm using tgl_guc_70.1.1.bin firmware blob.


One thing you may need to do is force maximum GT frequency during GuC
load. That is something the i915 driver does. If the system decides the
GPU is idle and drops the frequency to minimum then it can take multiple
seconds for the GuC initialisation to complete.

Thanks for the hint. I'm not doing that at all in my code. How am I supposed to 
do this? Is there a specific register for that?


Did the status change at all during that second of waiting? Or was it
still reading LAPIC_DONE?

It's always LAPIC_DONE.


For ADS documentation, I'm afraid that the best we currently have
publicly available is the i915 driver code. If you are not intending to
use GuC submission then most of the ADS can be ignored.

Ok, that's great. Which part of ADS is must-have then?


If you can share the GuC log, that might provide some clues as to what
is happening. For just logging the boot process, you shouldn't need to
allocate a large log. The default size of i915 for release builds is
64KB. That should be plenty.

I'll collect GuC log as soon as possible. Is it something that can be 
understood without a knowledge of GuC internals? Or is it simply hex dumps?


John.

On 2/6/2024 23:59, natur.prod...@pm.me wrote:


Hi,

I'm currently implementing GuC/HuC firmware support in one Safety Critical OS.
I'm following i915 code and I implemented all paths (I don't want GuC 
submission or SLPC features). I need GuC to authenticate HuC firmware blob.

I mirrored GuC implementation in my code.

After GuC DMA transfer succeeds, I'm reading GUC_STATUS register.
HW returns INTEL_BOOTROM_STATUS_JUMP_PASSED as bootrom status and 
INTEL_GUC_LOAD_STATUS_LAPIC_DONE as GuC load status.

Unfortunately, after one second of waiting, the status didn't get changed to 
INTEL_GUC_LOAD_STATUS_READY at all.

What is a potential issue here?
Could you please help me?

In addition to this, could you please point out some documentation about GuC's 
ADS struct?

Thanks,
Maksym




Re: [PATCH v3] drm/i915/guc: Simplify/extend platform check for Wa_14018913170

2024-02-20 Thread John Harrison

On 2/19/2024 12:28, Rodrigo Vivi wrote:

On Fri, Feb 16, 2024 at 10:38:41AM -0800, john.c.harri...@intel.com wrote:

From: John Harrison 

The above w/a is required for every platform that the i915 driver
supports. It is fixed on the latest platforms but they are only
supported by Xe instead of i915. So just remove the platform check
completely and keep the code simple.

Well, I was going to say that I would prefer a GMD version greater-than
check to be future proof. However if this code gets used in some other
new platform a new specific guc support would likely need to be added
as well right?
There is no future for i915. That's the point. The only platforms that 
have the hardware fix are all ones that will only ever be supported by 
the Xe driver. So if such a platform were to be backported to i915 then 
there would be a lot more work than just adding a new GuC firmware platform.


And going backwards, the bug affects all platforms that have a GuC. So 
if any GuC code is being executed at all, then this w/a is applicable.




Perhaps at least adding a comment in the code?

Such as this?
    /*
 * Wa_14018913170: Applicable to all platforms supported by i915 so
 * don't bother testing for all X/Y/Z platforms explicitly.
 */

John.




with that
Reviewed-by: Rodrigo Vivi 



Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 2b450c43bbd7f..a3662edb42032 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -321,8 +321,7 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc)
  
  	/* Wa_14018913170 */

if (GUC_FIRMWARE_VER(guc) >= MAKE_GUC_VER(70, 7, 0)) {
-   if (IS_DG2(gt->i915) || IS_METEORLAKE(gt->i915) || 
IS_PONTEVECCHIO(gt->i915))
-   flags |= GUC_WA_ENABLE_TSC_CHECK_ON_RC6;
+   flags |= GUC_WA_ENABLE_TSC_CHECK_ON_RC6;
}
  
  	return flags;

--
2.43.0





Re: [PATCH 2/2] drm/i915/gt: Set default CCS mode '1'

2024-02-15 Thread John Harrison

On 2/15/2024 14:34, Andi Shyti wrote:

Hi John,

On Thu, Feb 15, 2024 at 01:23:24PM -0800, John Harrison wrote:

On 2/15/2024 05:59, Andi Shyti wrote:

Since CCS automatic load balancing is disabled, we will impose a
fixed balancing policy that involves setting all the CCS engines
to work together on the same load.

Simultaneously, the user will see only 1 CCS rather than the
actual number. As of now, this change affects only DG2.

These two paragraphs are mutually exclusive. You can't have four CCS engines
'working together' if only one engine exists. I think you are meaning that
we only export 1 CCS engine and that single engine is configured to control
all the EUs. As opposed to running in 4 CCS engine mode where the EUs are
(dynamically or statically) divided amongst those four engines.

The balancing is done statically. The dynamic balancing is
disabled in patch 1.

The 2 or 4 CCS engines will share the same workload.

But they don't.

In i915, we use 'engine' to refer to a command streamer and all the 
associated hardware. This is distinct from the EUs which sit behind and 
can be driven by one or more command streamers. Saying that multiple 
engines are sharing a workload implies that you are submitting the 
context to multiple command streamers in parallel. I.e. a similar 
process to media frame split where they have a set of LRCA contexts 
bound together which are submitted in parallel to two or more video 
decode engines (VCS0, VCS1, etc.). That is not what is happening here.


Here, you are submitting a single context with a single ring buffer to a 
single engine - CCS0. That engine is configured to own all EUs. Which 
actually means that submitting a compute task to another CCS engine will 
achieve nothing because there are no EUs available to those other 
engines. They will simply hang when waiting for the walker instruction 
to complete.




Because the user won't be able anymore to select the CCS engine
he wants to use, he will see only one CCS.

I think we are saying the same thing using different words :)

But words are important.

John.


I can try in v2 to reword the commit better.

Thanks for looking into this.
Andi


John.


Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Matt Roper 
Cc:  # v6.2+
---
   drivers/gpu/drm/i915/gt/intel_gt.c  | 11 +++
   drivers/gpu/drm/i915/gt/intel_gt_regs.h |  2 ++
   drivers/gpu/drm/i915/i915_drv.h | 17 +
   drivers/gpu/drm/i915/i915_query.c   |  5 +++--
   4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index a425db5ed3a2..e19df4ef47f6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
}
   }
+static void intel_gt_apply_ccs_mode(struct intel_gt *gt)
+{
+   if (!IS_DG2(gt->i915))
+   return;
+
+   intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
+}
+
   int intel_gt_init_hw(struct intel_gt *gt)
   {
struct drm_i915_private *i915 = gt->i915;
@@ -195,6 +203,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
intel_gt_init_swizzling(gt);
+   /* Configure CCS mode */
+   intel_gt_apply_ccs_mode(gt);
+
/*
 * At least 830 can leave some of the unused rings
 * "active" (ie. head != tail) after resume which
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h 
b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index cf709f6c05ae..c148113770ea 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1605,6 +1605,8 @@
   #define   GEN12_VOLTAGE_MASK REG_GENMASK(10, 0)
   #define   GEN12_CAGF_MASKREG_GENMASK(19, 11)
+#define XEHP_CCS_MODE  _MMIO(0x14804)
+
   #define GEN11_GT_INTR_DW(x)  _MMIO(0x190018 + ((x) * 4))
   #define   GEN11_CSME (31)
   #define   GEN12_HECI_2   (30)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e81b3b2858ac..0853ffd3cb8d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -396,6 +396,23 @@ static inline struct intel_gt *to_gt(const struct 
drm_i915_private *i915)
 (engine__); \
 (engine__) = rb_to_uabi_engine(rb_next(&(engine__)->uabi_node)))
+/*
+ * Exclude unavailable engines.
+ *
+ * Only the first CCS engine is utilized due to the disabling of CCS auto load
+ * balancing. As a result, all CCS engines operate collectively, functioning
+ * essentially as a single CCS engine, hence the count of active CCS engines is
+ * considered '1'.
+ * Currently, this applies to platforms with more than one CCS engine,
+ * specifically DG2.
+ */
+#define for

Re: [PATCH 2/2] drm/i915/gt: Set default CCS mode '1'

2024-02-15 Thread John Harrison

On 2/15/2024 05:59, Andi Shyti wrote:

Since CCS automatic load balancing is disabled, we will impose a
fixed balancing policy that involves setting all the CCS engines
to work together on the same load.

Simultaneously, the user will see only 1 CCS rather than the
actual number. As of now, this change affects only DG2.
These two paragraphs are mutually exclusive. You can't have four CCS 
engines 'working together' if only one engine exists. I think you are 
meaning that we only export 1 CCS engine and that single engine is 
configured to control all the EUs. As opposed to running in 4 CCS engine 
mode where the EUs are (dynamically or statically) divided amongst those 
four engines.


John.



Fixes: d2eae8e98d59 ("drm/i915/dg2: Drop force_probe requirement")
Signed-off-by: Andi Shyti 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Matt Roper 
Cc:  # v6.2+
---
  drivers/gpu/drm/i915/gt/intel_gt.c  | 11 +++
  drivers/gpu/drm/i915/gt/intel_gt_regs.h |  2 ++
  drivers/gpu/drm/i915/i915_drv.h | 17 +
  drivers/gpu/drm/i915/i915_query.c   |  5 +++--
  4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index a425db5ed3a2..e19df4ef47f6 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -168,6 +168,14 @@ static void init_unused_rings(struct intel_gt *gt)
}
  }
  
+static void intel_gt_apply_ccs_mode(struct intel_gt *gt)

+{
+   if (!IS_DG2(gt->i915))
+   return;
+
+   intel_uncore_write(gt->uncore, XEHP_CCS_MODE, 0);
+}
+
  int intel_gt_init_hw(struct intel_gt *gt)
  {
struct drm_i915_private *i915 = gt->i915;
@@ -195,6 +203,9 @@ int intel_gt_init_hw(struct intel_gt *gt)
  
  	intel_gt_init_swizzling(gt);
  
+	/* Configure CCS mode */

+   intel_gt_apply_ccs_mode(gt);
+
/*
 * At least 830 can leave some of the unused rings
 * "active" (ie. head != tail) after resume which
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h 
b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index cf709f6c05ae..c148113770ea 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1605,6 +1605,8 @@
  #define   GEN12_VOLTAGE_MASK  REG_GENMASK(10, 0)
  #define   GEN12_CAGF_MASK REG_GENMASK(19, 11)
  
+#define XEHP_CCS_MODE  _MMIO(0x14804)

+
  #define GEN11_GT_INTR_DW(x)   _MMIO(0x190018 + ((x) * 4))
  #define   GEN11_CSME  (31)
  #define   GEN12_HECI_2(30)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e81b3b2858ac..0853ffd3cb8d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -396,6 +396,23 @@ static inline struct intel_gt *to_gt(const struct 
drm_i915_private *i915)
 (engine__); \
 (engine__) = rb_to_uabi_engine(rb_next(&(engine__)->uabi_node)))
  
+/*

+ * Exclude unavailable engines.
+ *
+ * Only the first CCS engine is utilized due to the disabling of CCS auto load
+ * balancing. As a result, all CCS engines operate collectively, functioning
+ * essentially as a single CCS engine, hence the count of active CCS engines is
+ * considered '1'.
+ * Currently, this applies to platforms with more than one CCS engine,
+ * specifically DG2.
+ */
+#define for_each_available_uabi_engine(engine__, i915__) \
+   for_each_uabi_engine(engine__, i915__) \
+   if ((IS_DG2(i915__)) && \
+   ((engine__)->uabi_class == I915_ENGINE_CLASS_COMPUTE) && \
+   ((engine__)->uabi_instance)) { } \
+   else
+
  #define INTEL_INFO(i915)  ((i915)->__info)
  #define RUNTIME_INFO(i915)(&(i915)->__runtime)
  #define DRIVER_CAPS(i915) (&(i915)->caps)
diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index fa3e937ed3f5..2d41bda626a6 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -124,6 +124,7 @@ static int query_geometry_subslices(struct drm_i915_private 
*i915,
return fill_topology_info(sseu, query_item, 
sseu->geometry_subslice_mask);
  }
  
+

  static int
  query_engine_info(struct drm_i915_private *i915,
  struct drm_i915_query_item *query_item)
@@ -140,7 +141,7 @@ query_engine_info(struct drm_i915_private *i915,
if (query_item->flags)
return -EINVAL;
  
-	for_each_uabi_engine(engine, i915)

+   for_each_available_uabi_engine(engine, i915)
num_uabi_engines++;
  
  	len = struct_size(query_ptr, engines, num_uabi_engines);

@@ -155,7 +156,7 @@ query_engine_info(struct drm_i915_private *i915,
  
  	info_ptr = _ptr->engines[0];
  
-	for_each_uabi_engine(engine, i915) {

+   for_each_available_uabi_engine(engine, i915) {

Re: PR for new GuC v70.19.2

2024-02-14 Thread John Harrison

Hello,

Please disregard this pull request if it hasn't already been processed.

We need to send out an update to bump to a newer version.

Thanks,
John.


On 2/1/2024 17:12, john.c.harri...@intel.com wrote:

The following changes since commit 1a9518c73c4b54854c9cd8f416fd3b8f8e3456e7:

   Merge branch 'mlimonci/amd-2024-01-30.2' into 'main' (2024-01-30 15:55:30 
+)

are available in the Git repository at:

   git://anongit.freedesktop.org/drm/drm-firmware guc_70.19.2

for you to fetch changes up to 92c06b3c1b4b93ccd9953825cfd4e6ab56e03f16:

   xe: First GuC release for LNL and Xe (2024-01-30 09:23:50 -0800)


John Harrison (2):
   i915: Add GuC v70.19.2 for ADL-P, DG1, DG2, MTL and TGL
   xe: First GuC release for LNL and Xe

  LICENSE.xe   |  39 +++
  WHENCE   |  20 ++--
  i915/adlp_guc_70.bin | Bin 342848 -> 347264 bytes
  i915/dg1_guc_70.bin  | Bin 272512 -> 321088 bytes
  i915/dg2_guc_70.bin  | Bin 443200 -> 406336 bytes
  i915/mtl_guc_70.bin  | Bin 365376 -> 332608 bytes
  i915/tgl_guc_70.bin  | Bin 330304 -> 334784 bytes
  xe/lnl_guc_70.bin| Bin 0 -> 336704 bytes
  8 files changed, 53 insertions(+), 6 deletions(-)
  create mode 100644 LICENSE.xe
  create mode 100644 xe/lnl_guc_70.bin




Re: GuC issue

2024-02-08 Thread John Harrison

Hello,

What platform is this on? And which GuC firmware version are you using?

One thing you may need to do is force maximum GT frequency during GuC 
load. That is something the i915 driver does. If the system decides the 
GPU is idle and drops the frequency to minimum then it can take multiple 
seconds for the GuC initialisation to complete.


Did the status change at all during that second of waiting? Or was it 
still reading LAPIC_DONE?


For ADS documentation, I'm afraid that the best we currently have 
publicly available is the i915 driver code. If you are not intending to 
use GuC submission then most of the ADS can be ignored.


If you can share the GuC log, that might provide some clues as to what 
is happening. For just logging the boot process, you shouldn't need to 
allocate a large log. The default size of i915 for release builds is 
64KB. That should be plenty.


John.


On 2/6/2024 23:59, natur.prod...@pm.me wrote:

Hi,

I'm currently implementing GuC/HuC firmware support in one Safety Critical OS.
I'm following i915 code and I implemented all paths (I don't want GuC 
submission or SLPC features). I need GuC to authenticate HuC firmware blob.

I mirrored GuC implementation in my code.

After GuC DMA transfer succeeds, I'm reading GUC_STATUS register.
HW returns INTEL_BOOTROM_STATUS_JUMP_PASSED as bootrom status and 
INTEL_GUC_LOAD_STATUS_LAPIC_DONE as GuC load status.

Unfortunately, after one second of waiting, the status didn't get changed to 
INTEL_GUC_LOAD_STATUS_READY at all.

What is a potential issue here?
Could you please help me?

In addition to this, could you please point out some documentation about GuC's 
ADS struct?

Thanks,
Maksym




Re: [RFC] drm/i915: Add GuC submission interface version query

2024-02-08 Thread John Harrison

On 2/8/2024 00:41, Tvrtko Ursulin wrote:

On 07/02/2024 19:34, John Harrison wrote:

On 2/7/2024 10:49, Tvrtko Ursulin wrote:

On 07/02/2024 18:12, John Harrison wrote:

On 2/7/2024 03:56, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Add a new query to the GuC submission interface version.

Mesa intends to use this information to check for old firmware 
versions

with a known bug where using the render and compute command streamers
simultaneously can cause GPU hangs due to issues in firmware scheduling.

Based on patches from Vivaik and Joonas.

There is a little bit of an open around the width required for 
versions.

While the GuC FW iface tells they are u8, i915 GuC code uses u32:

  #define CSS_SW_VERSION_UC_MAJOR   (0xFF << 16)
  #define CSS_SW_VERSION_UC_MINOR   (0xFF << 8)
  #define CSS_SW_VERSION_UC_PATCH   (0xFF << 0)
...
  struct intel_uc_fw_ver {
  u32 major;
  u32 minor;
  u32 patch;
  u32 build;
  };
This is copied from generic code which supports firmwares other 
than GuC. Only GuC promises to use 8-bit version components. Other 
firmwares very definitely do not. There is no open.


Ack.



So we could make the query u8, and refactor the struct 
intel_uc_fw_ver

to use u8, or not. To avoid any doubts on why are we assigning u32 to
u8 I simply opted to use u64. Which avoids the need to add any 
padding

too.

I don't follow how potential 8 vs 32 confusion means jump to 64?!


Suggestion was to use u8 in the uapi in order to align with GuC FW 
ABI (or however it's called), in which case there would be:


   ver.major = guc->submission_version.major;

which would be:

   (u8) = (u32)

And I was anticipating someone not liking that either. Using too 
wide u64 simply avoids the need to add a padding element to the uapi 
struct.


If you are positive we need to include a branch number, even though 
it does not seem to be implemented in the code even(*) then I can 
make uapi 4x u32 and achieve the same.
It's not implemented in the code because we've never had to, and it 
is yet another train wreck waiting to happen. There are a bunch of 
issues at different levels that need to be resolved. But that is all 
in the kernel and/or firmware and so can be added by a later kernel 
update when necessary. However, if the UMDs are not already taking it 
into account or it's not even in the UAPI, then we can't back fill in 
the kernel later, we are just broken.




(*)
static void uc_unpack_css_version(struct intel_uc_fw_ver *ver, u32 
css_value)

{
/* Get version numbers from the CSS header */
ver->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, css_value);
ver->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, css_value);
ver->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, css_value);
}

No branch field in the CSS header?

I think there is, it's just not officially implemented yet.



And Why is UMD supposed to reject a non-zero branch? Like how would 
1.1.3.0 be fine and 1.1.3.1 be bad? I don't get it. But anyway, I 
can respin if you definitely confirm.

Because that is backwards. The branch number goes at the front.

So, for example (using made up numbers, I don't recall offhand what 
versions we have where) say we currently have 0.1.3.0 in tip and 
0.1.1.0 in the last LTS. We then need to ship a critical security fix 
and back port it to the LTS. Tip becomes 0.1.3.1 but the LTS can't 
become 0.1.1.1 because that version already exists in the history of 
tip and does not contain the fix. So the LTS gets branched to 
1.1.0.0. We then have both branches potentially moving forwards with 
completely independent versioning.


Exactly the same as 5.8.x, 5.9.y, 6.0.z, etc in the Linux kernel 
versioning. You cannot make any assumptions about what might be in 
1.4.5.6 compared to 0.1.2.3. 1.4.5.6 could actually be 0.1.0.3 with a 
stack of security fixes but none of the features, workarounds or bug 
fixes that are in 0.1.2.3.


Hence, if the branch number changes then all bets are off. You have 
to start over and reject anything you do not explicitly know about.


This is why we were saying that exposing version numbers to UMDs 
breaks down horribly as soon as we have to start branching. There is 
no clean or simple way to do this.


Right, thank you, I know we talked about the challenges with version 
numbers in the past and fully agreed. I just did not think the idea is 
to conceptually put the branch number first.


(It is called build btw in the i915 struct if that needs cleanup at 
some point. Or maybe name depends on the firmware type.)
That's different. Some of the firmware files we have do have a build 
number. As I said before, branching isn't really implemented yet because 
we've never had to use it for real. And generally, we don't spend time 
implementing stuff in the KMD that isn't being used. But we definitely 
need to make sure it is present in any relevant UAPIs so that if/when we 
do need to start using i

Re: [RFC] drm/i915: Add GuC submission interface version query

2024-02-07 Thread John Harrison

On 2/7/2024 12:47, Souza, Jose wrote:

On Wed, 2024-02-07 at 11:52 -0800, John Harrison wrote:

On 2/7/2024 11:43, Souza, Jose wrote:

On Wed, 2024-02-07 at 11:34 -0800, John Harrison wrote:

On 2/7/2024 10:49, Tvrtko Ursulin wrote:

On 07/02/2024 18:12, John Harrison wrote:

On 2/7/2024 03:56, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Add a new query to the GuC submission interface version.

Mesa intends to use this information to check for old firmware versions
with a known bug where using the render and compute command streamers
simultaneously can cause GPU hangs due to issues in firmware scheduling.

Based on patches from Vivaik and Joonas.

There is a little bit of an open around the width required for
versions.
While the GuC FW iface tells they are u8, i915 GuC code uses u32:

    #define CSS_SW_VERSION_UC_MAJOR   (0xFF << 16)
    #define CSS_SW_VERSION_UC_MINOR   (0xFF << 8)
    #define CSS_SW_VERSION_UC_PATCH   (0xFF << 0)
...
    struct intel_uc_fw_ver {
    u32 major;
    u32 minor;
    u32 patch;
    u32 build;
    };

This is copied from generic code which supports firmwares other than
GuC. Only GuC promises to use 8-bit version components. Other
firmwares very definitely do not. There is no open.

Ack.


So we could make the query u8, and refactor the struct intel_uc_fw_ver
to use u8, or not. To avoid any doubts on why are we assigning u32 to
u8 I simply opted to use u64. Which avoids the need to add any padding
too.

I don't follow how potential 8 vs 32 confusion means jump to 64?!

Suggestion was to use u8 in the uapi in order to align with GuC FW ABI
(or however it's called), in which case there would be:

     ver.major = guc->submission_version.major;

which would be:

     (u8) = (u32)

And I was anticipating someone not liking that either. Using too wide
u64 simply avoids the need to add a padding element to the uapi struct.

If you are positive we need to include a branch number, even though it
does not seem to be implemented in the code even(*) then I can make
uapi 4x u32 and achieve the same.

It's not implemented in the code because we've never had to, and it is
yet another train wreck waiting to happen. There are a bunch of issues
at different levels that need to be resolved. But that is all in the
kernel and/or firmware and so can be added by a later kernel update when
necessary. However, if the UMDs are not already taking it into account
or it's not even in the UAPI, then we can't back fill in the kernel
later, we are just broken.

This sounds to me like a firmware version for internal testing or for 
pre-production HW, would any branched firmware be released to customers?

See comments below. Branching is about back porting critical fixes to
older releases. I.e. supporting LTS releases. There is absolutely
nothing internal only or testing related about branching.

Just because we haven't had to do so yet doesn't mean we won't need to
do so tomorrow.

John.


(*)
static void uc_unpack_css_version(struct intel_uc_fw_ver *ver, u32
css_value)
{
  /* Get version numbers from the CSS header */
  ver->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, css_value);
  ver->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, css_value);
  ver->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, css_value);
}

No branch field in the CSS header?

I think there is, it's just not officially implemented yet.


And Why is UMD supposed to reject a non-zero branch? Like how would
1.1.3.0 be fine and 1.1.3.1 be bad? I don't get it. But anyway, I can
respin if you definitely confirm.

Because that is backwards. The branch number goes at the front.

So, for example (using made up numbers, I don't recall offhand what
versions we have where) say we currently have 0.1.3.0 in tip and 0.1.1.0
in the last LTS. We then need to ship a critical security fix and back
port it to the LTS. Tip becomes 0.1.3.1 but the LTS can't become 0.1.1.1
because that version already exists in the history of tip and does not
contain the fix. So the LTS gets branched to 1.1.0.0. We then have both
branches potentially moving forwards with completely independent versioning.

Exactly the same as 5.8.x, 5.9.y, 6.0.z, etc in the Linux kernel
versioning. You cannot make any assumptions about what might be in
1.4.5.6 compared to 0.1.2.3. 1.4.5.6 could actually be 0.1.0.3 with a stack
of security fixes but none of the features, workarounds or bug fixes
that are in 0.1.2.3.

Hence, if the branch number changes then all bets are off. You have to
start over and reject anything you do not explicitly know about.

This is why we were saying that exposing version numbers to UMDs breaks
down horribly as soon as we have to start branching. There is no clean
or simple way to do this.

Odd versioning, would expect that fixes on LTS would increase patch version.
You can't. That would create multiple firmware entities with the same 
version number.


E.g. every

Re: [RFC] drm/i915: Add GuC submission interface version query

2024-02-07 Thread John Harrison

On 2/7/2024 11:43, Souza, Jose wrote:

On Wed, 2024-02-07 at 11:34 -0800, John Harrison wrote:

On 2/7/2024 10:49, Tvrtko Ursulin wrote:

On 07/02/2024 18:12, John Harrison wrote:

On 2/7/2024 03:56, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Add a new query to the GuC submission interface version.

Mesa intends to use this information to check for old firmware versions
with a known bug where using the render and compute command streamers
simultaneously can cause GPU hangs due to issues in firmware scheduling.

Based on patches from Vivaik and Joonas.

There is a little bit of an open around the width required for
versions.
While the GuC FW iface tells they are u8, i915 GuC code uses u32:

   #define CSS_SW_VERSION_UC_MAJOR   (0xFF << 16)
   #define CSS_SW_VERSION_UC_MINOR   (0xFF << 8)
   #define CSS_SW_VERSION_UC_PATCH   (0xFF << 0)
...
   struct intel_uc_fw_ver {
   u32 major;
   u32 minor;
   u32 patch;
   u32 build;
   };

This is copied from generic code which supports firmwares other than
GuC. Only GuC promises to use 8-bit version components. Other
firmwares very definitely do not. There is no open.

Ack.


So we could make the query u8, and refactor the struct intel_uc_fw_ver
to use u8, or not. To avoid any doubts on why are we assigning u32 to
u8 I simply opted to use u64. Which avoids the need to add any padding
too.

I don't follow how potential 8 vs 32 confusion means jump to 64?!

Suggestion was to use u8 in the uapi in order to align with GuC FW ABI
(or however it's called), in which case there would be:

    ver.major = guc->submission_version.major;

which would be:

    (u8) = (u32)

And I was anticipating someone not liking that either. Using too wide
u64 simply avoids the need to add a padding element to the uapi struct.

If you are positive we need to include a branch number, even though it
does not seem to be implemented in the code even(*) then I can make
uapi 4x u32 and achieve the same.

It's not implemented in the code because we've never had to, and it is
yet another train wreck waiting to happen. There are a bunch of issues
at different levels that need to be resolved. But that is all in the
kernel and/or firmware and so can be added by a later kernel update when
necessary. However, if the UMDs are not already taking it into account
or it's not even in the UAPI, then we can't back fill in the kernel
later, we are just broken.

This sounds to me like a firmware version for internal testing or for 
pre-production HW, would any branched firmware be released to customers?
See comments below. Branching is about back porting critical fixes to 
older releases. I.e. supporting LTS releases. There is absolutely 
nothing internal only or testing related about branching.


Just because we haven't had to do so yet doesn't mean we won't need to 
do so tomorrow.


John.




(*)
static void uc_unpack_css_version(struct intel_uc_fw_ver *ver, u32
css_value)
{
 /* Get version numbers from the CSS header */
 ver->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, css_value);
 ver->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, css_value);
 ver->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, css_value);
}

No branch field in the CSS header?

I think there is, it's just not officially implemented yet.


And Why is UMD supposed to reject a non-zero branch? Like how would
1.1.3.0 be fine and 1.1.3.1 be bad? I don't get it. But anyway, I can
respin if you definitely confirm.

Because that is backwards. The branch number goes at the front.

So, for example (using made up numbers, I don't recall offhand what
versions we have where) say we currently have 0.1.3.0 in tip and 0.1.1.0
in the last LTS. We then need to ship a critical security fix and back
port it to the LTS. Tip becomes 0.1.3.1 but the LTS can't become 0.1.1.1
because that version already exists in the history of tip and does not
contain the fix. So the LTS gets branched to 1.1.0.0. We then have both
branches potentially moving forwards with completely independent versioning.

Exactly the same as 5.8.x, 5.9.y, 6.0.z, etc in the Linux kernel
versioning. You cannot make any assumptions about what might be in
1.4.5.6 compared to 0.1.2.3. 1.4.5.6 could actually be 0.1.0.3 with a stack
of security fixes but none of the features, workarounds or bug fixes
that are in 0.1.2.3.

Hence, if the branch number changes then all bets are off. You have to
start over and reject anything you do not explicitly know about.

This is why we were saying that exposing version numbers to UMDs breaks
down horribly as soon as we have to start branching. There is no clean
or simple way to do this.

John.



Regards,

Tvrtko


Compile tested only.

Signed-off-by: Tvrtko Ursulin 
Cc: Kenneth Graunke 
Cc: Jose Souza 
Cc: Sagar Ghuge 
Cc: Paulo Zanoni 
Cc: John Harrison 
Cc: Rodrigo Vivi 
Cc: Jani Nikula 
Cc: Tvrtko Ursulin 
Cc: Vivaik Balasubrawmanian 
---
   drivers/

Re: [RFC] drm/i915: Add GuC submission interface version query

2024-02-07 Thread John Harrison

On 2/7/2024 10:49, Tvrtko Ursulin wrote:

On 07/02/2024 18:12, John Harrison wrote:

On 2/7/2024 03:56, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Add a new query to the GuC submission interface version.

Mesa intends to use this information to check for old firmware versions
with a known bug where using the render and compute command streamers
simultaneously can cause GPU hangs due to issues in firmware scheduling.

Based on patches from Vivaik and Joonas.

There is a little bit of an open around the width required for 
versions.

While the GuC FW iface tells they are u8, i915 GuC code uses u32:

  #define CSS_SW_VERSION_UC_MAJOR   (0xFF << 16)
  #define CSS_SW_VERSION_UC_MINOR   (0xFF << 8)
  #define CSS_SW_VERSION_UC_PATCH   (0xFF << 0)
...
  struct intel_uc_fw_ver {
  u32 major;
  u32 minor;
  u32 patch;
  u32 build;
  };
This is copied from generic code which supports firmwares other than 
GuC. Only GuC promises to use 8-bit version components. Other 
firmwares very definitely do not. There is no open.


Ack.



So we could make the query u8, and refactor the struct intel_uc_fw_ver
to use u8, or not. To avoid any doubts on why are we assigning u32 to
u8 I simply opted to use u64. Which avoids the need to add any padding
too.

I don't follow how potential 8 vs 32 confusion means jump to 64?!


Suggestion was to use u8 in the uapi in order to align with GuC FW ABI 
(or however it's called), in which case there would be:


   ver.major = guc->submission_version.major;

which would be:

   (u8) = (u32)

And I was anticipating someone not liking that either. Using too wide 
u64 simply avoids the need to add a padding element to the uapi struct.


If you are positive we need to include a branch number, even though it 
does not seem to be implemented in the code even(*) then I can make 
uapi 4x u32 and achieve the same.
It's not implemented in the code because we've never had to, and it is 
yet another train wreck waiting to happen. There are a bunch of issues 
at different levels that need to be resolved. But that is all in the 
kernel and/or firmware and so can be added by a later kernel update when 
necessary. However, if the UMDs are not already taking it into account 
or it's not even in the UAPI, then we can't back fill in the kernel 
later, we are just broken.




(*)
static void uc_unpack_css_version(struct intel_uc_fw_ver *ver, u32 
css_value)

{
/* Get version numbers from the CSS header */
ver->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, css_value);
ver->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, css_value);
ver->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, css_value);
}

No branch field in the CSS header?

I think there is, it's just not officially implemented yet.



And Why is UMD supposed to reject a non-zero branch? Like how would 
1.1.3.0 be fine and 1.1.3.1 be bad? I don't get it. But anyway, I can 
respin if you definitely confirm.

Because that is backwards. The branch number goes at the front.

So, for example (using made up numbers, I don't recall offhand what 
versions we have where) say we currently have 0.1.3.0 in tip and 0.1.1.0 
in the last LTS. We then need to ship a critical security fix and back 
port it to the LTS. Tip becomes 0.1.3.1 but the LTS can't become 0.1.1.1 
because that version already exists in the history of tip and does not 
contain the fix. So the LTS gets branched to 1.1.0.0. We then have both 
branches potentially moving forwards with completely independent versioning.


Exactly the same as 5.8.x, 5.9.y, 6.0.z, etc in the Linux kernel 
versioning. You cannot make any assumptions about what might be in 
1.4.5.6 compared to 0.1.2.3. 1.4.5.6 could actually be 0.1.0.3 with a stack 
of security fixes but none of the features, workarounds or bug fixes 
that are in 0.1.2.3.


Hence, if the branch number changes then all bets are off. You have to 
start over and reject anything you do not explicitly know about.


This is why we were saying that exposing version numbers to UMDs breaks 
down horribly as soon as we have to start branching. There is no clean 
or simple way to do this.


John.




Regards,

Tvrtko



Compile tested only.

Signed-off-by: Tvrtko Ursulin 
Cc: Kenneth Graunke 
Cc: Jose Souza 
Cc: Sagar Ghuge 
Cc: Paulo Zanoni 
Cc: John Harrison 
Cc: Rodrigo Vivi 
Cc: Jani Nikula 
Cc: Tvrtko Ursulin 
Cc: Vivaik Balasubrawmanian 
---
  drivers/gpu/drm/i915/i915_query.c | 32 
+++

  include/uapi/drm/i915_drm.h   | 11 +++
  2 files changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c

index 00871ef99792..999687f6a3d4 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -551,6 +551,37 @@ static int query_hwconfig_blob(struct 
drm_i915_private *i915,

  return hwconfig->size;
  }
+static int
+query_guc_submission_vers

Re: [RFC] drm/i915: Add GuC submission interface version query

2024-02-07 Thread John Harrison

On 2/7/2024 03:56, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Add a new query to the GuC submission interface version.

Mesa intends to use this information to check for old firmware versions
with a known bug where using the render and compute command streamers
simultaneously can cause GPU hangs due to issues in firmware scheduling.

Based on patches from Vivaik and Joonas.

There is a little bit of an open around the width required for versions.
While the GuC FW iface tells they are u8, i915 GuC code uses u32:

  #define CSS_SW_VERSION_UC_MAJOR   (0xFF << 16)
  #define CSS_SW_VERSION_UC_MINOR   (0xFF << 8)
  #define CSS_SW_VERSION_UC_PATCH   (0xFF << 0)
...
  struct intel_uc_fw_ver {
  u32 major;
  u32 minor;
  u32 patch;
  u32 build;
  };
This is copied from generic code which supports firmwares other than 
GuC. Only GuC promises to use 8-bit version components. Other firmwares 
very definitely do not. There is no open.




So we could make the query u8, and refactor the struct intel_uc_fw_ver
to use u8, or not. To avoid any doubts on why are we assigning u32 to
u8 I simply opted to use u64. Which avoids the need to add any padding
too.

I don't follow how potential 8 vs 32 confusion means jump to 64?!



Compile tested only.

Signed-off-by: Tvrtko Ursulin 
Cc: Kenneth Graunke 
Cc: Jose Souza 
Cc: Sagar Ghuge 
Cc: Paulo Zanoni 
Cc: John Harrison 
Cc: Rodrigo Vivi 
Cc: Jani Nikula 
Cc: Tvrtko Ursulin 
Cc: Vivaik Balasubrawmanian 
---
  drivers/gpu/drm/i915/i915_query.c | 32 +++
  include/uapi/drm/i915_drm.h   | 11 +++
  2 files changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c 
b/drivers/gpu/drm/i915/i915_query.c
index 00871ef99792..999687f6a3d4 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -551,6 +551,37 @@ static int query_hwconfig_blob(struct drm_i915_private 
*i915,
return hwconfig->size;
  }
  
+static int

+query_guc_submission_version(struct drm_i915_private *i915,
+struct drm_i915_query_item *query)
+{
+   struct drm_i915_query_guc_submission_version __user *query_ptr =
+   u64_to_user_ptr(query->data_ptr);
+   struct drm_i915_query_guc_submission_version ver;
+   struct intel_guc *guc = &to_gt(i915)->uc.guc;
+   const size_t size = sizeof(ver);
+   int ret;
+
+   if (!intel_uc_uses_guc_submission(&to_gt(i915)->uc))
+   return -ENODEV;
+
+   ret = copy_query_item(&ver, size, size, query);
+   if (ret != 0)
+   return ret;
+
+   if (ver.major || ver.minor || ver.patch)
+   return -EINVAL;
+
+   ver.major = guc->submission_version.major;
+   ver.minor = guc->submission_version.minor;
+   ver.patch = guc->submission_version.patch;
This needs to include the branch version (currently set to zero) in the 
definition. And the UMD needs to barf if branch comes back as non-zero. 
I.e. there is no guarantee that a branched version will have the w/a + 
fix that they are wanting.


John.



+
+   if (copy_to_user(query_ptr, &ver, size))
+   return -EFAULT;
+
+   return 0;
+}
+
  static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
struct drm_i915_query_item *query_item) 
= {
query_topology_info,
@@ -559,6 +590,7 @@ static int (* const i915_query_funcs[])(struct 
drm_i915_private *dev_priv,
query_memregion_info,
query_hwconfig_blob,
query_geometry_subslices,
+   query_guc_submission_version,
  };
  
  int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 550c496ce76d..d80d9b5e1eda 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3038,6 +3038,7 @@ struct drm_i915_query_item {
 *  - %DRM_I915_QUERY_MEMORY_REGIONS (see struct 
drm_i915_query_memory_regions)
 *  - %DRM_I915_QUERY_HWCONFIG_BLOB (see `GuC HWCONFIG blob uAPI`)
 *  - %DRM_I915_QUERY_GEOMETRY_SUBSLICES (see struct 
drm_i915_query_topology_info)
+*  - %DRM_I915_QUERY_GUC_SUBMISSION_VERSION (see struct 
drm_i915_query_guc_submission_version)
 */
__u64 query_id;
  #define DRM_I915_QUERY_TOPOLOGY_INFO  1
@@ -3046,6 +3047,7 @@ struct drm_i915_query_item {
  #define DRM_I915_QUERY_MEMORY_REGIONS 4
  #define DRM_I915_QUERY_HWCONFIG_BLOB  5
  #define DRM_I915_QUERY_GEOMETRY_SUBSLICES 6
+#define DRM_I915_QUERY_GUC_SUBMISSION_VERSION  7
  /* Must be kept compact -- no holes and well documented */
  
  	/**

@@ -3591,6 +3593,15 @@ struct drm_i915_query_memory_regions {
struct drm_i915_memory_region_info regions[];
  };
  
+/**

+* struct drm_i915_query_guc_submission_version - 

Re: [RFC PATCH] drm/i915: Add GETPARAM for GuC submission version

2024-02-07 Thread John Harrison

On 2/7/2024 03:36, Joonas Lahtinen wrote:

Quoting Tvrtko Ursulin (2024-02-07 10:44:01)

On 06/02/2024 20:51, Souza, Jose wrote:

On Tue, 2024-02-06 at 12:42 -0800, John Harrison wrote:

On 2/6/2024 08:33, Tvrtko Ursulin wrote:

On 01/02/2024 18:25, Souza, Jose wrote:

On Wed, 2024-01-24 at 08:55 +0000, Tvrtko Ursulin wrote:

On 24/01/2024 08:19, Joonas Lahtinen wrote:

Add reporting of the GuC submission/VF interface version via GETPARAM
properties. Mesa intends to use this information to check for old
firmware versions with known bugs before enabling features like async
compute.

There was
https://patchwork.freedesktop.org/patch/560704/?series=124592=1
which does everything in one go so would be my preference.

IMO Joonas version brings less burden to be maintained(no new struct).
But both versions works, please just get into some agreement so we
can move this forward.

So I would really prefer the query. Simplified version would do like
the compile tested only:

Vivaik's patch is definitely preferred. It is much cleaner to make one
single call than having to make four separate calls. It is also
extensible to other firmwares if required. The only blockage against it
was whether it was a good thing to report at all. If that blockage is no
longer valid then we should just merge the patch that has already been
discussed, polished, fixed, etc. rather than starting the whole process
from scratch.

Agreed.

Vivaik can you please rebase and send it again?

Note there was review feedback not addressed so do that too please.
AFAIR incorrect usage of copy item, pad/rsvd/mbz checking and questions
about padding in general. Last is why I proposed a simplified version
which is not future extensible and avoids the need for padding.

Yeah, I don't think there is a point in adding an extensible interface as
we're not going to add further FW version queries. This is only the
submission interface version we're going to expose:
The media team have flip flopped multiple times about whether they need 
a HuC version query.




  * Note that the spec for the CSS header defines this version number
  * as 'vf_version' as it was originally intended for virtualisation.
  * However, it is applicable to native submission as well.

If somebody wants to work on the simplified version like Tvrtko
suggested below, I have no objection. We can also remove the reference
to the VF version even if that's used by the header definition.

But if there are just suggestions but no actual patches floated, then we
should be merging the GETPARAM version with the "VF" word removed.
The original patch was posted to the mailing list many months ago. Why 
do you say 'just suggestions but no patches floated'?





We've already discussed on the topic for some months so doing the
minimal changes to fulfill Mesa requirements should be considered a
priority to avoid further delays.


Regards,

Tvrtko




And note that it is four calls not three. The code below is missing the
branch version number.

Not even kernel uses the 'build' version anywhere. I don't see how there
could be 'build' version for the VF interface version? It's not supposed
to version a concrete firmware build but the API contract implemented by
the build where patch version should already be incremented for each
fix.

So adding the build does not seem appropriate as there is no plan to
extend this API any further.
I did not say "build" version. There is no build version. I said 
"branch" version. And the branch version absolutely becomes important if 
we ever have to release a bug fix to an LTS branch. So it needs to be 
part of the interface from the beginning and the UMDs need to be using 
it from the beginning.


John.




Regards, Joonas


John.


diff --git a/drivers/gpu/drm/i915/i915_query.c
b/drivers/gpu/drm/i915/i915_query.c
index 00871ef99792..999687f6a3d4 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -551,6 +551,37 @@ static int query_hwconfig_blob(struct
drm_i915_private *i915,
      return hwconfig->size;
   }

+static int
+query_guc_submission_version(struct drm_i915_private *i915,
+    struct drm_i915_query_item *query)
+{
+   struct drm_i915_query_guc_submission_version __user *query_ptr =
+ u64_to_user_ptr(query->data_ptr);
+   struct drm_i915_query_guc_submission_version ver;
+   struct intel_guc *guc = &to_gt(i915)->uc.guc;
+   const size_t size = sizeof(ver);
+   int ret;
+
+   if (!intel_uc_uses_guc_submission(&to_gt(i915)->uc))
+   return -ENODEV;
+
+   ret = copy_query_item(&ver, size, size, query);
+   if (ret != 0)
+   return ret;
+
+   if (ver.major || ver.minor || ver.patch)
+   return -EINVAL;
+
+   ver.major = guc->submission_version.major;
+   ver.minor = guc->submission_version.minor;
+   ver.patch = guc->submission_version.

Re: [RFC PATCH] drm/i915: Add GETPARAM for GuC submission version

2024-02-06 Thread John Harrison
e remembers.


Signed-off-by: Joonas Lahtinen 
Cc: Kenneth Graunke 
Cc: Jose Souza 
Cc: Sagar Ghuge 
Cc: Paulo Zanoni 
Cc: John Harrison 
Cc: Rodrigo Vivi 
Cc: Jani Nikula 
Cc: Tvrtko Ursulin 
---
   drivers/gpu/drm/i915/i915_getparam.c | 12 
   include/uapi/drm/i915_drm.h  | 13 +
   2 files changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_getparam.c 
b/drivers/gpu/drm/i915/i915_getparam.c

index 5c3fec63cb4c1..f176372debc54 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -113,6 +113,18 @@ int i915_getparam_ioctl(struct drm_device 
*dev, void *data,

   if (value < 0)
   return value;
   break;
+    case I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR:
+    case I915_PARAM_GUC_SUBMISSION_VERSION_MINOR:
+    case I915_PARAM_GUC_SUBMISSION_VERSION_PATCH:
+    if (!intel_uc_uses_guc_submission(&to_gt(i915)->uc))
+    return -ENODEV;
+    if (param->param == I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR)
+    value = to_gt(i915)->uc.guc.submission_version.major;
+    else if (param->param == 
I915_PARAM_GUC_SUBMISSION_VERSION_MINOR)

+    value = to_gt(i915)->uc.guc.submission_version.minor;
+    else
+    value = to_gt(i915)->uc.guc.submission_version.patch;
+    break;
   case I915_PARAM_MMAP_GTT_VERSION:
   /* Though we've started our numbering from 1, and so 
class all
    * earlier versions as 0, in effect their value is 
undefined as

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index fd4f9574d177a..7d5a47f182542 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -806,6 +806,19 @@ typedef struct drm_i915_irq_wait {
    */
   #define I915_PARAM_PXP_STATUS 58
   +/*
+ * Query for the GuC submission/VF interface version number


What is this VF you speak of? :/

Regards,

Tvrtko


+ *
+ * -ENODEV is returned if GuC submission is not used
+ *
+ * On success, returns the respective GuC submission/VF interface 
major,

+ * minor or patch version as per the requested parameter.
+ *
+ */
+#define I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR 59
+#define I915_PARAM_GUC_SUBMISSION_VERSION_MINOR 60
+#define I915_PARAM_GUC_SUBMISSION_VERSION_PATCH 61
+
   /* Must be kept compact -- no holes and well documented */
      /**






Re: [PATCH] drm/i915/gt: Restart the heartbeat timer when forcing a pulse

2024-01-31 Thread John Harrison

On 1/31/2024 10:48, Janusz Krzysztofik wrote:

Hi John,

On Wednesday, 10 January 2024 22:02:16 CET john.c.harri...@intel.com wrote:

From: John Harrison 

The context persistence code does things like send super high priority
heartbeat pulses to ensure any leaked context can still be pre-empted
and thus isn't a total denial of service but only a minor denial of
service. Unfortunately, it wasn't bothering to restart the heartbeat
worker with a fresh timeout. Thus, if a persistent context happened to
be closed just before the heartbeat was going to go ping anyway then
the forced pulse would get a negligible execution time. And as the
forced pulse is super high priority, the worker thread's next step is
a reset. Which means a potentially innocent system randomly goes boom
when attempting to close a context. So, force a re-schedule of the
worker thread with the appropriate timeout.

I haven't looked too much in heartbeat pulses code before, but I think I can
understand your change.  I've also got a positive opinion from Chris on it.
I can provide my Ack, assuming the pre-merge failure reported by CI is not
related, but could you please comment that failure first and/or ask BUG Filing
to handle it so we have it cleaned up?
Pretty confident the CI failure is unrelated. Not seeing how a change to 
the heartbeat timing of persistence context clean up could cause a HDMI 
test to fail to complete.


However, I was really hoping for a full code review by someone who 
understands this code and would be able to say whether there could be 
unexpected side effects of the change. Or even if there is a better / 
safer way to fix the problem.


@Andi Shyti, you were fingered as being someone who might have such 
knowledge. Can you comment?


Thanks,
John.


Thanks,
Janusz



Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 1a8e2b7db0138..4ae2fa0b61dd4 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -290,6 +290,9 @@ static int __intel_engine_pulse(struct intel_engine_cs 
*engine)
heartbeat_commit(rq, );
GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
  
+	/* Ensure the forced pulse gets a full period to execute */

+   next_heartbeat(engine);
+
return 0;
  }
  









Re: [RFC PATCH] drm/i915: Add GETPARAM for GuC submission version

2024-01-24 Thread John Harrison

On 1/24/2024 00:55, Tvrtko Ursulin wrote:

On 24/01/2024 08:19, Joonas Lahtinen wrote:

Add reporting of the GuC submission/VF interface version via GETPARAM
properties. Mesa intends to use this information to check for old
firmware versions with known bugs before enabling features like async
compute.


There was 
https://patchwork.freedesktop.org/patch/560704/?series=124592=1 
which does everything in one go so would be my preference.

I also think that the original version is a cleaner implementation.



During the time of that patch there was discussion whether firmware 
version or submission version was better. I vaguely remember someone 
raised an issue with the latter. Adding John in case he remembers.
The file version number should not be exposed to UMDs, only the 
submission version. The whole purpose of the submission version is to 
track user facing changes. There was a very, very, very long discussion 
about that to which all parties did eventually agree on using the 
submission version.


The outstanding issues were simply a) whether UMDs should be tracking 
version numbers and all the complications that arise with branching and 
non-linear numbering, b) should it just be exposed as a feature flag 
instead and c) this will prevent hangs in certain specific situations 
but it won't prevent the system running slowly and not using the full 
capabilities of the hardware, for that we need to be making sure that 
distros actually update to a firmware release that is not ancient.






Signed-off-by: Joonas Lahtinen 
Cc: Kenneth Graunke 
Cc: Jose Souza 
Cc: Sagar Ghuge 
Cc: Paulo Zanoni 
Cc: John Harrison 
Cc: Rodrigo Vivi 
Cc: Jani Nikula 
Cc: Tvrtko Ursulin 
---
  drivers/gpu/drm/i915/i915_getparam.c | 12 
  include/uapi/drm/i915_drm.h  | 13 +
  2 files changed, 25 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_getparam.c 
b/drivers/gpu/drm/i915/i915_getparam.c

index 5c3fec63cb4c1..f176372debc54 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -113,6 +113,18 @@ int i915_getparam_ioctl(struct drm_device *dev, 
void *data,

  if (value < 0)
  return value;
  break;
+    case I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR:
+    case I915_PARAM_GUC_SUBMISSION_VERSION_MINOR:
+    case I915_PARAM_GUC_SUBMISSION_VERSION_PATCH:
+    if (!intel_uc_uses_guc_submission(_gt(i915)->uc))
+    return -ENODEV;
+    if (param->param == I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR)
+    value = to_gt(i915)->uc.guc.submission_version.major;
+    else if (param->param == 
I915_PARAM_GUC_SUBMISSION_VERSION_MINOR)

+    value = to_gt(i915)->uc.guc.submission_version.minor;
+    else
+    value = to_gt(i915)->uc.guc.submission_version.patch;
+    break;
  case I915_PARAM_MMAP_GTT_VERSION:
  /* Though we've started our numbering from 1, and so class all
   * earlier versions as 0, in effect their value is 
undefined as

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index fd4f9574d177a..7d5a47f182542 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -806,6 +806,19 @@ typedef struct drm_i915_irq_wait {
   */
  #define I915_PARAM_PXP_STATUS 58
  +/*
+ * Query for the GuC submission/VF interface version number


What is this VF you speak of? :/
Agreed. There is no SRIOV support in i915 so i915 should not be 
mentioning SRIOV specific features.


John.



Regards,

Tvrtko


+ *
+ * -ENODEV is returned if GuC submission is not used
+ *
+ * On success, returns the respective GuC submission/VF interface 
major,

+ * minor or patch version as per the requested parameter.
+ *
+ */
+#define I915_PARAM_GUC_SUBMISSION_VERSION_MAJOR 59
+#define I915_PARAM_GUC_SUBMISSION_VERSION_MINOR 60
+#define I915_PARAM_GUC_SUBMISSION_VERSION_PATCH 61
+
  /* Must be kept compact -- no holes and well documented */
    /**




Re: [PATCH] drm/i915/huc: Allow for very slow HuC loading

2024-01-05 Thread John Harrison

On 1/4/2024 12:34, Daniele Ceraolo Spurio wrote:

On 1/2/2024 2:22 PM, john.c.harri...@intel.com wrote:

From: John Harrison 

A failure to load the HuC is occasionally observed where the cause is
believed to be a low GT frequency leading to very long load times.

So a) increase the timeout so that the user still gets a working
system even in the case of slow load. And b) report the frequency
during the load to see if that is the cause of the slow down.

Also update the similar code on the GuC load to not use uncore->gt
when there is a local gt available. The two should match, but no need
for unnecessary de-referencing.


Since the code is almost identical in both places, I'm 
wondering if it is worth using a common waiter function and pass in a 
function pointer with the waiting logic. The cons of that is that we'd 
have to move to gt-level logging and pass in a tag, so not sure if it 
is worth it overall given that it isn't a lot of code. Maybe we should 
consider it when we implement this on the Xe-driver side?
Yeah, I did think about trying to have something more common but it gets 
messy with all the error reports being specific to the firmware in 
question. But yeah, maybe think some more on it for the Xe version.







Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 10 ++--
  drivers/gpu/drm/i915/gt/uc/intel_huc.c    | 64 ---
  2 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c

index 0f79cb6585182..52332bb143395 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c
@@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
   * in the seconds range. However, there is a limit on how long an
   * individual wait_for() can wait. So wrap it in a loop.
   */
-    before_freq = intel_rps_read_actual_frequency(>gt->rps);
+    before_freq = intel_rps_read_actual_frequency(>rps);
  before = ktime_get();
  for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
  ret = wait_for(guc_load_done(uncore, , ), 
1000);

@@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
  break;
    guc_dbg(guc, "load still in progress, count = %d, freq = 
%dMHz, status = 0x%08X [0x%02X/%02X]\n",
-    count, 
intel_rps_read_actual_frequency(>gt->rps), status,

+    count, intel_rps_read_actual_frequency(>rps), status,
  REG_FIELD_GET(GS_BOOTROM_MASK, status),
  REG_FIELD_GET(GS_UKERNEL_MASK, status));
  }
@@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
  u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
    guc_info(guc, "load failed: status = 0x%08X, time = 
%lldms, freq = %dMHz, ret = %d\n",
- status, delta_ms, 
intel_rps_read_actual_frequency(>gt->rps), ret);
+ status, delta_ms, 
intel_rps_read_actual_frequency(>rps), ret);
  guc_info(guc, "load failed: status: Reset = %d, BootROM = 
0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",

   REG_FIELD_GET(GS_MIA_IN_RESET, status),
   bootrom, ukernel,
@@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
  guc_warn(guc, "excessive init time: %lldms! [status = 
0x%08X, count = %d, ret = %d]\n",

   delta_ms, status, count, ret);
  guc_warn(guc, "excessive init time: [freq = %dMHz, before = 
%dMHz, perf_limit_reasons = 0x%08X]\n",

- intel_rps_read_actual_frequency(>gt->rps), before_freq,
+ intel_rps_read_actual_frequency(>rps), before_freq,
   intel_uncore_read(uncore, 
intel_gt_perf_limit_reasons_reg(gt)));

  } else {
  guc_dbg(guc, "init took %lldms, freq = %dMHz, before = 
%dMHz, status = 0x%08X, count = %d, ret = %d\n",
-    delta_ms, 
intel_rps_read_actual_frequency(>gt->rps),

+    delta_ms, intel_rps_read_actual_frequency(>rps),
  before_freq, status, count, ret);
  }
  diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_huc.c

index ba9e07fc2b577..9ccec7de9628a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
@@ -6,6 +6,7 @@
  #include 
    #include "gt/intel_gt.h"
+#include "gt/intel_rps.h"
  #include "intel_guc_reg.h"
  #include "intel_huc.h"
  #include "intel_huc_print.h"
@@ -447,17 +448,68 @@ static const char *auth_mode_string(struct 
intel_huc *huc,

  return partial ? "clear media" : "all workloads";
  }
  +/*
+ * Use a longer timeout for debug builds so that problems can be 
detected
+ * and analysed. But a shorter timeout for releases so that user's 

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace

2023-12-05 Thread John Harrison

On 12/5/2023 02:39, Nirmoy Das wrote:

Hi John,

On 12/5/2023 10:10 AM, John Harrison wrote:

On 12/5/2023 00:52, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Convert the log to a trace log for debugging without triggering
unnecessary concerns in CI or for end-users during non-fatal scenarios.
I strongly disagree with this change. The hardware spec for the 
RESET_CTL and GDRST registers is that they will self clear within a 
matter of microseconds. If something is so badly wrong with the 
hardware that it can't even manage to reset



This message is for a reset readiness poll timeout, not for a failed 
reset, which doesn't sound so serious if the subsequent attempt 
manages to reset the engine.
Not sure what the distinction is. The reset procedure is poke RESET_CTL 
wait for it to clear, poke GDRST and wait for it to clear. Just because 
step one is failing rather than step 2 does not mean that the reset as a 
whole has not failed.


Note that the purpose of RESET_CTL is to pause a bunch of stuff like the 
command streamers to prevent them from issuing new memory requests while 
the reset is in progress. If it fails, it likely means that a CS is 
refusing to stop. Most probably because it can't reach a stopping point 
because it is stuck waiting on a lost memory request or similar. And the 
point of stopping further memory requests during reset is that if the 
memory channel gets out of sync (because only the GT side is reset 
during a GT reset) then that can result in total system failure. As in 
potentially even the CPU can no longer get to memory if it is an 
integrated platform. So yes, it can be quite a serious failure indeed.





I couldn't get enough details on when it can happen that the HW takes a 
very long time to set the readiness bit.
Is it simply 'taking a long time' or is never clearing at all? If it is 
just that the timeout is too short then the proper fix would be to 
increase the timeout. But if it is taking seconds or longer or just 
never succeeding at all, then something is very bad.





then that is something that very much warrants more than a completely 
silent trace event. It most certainly should be flagged as a failure 
in CI.


Just because the driver will retry does not mean that this is not a 
serious error. And if the first attempt failed, why would a 
subsequent attempt succeed?


The patch is not ignoring the failure. If the subsequent attempt fails 
then driver load will fail or it will be wedged if that happens after 
driver load.
One thing I really hate about our driver is the total lack of 
information when something goes wrong during load. The driver wedges in 
total silence. There are many error paths that have no reporting at all. 
Which means you are left with a totally useless bug report.






Escalating to FLR may have more success, but that is not something 
that i915 currently does.


Do we still need to do FLR if a subsequent engine reset fails?
Assuming that we are talking about modern(ish) platforms, an engine 
reset failure would be hit by GuC rather than i915, but that would be 
escalated to an i915 based full GT reset. Generally speaking though, if 
the engine reset fails the GT reset isn't going to do much better. It 
would fix a dead GuC problem but it can't help with memory related 
issues. If the full GT reset fails then we are out of escalation routes 
as there is no FLR path at present (I think we have that at driver 
unload on MTL but not for general reset?). The FLR resets a lot more 
than just the GT, so it does have a chance to fix some issues that a GT 
reset can't. After driver-level FLR, there is PCI level FLR. Not sure if 
that involves a full power down and restart, but if not then that would 
be the last escalation possible. A power cycle really should fix any 
issues, if it doesn't then it's time to return the system as being 
totally dead!


My recollection is that the vast majority of engine reset failures I've 
looked at have been completely catastrophic and the system only 
recovered after a reboot. I.e. after the card was power cycled. Such 
issues were generally caused by bad memory. Once the path to memory has 
died, there really is not much of the GPU that can do anything at all 
and there isn't much that can be done to recover it.


John.





Regards,

Nirmoy



John.




v2: Improve commit message(Tvrtko)

Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c

index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Convert reset prepare failure log to trace

2023-12-05 Thread John Harrison

On 12/5/2023 00:52, Nirmoy Das wrote:

gen8_engine_reset_prepare() can fail when HW fails to set
RESET_CTL_READY_TO_RESET bit. In some cases this is not fatal
error as driver will retry.

Convert the log to a trace log for debugging without triggering
unnecessary concerns in CI or for end-users during non-fatal scenarios.
I strongly disagree with this change. The hardware spec for the 
RESET_CTL and GDRST registers is that they will self clear within a 
matter of microseconds. If something is so badly wrong with the hardware 
that it can't even manage to reset then that is something that very much 
warrants more than a completely silent trace event. It most certainly 
should be flagged as a failure in CI.


Just because the driver will retry does not mean that this is not a 
serious error. And if the first attempt failed, why would a subsequent 
attempt succeed? Escalating to FLR may have more success, but that is 
not something that i915 currently does.


John.




v2: Improve commit message(Tvrtko)

Cc: Tvrtko Ursulin 
Cc: John Harrison 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5591
Signed-off-by: Nirmoy Das 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
  drivers/gpu/drm/i915/gt/intel_reset.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index d5ed904f355d..e6fbc6202c80 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -593,10 +593,10 @@ static int gen8_engine_reset_prepare(struct 
intel_engine_cs *engine)
ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
   700, 0, NULL);
if (ret)
-   gt_err(engine->gt,
-  "%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",
-  engine->name, request,
-  intel_uncore_read_fw(uncore, reg));
+   GT_TRACE(engine->gt,
+"%s reset request timed out: {request: %08x, RESET_CTL: 
%08x}\n",
+engine->name, request,
+intel_uncore_read_fw(uncore, reg));
  
  	return ret;

  }




Re: [Intel-gfx] [PATCH 1/1] drm/i915/pxp: Add missing tag for Wa_14019159160

2023-11-21 Thread John Harrison

On 11/21/2023 10:55, Alan Previn wrote:

Add missing tag for "Wa_14019159160 - Case 2" (for existing
PXP code that ensures the run-alone mode bit is set to allow
PXP decryption).

Signed-off-by: Alan Previn 
---
  drivers/gpu/drm/i915/gt/intel_lrc.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 7c367ba8d9dc..c758fe4906a5 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -863,8 +863,10 @@ static bool ctx_needs_runalone(const struct intel_context 
*ce)
bool ctx_is_protected = false;
  
  	/*

+* Wa_140191591606 - Case 2: mtl

Too many sixes.


 * On MTL and newer platforms, protected contexts require setting
Probably better to say 'On some platforms'. The current wording implies 
this is an intentional hardware change that will be carried forward 
rather than a bug requiring a workaround which will (hopefully) be fixed 
on some future platform.



-* the LRC run-alone bit or else the encryption will not happen.
+* the LRC run-alone bit or else the encryption/decryption will not 
happen.
+* NOTE: Case 2 only applies to PXP use-case of said workaround.
 */
if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
Likewise, this should only test for the explicit platforms listed in the 
w/a definition rather than assume all future platforms.


John.


(ce->engine->class == COMPUTE_CLASS || ce->engine->class == 
RENDER_CLASS)) {

base-commit: dbdb47c227dc21b7bf98ada039bf42aac4b58b8b




Re: [Intel-gfx] [PATCH] drm/i915/huc: Stop printing about unsupported HuC on MTL

2023-11-13 Thread John Harrison

On 11/13/2023 07:36, Daniele Ceraolo Spurio wrote:

On 11/9/2023 6:06 PM, John Harrison wrote:

On 11/9/2023 15:54, Daniele Ceraolo Spurio wrote:

On MTL, the HuC is only supported on the media GT, so our validation
check on the module parameter detects an inconsistency on the root GT
(the modparams asks to enable HuC, but the support is not there) and
prints the following info message:

[drm] GT0: Incompatible option enable_guc=3 - HuC is not supported!

This can be confusing to the user and make them think that something is
wrong when it isn't, so we need to silence it.
Given that any platform that supports HuC also supports GuC, if a user
tries to enable HuC on a platform that really doesn't support it 
they'll

already see a message about GuC not being supported, so instead of just
silencing the HuC message on newer platforms we can just get rid of it
entirely.
Not following this argument. Someone might attempt to enable HuC only 
and do so on an older platform that supports neither HuC nor GuC. 
There would be no GuC warning because GuC was not requested. But now 
there would also be no HuC warning either.




Enabling HuC also enables GuC loading, because the latter is needed to 
auth the former. The message about GuC not being supported is printed 
for all values of enable_guc that are not zero.


Daniele

This would indeed appear to be the case. So...

Reviewed-by: John Harrison 




John.



Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c

index 27f6561dd731..3872d309ed31 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -106,11 +106,6 @@ static void __confirm_options(struct intel_uc *uc)
  gt_info(gt,  "Incompatible option enable_guc=%d - %s\n",
  i915->params.enable_guc, "GuC is not supported!");
  -    if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC &&
-    !intel_uc_supports_huc(uc))
-    gt_info(gt, "Incompatible option enable_guc=%d - %s\n",
-    i915->params.enable_guc, "HuC is not supported!");
-
  if (i915->params.enable_guc & ENABLE_GUC_SUBMISSION &&
  !intel_uc_supports_guc_submission(uc))
  gt_info(gt, "Incompatible option enable_guc=%d - %s\n",








Re: [Intel-gfx] [PATCH] drm/i915/huc: Stop printing about unsupported HuC on MTL

2023-11-09 Thread John Harrison

On 11/9/2023 15:54, Daniele Ceraolo Spurio wrote:

On MTL, the HuC is only supported on the media GT, so our validation
check on the module parameter detects an inconsistency on the root GT
(the modparams asks to enable HuC, but the support is not there) and
prints the following info message:

[drm] GT0: Incompatible option enable_guc=3 - HuC is not supported!

This can be confusing to the user and make them think that something is
wrong when it isn't, so we need to silence it.
Given that any platform that supports HuC also supports GuC, if a user
tries to enable HuC on a platform that really doesn't support it they'll
already see a message about GuC not being supported, so instead of just
silencing the HuC message on newer platforms we can just get rid of it
entirely.
Not following this argument. Someone might attempt to enable HuC only 
and do so on an older platform that supports neither HuC nor GuC. There 
would be no GuC warning because GuC was not requested. But now there 
would also be no HuC warning either.


John.



Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 27f6561dd731..3872d309ed31 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -106,11 +106,6 @@ static void __confirm_options(struct intel_uc *uc)
gt_info(gt,  "Incompatible option enable_guc=%d - %s\n",
i915->params.enable_guc, "GuC is not supported!");
  
-	if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC &&

-   !intel_uc_supports_huc(uc))
-   gt_info(gt, "Incompatible option enable_guc=%d - %s\n",
-   i915->params.enable_guc, "HuC is not supported!");
-
if (i915->params.enable_guc & ENABLE_GUC_SUBMISSION &&
!intel_uc_supports_guc_submission(uc))
gt_info(gt, "Incompatible option enable_guc=%d - %s\n",




Re: [Intel-gfx] [PATCH 2/2] drm/i915/guc: Add a selftest for FAST_REQUEST errors

2023-11-09 Thread John Harrison

On 11/9/2023 12:33, Daniele Ceraolo Spurio wrote:

On 11/6/2023 3:59 PM, john.c.harri...@intel.com wrote:

From: John Harrison 

There is a mechanism for reporting errors from fire and forget H2G
messages. This is the only way to find out about almost any error in
the GuC backend submission path. So it would be useful to know that it
is working.

Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc.h    |   4 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |   9 ++
  drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 122 ++
  3 files changed, 135 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h

index 2b6dfe62c8f2a..e22c12ce245ad 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -297,6 +297,10 @@ struct intel_guc {
   * @number_guc_id_stolen: The number of guc_ids that have been 
stolen

   */
  int number_guc_id_stolen;
+    /**
+ * @fast_response_selftest: Backdoor to CT handler for fast 
response selftest

+ */
+    u32 fast_response_selftest;
  #endif
  };
  diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c

index 89e314b3756bb..9d958afb78b7f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -1076,6 +1076,15 @@ static int ct_handle_response(struct 
intel_guc_ct *ct, struct ct_incoming_msg *r

  found = true;
  break;
  }
+
+#ifdef CONFIG_DRM_I915_SELFTEST
+    if (!found && ct_to_guc(ct)->fast_response_selftest) {
+    CT_DEBUG(ct, "Assuming unsolicited response due to 
FAST_REQUEST selftest\n");

+    ct_to_guc(ct)->fast_response_selftest++;
+    found = 1;


found = true ? it's the same thing, but it's cleaner to assign boolean 
values to bool variables

Doh.




+    }
+#endif
+
  if (!found) {
  CT_ERROR(ct, "Unsolicited response message: len %u, data 
%#x (fence %u, last %u)\n",

   len, hxg[0], fence, ct->requests.last_fence);
diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c

index bfb72143566f6..97fbbb396336c 100644
--- a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
@@ -286,11 +286,133 @@ static int intel_guc_steal_guc_ids(void *arg)
  return ret;
  }
  +/*
+ * Send a context schedule H2G message with an invalid context id.
+ * This should generate a GUC_RESULT_INVALID_CONTEXT response.
+ */
+static int bad_h2g(struct intel_guc *guc)
+{
+    u32 action[3], len = 0;


AFAICS This is a 2 DW command, so you can use action[2].

Yup. Copy and paste bug.




+
+    action[len++] = INTEL_GUC_ACTION_SCHED_CONTEXT;
+    action[len++] = 0x12345678;
+
+    return intel_guc_send_nb(guc, action, len, 0);
+}
+
+/*
+ * Set a spinner running to make sure the system is alive and active,
+ * then send a bad but asynchronous H2G command and wait to see if an
+ * error response is returned. If no response is received or if the
+ * spinner dies then the test will fail.
+ */
+#define FAST_RESPONSE_TIMEOUT_MS    1000
+static int intel_guc_fast_request(void *arg)
+{
+    struct intel_gt *gt = arg;
+    struct intel_context *ce;
+    struct igt_spinner spin;
+    struct i915_request *rq;
+    intel_wakeref_t wakeref;
+    struct intel_engine_cs *engine = 
intel_selftest_find_any_engine(gt);

+    ktime_t before, now, delta;
+    bool spinning = false;
+    u64 delta_ms;
+    int ret = 0;
+
+    if (!engine)
+    return 0;
+
+    wakeref = intel_runtime_pm_get(gt->uncore->rpm);
+
+    ce = intel_context_create(engine);
+    if (IS_ERR(ce)) {
+    ret = PTR_ERR(ce);
+    gt_err(gt, "Failed to create spinner request: %pe\n", ce);
+    goto err_pm;
+    }
+
+    ret = igt_spinner_init(, engine->gt);
+    if (ret) {
+    gt_err(gt, "Failed to create spinner: %pe\n", ERR_PTR(ret));
+    goto err_pm;
+    }
+    spinning = true;
+
+    rq = igt_spinner_create_request(, ce, MI_ARB_CHECK);
+    intel_context_put(ce);
+    if (IS_ERR(rq)) {
+    ret = PTR_ERR(rq);
+    gt_err(gt, "Failed to create spinner request: %pe\n", rq);
+    goto err_spin;
+    }
+
+    ret = request_add_spin(rq, );
+    if (ret) {
+    gt_err(gt, "Failed to add Spinner request: %pe\n", 
ERR_PTR(ret));

+    goto err_rq;
+    }
+
+    gt->uc.guc.fast_response_selftest = 1;
+
+    ret = bad_h2g(>uc.guc);
+    if (ret) {
+    gt_err(gt, "Failed to send H2G: %pe\n", ERR_PTR(ret));
+    goto err_rq;
+    }
+
+    before = ktime_get();
+    while (gt->uc.guc.fast_response_selftest == 1) {
+    ret = i915_request_wait(rq, 0, 1);
+    if (ret != -ETIME) {
+    gt_err(gt, "Request wait failed: %pe\n", ERR_PTR(ret));
+    goto err_rq;
+ 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Add support for w/a KLVs

2023-10-27 Thread John Harrison

On 10/6/2023 17:38, Belgaumkar, Vinay wrote:

On 9/15/2023 2:55 PM, john.c.harri...@intel.com wrote:

From: John Harrison 

To prevent running out of bits, new w/a enable flags are being added
via a KLV system instead of a 32 bit flags word.

Signed-off-by: John Harrison 
---
  .../gpu/drm/i915/gt/uc/abi/guc_errors_abi.h   |  1 +
  drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  3 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    | 64 ++-
  drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c |  6 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  5 +-
  5 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_errors_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_errors_abi.h

index dabeaf4f245f3..00d6402333f8e 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_errors_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_errors_abi.h
@@ -36,6 +36,7 @@ enum intel_guc_load_status {
  INTEL_GUC_LOAD_STATUS_INVALID_INIT_DATA_RANGE_START,
  INTEL_GUC_LOAD_STATUS_MPU_DATA_INVALID = 0x73,
  INTEL_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID   = 0x74,
+    INTEL_GUC_LOAD_STATUS_KLV_WORKAROUND_INIT_ERROR    = 0x75,
  INTEL_GUC_LOAD_STATUS_INVALID_INIT_DATA_RANGE_END,
    INTEL_GUC_LOAD_STATUS_READY    = 0xF0,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h

index 6c392bad29c19..3b1fc5f96306b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -186,6 +186,8 @@ struct intel_guc {
  struct guc_mmio_reg *ads_regset;
  /** @ads_golden_ctxt_size: size of the golden contexts in the 
ADS */

  u32 ads_golden_ctxt_size;
+    /** @ads_waklv_size: size of workaround KLVs */
+    u32 ads_waklv_size;
  /** @ads_capture_size: size of register lists in the ADS used 
for error capture */

  u32 ads_capture_size;
  /** @ads_engine_usage_size: size of engine usage in the ADS */
@@ -295,6 +297,7 @@ struct intel_guc {
  #define MAKE_GUC_VER(maj, min, pat)    (((maj) << 16) | ((min) << 
8) | (pat))
  #define MAKE_GUC_VER_STRUCT(ver)    MAKE_GUC_VER((ver).major, 
(ver).minor, (ver).patch)
  #define GUC_SUBMIT_VER(guc) 
MAKE_GUC_VER_STRUCT((guc)->submission_version)
+#define GUC_FIRMWARE_VER(guc) 
MAKE_GUC_VER_STRUCT((guc)->fw.file_selected.ver)
    static inline struct intel_guc *log_to_guc(struct intel_guc_log 
*log)

  {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c

index 63724e17829a7..792910af3a481 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -46,6 +46,10 @@
   *  +---+
   *  | padding   |
   *  +---+ <== 4K aligned
+ *  | w/a KLVs  |
+ *  +---+
+ *  | padding   |
+ *  +---+ <== 4K aligned
   *  | capture lists |
   *  +---+
   *  | padding   |
@@ -88,6 +92,11 @@ static u32 guc_ads_golden_ctxt_size(struct 
intel_guc *guc)

  return PAGE_ALIGN(guc->ads_golden_ctxt_size);
  }
  +static u32 guc_ads_waklv_size(struct intel_guc *guc)
+{
+    return PAGE_ALIGN(guc->ads_waklv_size);
+}
+
  static u32 guc_ads_capture_size(struct intel_guc *guc)
  {
  return PAGE_ALIGN(guc->ads_capture_size);
@@ -113,7 +122,7 @@ static u32 guc_ads_golden_ctxt_offset(struct 
intel_guc *guc)

  return PAGE_ALIGN(offset);
  }
  -static u32 guc_ads_capture_offset(struct intel_guc *guc)
+static u32 guc_ads_waklv_offset(struct intel_guc *guc)
  {
  u32 offset;
  @@ -123,6 +132,16 @@ static u32 guc_ads_capture_offset(struct 
intel_guc *guc)

  return PAGE_ALIGN(offset);
  }
  +static u32 guc_ads_capture_offset(struct intel_guc *guc)
+{
+    u32 offset;
+
+    offset = guc_ads_waklv_offset(guc) +
+ guc_ads_waklv_size(guc);
+
+    return PAGE_ALIGN(offset);
+}
+
  static u32 guc_ads_private_data_offset(struct intel_guc *guc)
  {
  u32 offset;
@@ -791,6 +810,40 @@ guc_capture_prep_lists(struct intel_guc *guc)
  return PAGE_ALIGN(total_size);
  }
  +static void guc_waklv_init(struct intel_guc *guc)
+{
+    struct intel_gt *gt = guc_to_gt(guc);
+    u32 offset, addr_ggtt, remain, size;
+
+    if (!intel_uc_uses_guc_submission(&gt->uc))
+    return;
+
+    if (GUC_FIRMWARE_VER(guc) < MAKE_GUC_VER(70, 10, 0))
+    return;

should this be <= ?
No. GuC 70.10.0 is when w/a KLVs were introduced. So we want to skip on 
any version that is prior to 70.10.0.



+
+    GEM_BUG_ON(iosys_map_is_null(&guc->ads_map));
+    offset = guc_ads_waklv_offset(guc);
+    remain = guc_ads_waklv_size(guc);
+
+    /

Re: [Intel-gfx] [PATCH 2/4] drm/i915: Enable Wa_16019325821

2023-10-27 Thread John Harrison

On 10/6/2023 17:10, Belgaumkar, Vinay wrote:

On 9/15/2023 2:55 PM, john.c.harri...@intel.com wrote:

From: John Harrison 

Some platforms require holding RCS context switches until CCS is idle
(the reverse w/a of Wa_14014475959). Some platforms require both
versions.

Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c  | 19 +++
  drivers/gpu/drm/i915/gt/intel_engine_types.h  |  7 ---
  drivers/gpu/drm/i915/gt/uc/intel_guc.c    |  4 
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  3 ++-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  8 +++-
  5 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c

index 0143445dba830..8b494825c55f2 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -733,21 +733,23 @@ static u32 *gen12_emit_preempt_busywait(struct 
i915_request *rq, u32 *cs)

  }
    /* Wa_14014475959:dg2 */
-#define CCS_SEMAPHORE_PPHWSP_OFFSET    0x540
-static u32 ccs_semaphore_offset(struct i915_request *rq)
+/* Wa_16019325821 */
+#define HOLD_SWITCHOUT_SEMAPHORE_PPHWSP_OFFSET    0x540
+static u32 hold_switchout_semaphore_offset(struct i915_request *rq)
  {
  return i915_ggtt_offset(rq->context->state) +
-    (LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
+    (LRC_PPHWSP_PN * PAGE_SIZE) + 
HOLD_SWITCHOUT_SEMAPHORE_PPHWSP_OFFSET;

  }
    /* Wa_14014475959:dg2 */
-static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
+/* Wa_16019325821 */
+static u32 *hold_switchout_emit_wa_busywait(struct i915_request *rq, 
u32 *cs)

  {
  int i;
    *cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | 
MI_ATOMIC_CS_STALL |

  MI_ATOMIC_MOVE;
-    *cs++ = ccs_semaphore_offset(rq);
+    *cs++ = hold_switchout_semaphore_offset(rq);
  *cs++ = 0;
  *cs++ = 1;
  @@ -763,7 +765,7 @@ static u32 *ccs_emit_wa_busywait(struct 
i915_request *rq, u32 *cs)

  MI_SEMAPHORE_POLL |
  MI_SEMAPHORE_SAD_EQ_SDD;
  *cs++ = 0;
-    *cs++ = ccs_semaphore_offset(rq);
+    *cs++ = hold_switchout_semaphore_offset(rq);
  *cs++ = 0;
    return cs;
@@ -780,8 +782,9 @@ gen12_emit_fini_breadcrumb_tail(struct 
i915_request *rq, u32 *cs)

  cs = gen12_emit_preempt_busywait(rq, cs);
    /* Wa_14014475959:dg2 */
-    if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
-    cs = ccs_emit_wa_busywait(rq, cs);
+    /* Wa_16019325821 */
+    if (intel_engine_uses_wa_hold_switchout(rq->engine))
+    cs = hold_switchout_emit_wa_busywait(rq, cs);
    rq->tail = intel_ring_offset(rq, cs);
  assert_ring_tail_valid(rq->ring, rq->tail);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h

index a7e6775980043..68fe1cef9cd94 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -573,7 +573,7 @@ struct intel_engine_cs {
  #define I915_ENGINE_HAS_RCS_REG_STATE  BIT(9)
  #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
  #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
-#define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
+#define I915_ENGINE_USES_WA_HOLD_SWITCHOUT BIT(12)
  unsigned int flags;
    /*
@@ -683,10 +683,11 @@ intel_engine_has_relative_mmio(const struct 
intel_engine_cs * const engine)

  }
    /* Wa_14014475959:dg2 */
+/* Wa_16019325821 */
  static inline bool
-intel_engine_uses_wa_hold_ccs_switchout(struct intel_engine_cs *engine)
+intel_engine_uses_wa_hold_switchout(struct intel_engine_cs *engine)
  {
-    return engine->flags & I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT;
+    return engine->flags & I915_ENGINE_USES_WA_HOLD_SWITCHOUT;
  }
    #endif /* __INTEL_ENGINE_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c

index 27df41c53b890..4001679ba0793 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -294,6 +294,10 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc)
  IS_DG2(gt->i915))
  flags |= GUC_WA_HOLD_CCS_SWITCHOUT;
  +    /* Wa_16019325821 */
+    if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)))
+    flags |= GUC_WA_RCS_CCS_SWITCHOUT;
+
  /*
   * Wa_14012197797
   * Wa_22011391025
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h

index b4d56eccfb1f0..f97af0168a66b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -95,8 +95,9 @@
  #define   GUC_WA_GAM_CREDITS    BIT(10)
  #define   GUC_WA_DUAL_QUEUE    BIT(11)
  #define   GUC_WA_RCS_RESET_BEFORE_RC6    BIT(13)
-#define   GUC_WA_CONTEXT_ISOLATION    BIT(15)
  #define   GUC_WA_PRE_PARSER    BIT(14)
+#define   GUC_WA_CONTEXT_ISOLATION    BIT(1

Re: [Intel-gfx] [PATCH 3/3] drm/i915/mtl: Add counters for engine busyness ticks

2023-10-19 Thread John Harrison

On 10/19/2023 09:21, Dong, Zhanjun wrote:

See comments inline below.

Zhanjun

On 2023-09-22 6:25 p.m., john.c.harri...@intel.com wrote:

From: Umesh Nerlige Ramappa 

In new version of GuC engine busyness, GuC provides engine busyness
ticks as a 64 bit counter. Add a new counter to relay this value to the
user as is.

Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/intel_engine.h    |  1 +
  drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +
  drivers/gpu/drm/i915/gt/intel_engine_types.h  | 12 
  drivers/gpu/drm/i915/gt/intel_engine_user.c   |  1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++-
  drivers/gpu/drm/i915/i915_pmu.c   | 25 ++-
  drivers/gpu/drm/i915/i915_pmu.h   |  2 +-
  include/uapi/drm/i915_drm.h   | 13 +++-
  8 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h 
b/drivers/gpu/drm/i915/gt/intel_engine.h

index b58c30ac8ef02..57af7ec8ecd82 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct 
list_head *requests,

    ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
 ktime_t *now);
+u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine);
    void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
    struct intel_context **ce, struct i915_request 
**rq);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c

index 84a75c95f3f7d..1c9ffb1ae9889 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct 
intel_engine_cs *engine, ktime_t *now)

  return engine->busyness(engine, now);
  }
  +/**
+ * intel_engine_get_busy_ticks() - Return current accumulated engine 
busyness

+ * ticks
+ * @engine: engine to report on
+ *
+ * Returns accumulated ticks @engine was busy since engine stats 
were enabled.

+ */
+u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine)
+{
+    if (!engine->busyness_ticks ||
+    !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS))
+    return 0;
+
+    return engine->busyness_ticks(engine);
+}
+
  struct intel_context *
  intel_engine_create_virtual(struct intel_engine_cs **siblings,
  unsigned int count, unsigned long flags)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h

index 40fd8f984d64b..a88d40c74d604 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -548,6 +548,11 @@ struct intel_engine_cs {
  ktime_t    (*busyness)(struct intel_engine_cs *engine,
  ktime_t *now);
  +    /*
+ * Get engine busyness ticks
+ */
+    u64    (*busyness_ticks)(struct intel_engine_cs *engine);
+
  struct intel_engine_execlists execlists;
    /*
@@ -574,6 +579,7 @@ struct intel_engine_cs {
  #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
  #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
  #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
+#define I915_ENGINE_SUPPORTS_TICKS_STATS   BIT(13)
  unsigned int flags;
    /*
@@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct 
intel_engine_cs *engine)

  return engine->flags & I915_ENGINE_SUPPORTS_STATS;
  }
  +static inline bool
+intel_engine_supports_tick_stats(const struct intel_engine_cs *engine)
+{
+    return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS;
+}
+
  static inline bool
  intel_engine_has_preemption(const struct intel_engine_cs *engine)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
b/drivers/gpu/drm/i915/gt/intel_engine_user.c

index dcedff41a825f..69eb610b5ab0a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -100,6 +100,7 @@ static void set_scheduler_caps(struct 
drm_i915_private *i915)

  MAP(HAS_PREEMPTION, PREEMPTION),
  MAP(HAS_SEMAPHORES, SEMAPHORES),
  MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS),
+    MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS),
  #undef MAP
  };
  struct intel_engine_cs *engine;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index 0c1fee5360777..71749fb9ad35b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1289,12 +1289,7 @@ static void 
busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now)

  guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo;
  }
  -/*
- * Unlike the execlist mode of submission total and active times are 
in ter

Re: [Intel-gfx] [CI] PR for new GuC v70.13.1

2023-10-18 Thread John Harrison
Apologies, I sent this with the wrong subject. Please ignore. Will 
resend with the correct subject.


John.


On 10/18/2023 12:07, john.c.harri...@intel.com wrote:

The following changes since commit 7727f7e3b3358713c7c91c64a835e80c331a6b8b:

   Merge branch 'patch-1696561325' into 'main' (2023-10-06 03:04:57 +)

are available in the Git repository at:

   git://anongit.freedesktop.org/drm/drm-firmware guc_70.13.1

for you to fetch changes up to 44a9510c94ac0334931b6c89dd240ffe5bf1e5fa:

   i915: Add GuC v70.13.1 for DG2, TGL, ADL-P and MTL (2023-10-13 11:34:26 
-0700)


John Harrison (1):
   i915: Add GuC v70.13.1 for DG2, TGL, ADL-P and MTL

  WHENCE   |   8 
  i915/adlp_guc_70.bin | Bin 297984 -> 342848 bytes
  i915/dg2_guc_70.bin  | Bin 385856 -> 443200 bytes
  i915/mtl_guc_70.bin  | Bin 308032 -> 365376 bytes
  i915/tgl_guc_70.bin  | Bin 285888 -> 330304 bytes
  5 files changed, 4 insertions(+), 4 deletions(-)




Re: [Intel-gfx] [PATCH] drm/i915/mtl: Don't set PIPE_CONTROL_FLUSH_L3

2023-10-16 Thread John Harrison

On 10/16/2023 15:55, Vinay Belgaumkar wrote:

This bit does not cause an explicit L3 flush. We already use
At all? Or only on newer hardware? And as a genuine spec change or as a 
bug / workaround?


If the hardware has re-purposed the bit then it is probably worth at 
least adding a comment to the bit definition to say that it is only 
valid up to IP version 12.70.



PIPE_CONTROL_DC_FLUSH_ENABLE for that purpose.

Cc: Nirmoy Das 
Cc: Mikka Kuoppala 
Signed-off-by: Vinay Belgaumkar 
---
  drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 8 ++--
  1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c 
b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index ba4c2422b340..abbc02f3e66e 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -247,6 +247,7 @@ static int mtl_dummy_pipe_control(struct i915_request *rq)
  int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
  {
struct intel_engine_cs *engine = rq->engine;
+   struct intel_gt *gt = rq->engine->gt;
  
  	/*

 * On Aux CCS platforms the invalidation of the Aux
@@ -278,7 +279,8 @@ int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
 * deals with Protected Memory which is not needed for
 * AUX CCS invalidation and lead to unwanted side effects.
 */
-   if (mode & EMIT_FLUSH)
+   if ((mode & EMIT_FLUSH) &&
+   !(IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))))
Why stop at 12.71? Is the meaning only changed for 12.70 and the 
old/correct version will be restored in later hardware?


John.



bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
  
  		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;

@@ -812,12 +814,14 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request 
*rq, u32 *cs)
u32 flags = (PIPE_CONTROL_CS_STALL |
 PIPE_CONTROL_TLB_INVALIDATE |
 PIPE_CONTROL_TILE_CACHE_FLUSH |
-PIPE_CONTROL_FLUSH_L3 |
 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 PIPE_CONTROL_DC_FLUSH_ENABLE |
 PIPE_CONTROL_FLUSH_ENABLE);
  
+	if (!(IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))))

+   flags |= PIPE_CONTROL_FLUSH_L3;
+
/* Wa_14016712196 */
if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || 
IS_DG2(i915))
/* dummy PIPE_CONTROL + depth flush */




Re: [Intel-gfx] [PATCH v14 0/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-13 Thread John Harrison




On 10/13/2023 10:52, Jonathan Cavitt wrote:

Implement GuC-based TLB invalidations and use them on MTL.

Some complexity in the implementation was introduced early on
and will be required for range-based TLB invalidations.
RFC: https://patchwork.freedesktop.org/series/124922/

v2:
- Add missing supporting patches.

v3:
- Split suspend/resume changes and multi-gt support into separate
   patches.
- Only perform GuC TLB invalidation functions when supported.
- Move intel_guc_is_enabled check function to usage location.
- Address comments.

v4:
- Change conditions for GuC-based tlb invalidation support
   to a pci tag that's only active for MTL.
- Address some FIXMEs and formatting issues.
- Move suspend/resume changes to helper functions in intel_gt.h
- Improve comment for ct_handle_event change.
- Use cleaner if-else conditions.
- Address comments.

v5:
- Reintroduce missing change to selftest msleep duration
- Move suspend/resume loops from intel_gt.h to intel_tlb.c,
   making them no longer static inlines.
- Remove superfluous blocking and error checks.
- Move ct_handle_event exception to general case in
   ct_process_request.
- Explain usage of xa_alloc_cyclic_irq.
- Modify explanation of purpose of
   OUTSTANDING_GUC_TIMEOUT_PERIOD macro.
- Explain purpose of performing tlb invalidation twice in
   intel_gt_tlb_resume_all.

v6:
- Add this cover letter.
- Fix explanation of purpose of
   OUTSTANDING_GUC_TIMEOUT_PERIOD macro again.
- s/pci tags/pci flags
- Enable GuC TLB Invalidations separately from adding the
   flags to do so.

v7:
- Eliminate pci terminology from patches.
- Order new device info flag correctly.
- Run gen8_ggtt_invalidate in more cases, specifically when
   GuC-based TLB invalidation is not supported.
- Use intel_uncore_write_fw instead of intel_uncore_write
   during guc_ggtt_invalidate.
- Remove duplicate request message clear in ct_process_request.
- Remove faulty tag from series.

v8:
- Simplify cover letter contents.
- Fix miscellaneous formatting and typos.
- Reorder device info flags and defines.
- Reword commit message.
- Rename TLB invalidation enums and functions.
- Add comments explaining confusing points.
- Add helper function getting expected delay of CT buffer.
- Simplify intel_guc_tlb_invalidation_done by passing computed
   values.
- Remove helper functions for tlb suspend and resume.
- Move tlb suspend and resume paths to uc.
- Split suspend/resume and wedged into two patches.
- Clarify purpose of sleep change in tlb selftest.

v9:
- Explain complexity of GuC TLB invalidations as required for
   range-based TLB invalidations, which will be platformed later.
- Fix CHECKPATCH issues.
- Explain intel_guc_is_ready tlb invalidation skip in
   intel_gt_invalidate_tlb_full.
- Reword comment for unlocked xa_for_each loop in
   intel_guc_submission_reset.
- Report all errors in init_tlb_lookup.
- Remove debug message from fini_tlb_lookup.
- Use standardized interface for
   intel_guc_tlb_invalidation_done
- Remove spurious changes.
- Move wake_up_all_tlb_invalidate on wedge to correct patch.

v10:
- Add lock to tlb_lookup on guc submission reset.
- Add comment about why timeout increased from 10 ms to 20 ms
   by default in gt_tlb selftest.
- Remove spurious changes.

v11:
- Update CT size delay helper to be clearer.
- Reorder some function declarations.
- Clarify some comments.
- Produce error message if attempting to free a busy wait
   during fini_tlb_lookup.
- Revert default sleep back to 10 ms.
- Link to RFC.

v12:
- Add helper for checking if GuC TLB invalidation is
   supported and guc is ready.
- Prevent suspend/resume actions involving GuC TLB
   invalidations if guc is not ready.
- Add path for INTEL_GUC_ACTION_TLB_INVALIDATION_DONE
   to immediately process in ct_process_request after
   it is submitted to ct_handle_event.

v13:
- Readd error check in intel_guc_tlb_invalidation_done
   for invalid length.
- Remove intel_guc_is_ready requirement from
   wake_up_all_tlb_invalidate.
- Align patches 3 and 4 by adding a check for GuC
   TLB invalidation support to the former that was
   added in the latter.

v14:
- Readd intel_guc_is_ready requirement to
   wake_up_all_tlb_invalidate.
Can you please use 're-add'? It took me some time to realise this wasn't 
a typo for 'read' or 'ready'.



- Move wake_up_all_tlb_invalidate from
   intel_guc_submission_reset to the end of
   __uc_hw_init.
I can see that this change was done. But why? What was the problem with 
the previous version? How does this move fix it? Because an init 
specific function is not the correct place for reset specific code.


John.



Re: [Intel-gfx] [PATCH v14 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-13 Thread John Harrison

On 10/13/2023 10:52, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
Reviewed-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  30 ++-
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  16 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  11 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 183 +-
  drivers/gpu/drm/i915/gt/uc/intel_uc.c |  13 ++
  8 files changed, 298 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..1c93e84278a03 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,36 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
  }
  
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+}
+
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
  
  	gen8_ggtt_invalidate(ggtt);
  
-	if (GRAPHICS_VER(i915) >= 12) {

-   struct intel_gt *gt;
-
-   list_for_each_entry(gt, >gt_list, ggtt_link)
+   list_for_each_entry(gt, >gt_list, ggtt_link) {
+   if (intel_guc_tlb_invalidation_is_available(&gt->uc.guc)) {
+   guc_ggtt_ct_invalidate(gt);
+   } else if (GRAPHICS_VER(i915) >= 12) {
intel_uncore_write_fw(gt->uncore,
  GEN12_GUC_TLB_INV_CR,
  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-   } else {
-   intel_uncore_write_fw(ggtt->vm.gt->uncore,
- GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   } else {
+   intel_uncore_write_fw(gt->uncore,
+ GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   }
}
  }
  
@@ -1243,7 +1257,7 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)

ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
}
  
-	if (intel_uc_wants_guc(&ggtt->vm.gt->uc))

+   if (intel_uc_wants_guc_submission(&ggtt->vm.gt->uc))
ggtt->invalidate = guc_ggtt_invalidate;
else
ggtt->invalidate = gen8_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c 
b/drivers/gpu/drm/i915/gt/intel_tlb.c
index 139608c30d978..4bb13d1890e37 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -12,6 +12,7 @@
  #include "intel_gt_print.h"
  #include "intel_gt_regs.h"
  #include "intel_tlb.h"
+#include "uc/intel_guc.h"
  
  /*

   * HW architecture suggest typical invalidation time at 40us,
@@ -131,11 +132,24 @@ void intel_gt_invalidate_tlb_full(struct intel_gt *gt, 
u32 seqno)
return;
  
  	with_intel_gt_pm_if_awake(gt, wakeref) {

+   struct intel_guc *guc

Re: [Intel-gfx] [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT

2023-10-13 Thread John Harrison

On 10/13/2023 12:12, John Harrison wrote:

On 10/13/2023 07:42, Cavitt, Jonathan wrote:

-Original Message-
From: Harrison, John C 
Sent: Thursday, October 12, 2023 6:08 PM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; 
chris.p.wil...@linux.intel.com; Iddamsetty, Aravind 
; Yang, Fei ; 
Shyti, Andi ; Das, Nirmoy 
; Krzysztofik, Janusz 
; Roper, Matthew D 
; tvrtko.ursu...@linux.intel.com; 
jani.nik...@linux.intel.com
Subject: Re: [PATCH v13 4/7] drm/i915: No TLB invalidation on 
suspended GT

On 10/12/2023 15:38, Jonathan Cavitt wrote:
In case the GT is suspended, don't allow submission of new TLB 
invalidation

request and cancel all pending requests. The TLB entries will be
invalidated either during GuC reload or on system resume.

Signed-off-by: Fei Yang 
Signed-off-by: Jonathan Cavitt 
CC: John Harrison 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
---
   drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  1 +
   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 
---

   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 ++
   3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h

index 0949628d69f8b..2b6dfe62c8f2a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct 
intel_guc *guc);

   int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
   int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
   const u32 *payload, u32 len);
+void wake_up_all_tlb_invalidate(struct intel_guc *guc);
   #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index 1377398afcdfa..3a0d20064878a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct 
intel_context *ce, intel_engine_mask_t st

   intel_context_put(parent);
   }
   -void intel_guc_submission_reset(struct intel_guc *guc, 
intel_engine_mask_t stalled)

+void wake_up_all_tlb_invalidate(struct intel_guc *guc)
   {
   struct intel_guc_tlb_wait *wait;
+    unsigned long i;
+
+    if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {

Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?


I feel like this question has two parts, so I'll answer them separately:

1. Why HAS_GUC_TLB_INVALIDATION and not 
intel_guc_tlb_invalidation_is_available?


Wake_up_all_tlb_invalidate is called during the suspend/resume path, 
specifically in the
middle of suspend.  It's required for it to be called here to clean 
up any invalidations left
in the queue during the suspend/resume phase because they are no 
longer valid requests.
However, the suspend/resume phase also resets GuC, so 
intel_guc_is_ready returns false.
In short, using intel_guc_tlb_invalidation_is_available was causing us to 
skip this code section
incorrectly, resulting in spurious GuC TLB invalidation timeout 
errors during gt reset.
I'm not following this argument. If a reset is occurring then there is 
no need to issue the invalidate. And the previous version was skipping 
if GuC is in reset but this version does not. Which means it is now 
sending invalidate requests to GuC when GuC is not able to respond and 
therefore more likely to cause timeout errors not less likely.
Hang on. I'm getting confused between sending the request and waking up 
blocked threads. Apologies.


Okay, that makes sense now.

John.



Re: [Intel-gfx] [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT

2023-10-13 Thread John Harrison

On 10/13/2023 07:42, Cavitt, Jonathan wrote:

-Original Message-
From: Harrison, John C 
Sent: Thursday, October 12, 2023 6:08 PM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; chris.p.wil...@linux.intel.com; Iddamsetty, Aravind 
; Yang, Fei ; Shyti, Andi ; 
Das, Nirmoy ; Krzysztofik, Janusz ; Roper, Matthew D 
; tvrtko.ursu...@linux.intel.com; jani.nik...@linux.intel.com
Subject: Re: [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT

On 10/12/2023 15:38, Jonathan Cavitt wrote:

In case the GT is suspended, don't allow submission of new TLB invalidation
request and cancel all pending requests. The TLB entries will be
invalidated either during GuC reload or on system resume.

Signed-off-by: Fei Yang 
Signed-off-by: Jonathan Cavitt 
CC: John Harrison 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
---
   drivers/gpu/drm/i915/gt/uc/intel_guc.h|  1 +
   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ---
   drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 ++
   3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 0949628d69f8b..2b6dfe62c8f2a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
   int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
   int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
const u32 *payload, u32 len);
+void wake_up_all_tlb_invalidate(struct intel_guc *guc);
   #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1377398afcdfa..3a0d20064878a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context 
*ce, intel_engine_mask_t st
intel_context_put(parent);
   }
   
-void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)

+void wake_up_all_tlb_invalidate(struct intel_guc *guc)
   {
struct intel_guc_tlb_wait *wait;
+   unsigned long i;
+
+   if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {

Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?


I feel like this question has two parts, so I'll answer them separately:

1. Why HAS_GUC_TLB_INVALIDATION and not intel_guc_tlb_invalidation_is_available?

Wake_up_all_tlb_invalidate is called during the suspend/resume path, 
specifically in the
middle of suspend.  It's required for it to be called here to clean up any 
invalidations left
in the queue during the suspend/resume phase because they are no longer valid 
requests.
However, the suspend/resume phase also resets GuC, so intel_guc_is_ready 
returns false.
In short, using intel_guc_tlb_invalidation_is_available was causing us to skip this 
code section
incorrectly, resulting in spurious GuC TLB invalidation timeout errors during 
gt reset.
I'm not following this argument. If a reset is occurring then there is 
no need to issue the invalidate. And the previous version was skipping 
if GuC is in reset but this version does not. Which means it is now 
sending invalidate requests to GuC when GuC is not able to respond and 
therefore more likely to cause timeout errors not less likely.





2. Why use a positive check to perform and not a negative check to skip?

In patch 3, wake_up_all_tlb_invalidate was originally called universally on all 
platforms
during intel_guc_submission_reset, which is incorrect and not how it was 
reimplemented here.
I discovered this was the case and retroactively corrected it, as seen below.
Because of how intel_guc_submission_reset is structured, a negative check to 
skip wouldn't
make much sense there, so I used a positive check to perform instead.  This is 
a holdover from
that implementation, and was kept to maintain consistency between patches 3 and 
4.  It's
probably not as big of a deal as I'm imagining, but I think it would be awkward 
if the initial
implementation in intel_guc_submission_reset and the reimplementation in
wake_up_all_tlb_invalidate weren't superficially the same, even if they were 
functionally
equivalent otherwise.
I would argue that a bunch of early exit conditions at the start of a 
function is easier to read and maintain than adding nesting levels to the 
entire function.


John.





-Jonathan Cavitt



John.


+   xa_lock_irq(&guc->tlb_lookup);
+   xa_for_each(&guc->tlb_lookup, i, wait)
+   wake_up(&wait->wq);
+   xa_unlock_irq(&guc->tlb_lookup);
+   }
+}
+
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t 
stalled)
+{
struct intel_context *ce;
unsigned long index;
unsi

Re: [Intel-gfx] [PATCH v13 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-13 Thread John Harrison

On 10/13/2023 07:52, Cavitt, Jonathan wrote:

-Original Message-
From: Harrison, John C 
Sent: Thursday, October 12, 2023 6:11 PM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; chris.p.wil...@linux.intel.com; Iddamsetty, Aravind 
; Yang, Fei ; Shyti, Andi ; 
Das, Nirmoy ; Krzysztofik, Janusz ; Roper, Matthew D 
; tvrtko.ursu...@linux.intel.com; jani.nik...@linux.intel.com
Subject: Re: [PATCH v13 3/7] drm/i915: Define and use GuC and CTB TLB 
invalidation routines

On 10/12/2023 15:38, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
Reviewed-by: John Harrison 
---
   drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++-
   drivers/gpu/drm/i915/gt/intel_tlb.c   |  16 +-
   .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 +++
   drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 ++
   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  11 +
   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 195 +-
   7 files changed, 299 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..7d145b2d3cb17 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,37 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
   }
   
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+}
+
   static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
   {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
   
-	gen8_ggtt_invalidate(ggtt);

-
-   if (GRAPHICS_VER(i915) >= 12) {
-   struct intel_gt *gt;
+   if (!HAS_GUC_TLB_INVALIDATION(i915))
+   gen8_ggtt_invalidate(ggtt);

This has not changed? As per comments from Matthew Roper and Nirmoy Das,
there needs to be a fixup patch first to stop gen8_ggtt_invalidate()
from being called on invalid platforms.


Given the sounds of things, it seems like this change here is irrelevant to 
this patch series, as the reason we're
guarding against gen8_ggtt_invalidate isn't related to GuC-based TLB 
invalidations at all.  Ergo, it would actually
make more sense for me to not skip it here and leave the respective guard 
change to a different patch series.
-Jonathan Cavitt
The point was that if this code needs to change then that patch needs to 
happen first. Otherwise there would be merge conflicts when pushing that 
patch to the stable trees.


However, it looks like the change is all happening inside the gen8_ 
function and the intention is to keep calling it even on Gen12+ 
platforms that don't need it. Seems odd but people appear to be happy 
with it. And therefore no conflicts should happen with this patch no 
matter what order they land in.


John.



Re: [Intel-gfx] [PATCH v13 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-12 Thread John Harrison

On 10/12/2023 15:38, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
Reviewed-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++-
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  16 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  11 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 195 +-
  7 files changed, 299 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..7d145b2d3cb17 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,37 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
  }
  
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+}
+
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
  
-	gen8_ggtt_invalidate(ggtt);

-
-   if (GRAPHICS_VER(i915) >= 12) {
-   struct intel_gt *gt;
+   if (!HAS_GUC_TLB_INVALIDATION(i915))
+   gen8_ggtt_invalidate(ggtt);
This has not changed? As per comments from Matthew Roper and Nirmoy Das, 
there needs to be a fixup patch first to stop gen8_ggtt_invalidate() 
from being called on invalid platforms.


  
-		list_for_each_entry(gt, >gt_list, ggtt_link)

+   list_for_each_entry(gt, >gt_list, ggtt_link) {
+   if (intel_guc_tlb_invalidation_is_available(>uc.guc)) {
+   guc_ggtt_ct_invalidate(gt);
+   } else if (GRAPHICS_VER(i915) >= 12) {
intel_uncore_write_fw(gt->uncore,
  GEN12_GUC_TLB_INV_CR,
  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-   } else {
-   intel_uncore_write_fw(ggtt->vm.gt->uncore,
- GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   } else {
+   intel_uncore_write_fw(gt->uncore,
+ GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   }
}
  }
  
@@ -1243,7 +1258,7 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)

ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
}
  
-	if (intel_uc_wants_guc(>vm.gt->uc))

+   if (intel_uc_wants_guc_submission(>vm.gt->uc))
ggtt->invalidate = guc_ggtt_invalidate;
else
ggtt->invalidate = gen8_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c 
b/drivers/gpu/drm/i915/gt/intel_tlb.c
index 139608c30d978..4bb13d1890e37 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -12,6 +12,7 @@
  #include "intel_gt_print.h"
  #include "intel_gt_regs.h"
  #include "intel_tlb.h"
+#include "uc/intel_guc.h"
  
  /*

   * HW architecture suggest typical invalidation t

Re: [Intel-gfx] [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT

2023-10-12 Thread John Harrison

On 10/12/2023 15:38, Jonathan Cavitt wrote:

If the GT is suspended, don't allow submission of new TLB invalidation
requests and cancel all pending requests. The TLB entries will be
invalidated either during GuC reload or on system resume.

Signed-off-by: Fei Yang 
Signed-off-by: Jonathan Cavitt 
CC: John Harrison 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ---
  drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 ++
  3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 0949628d69f8b..2b6dfe62c8f2a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
  int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
  int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
const u32 *payload, u32 len);
+void wake_up_all_tlb_invalidate(struct intel_guc *guc);
  #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1377398afcdfa..3a0d20064878a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context 
*ce, intel_engine_mask_t st
intel_context_put(parent);
  }
  
-void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)

+void wake_up_all_tlb_invalidate(struct intel_guc *guc)
  {
struct intel_guc_tlb_wait *wait;
+   unsigned long i;
+
+   if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {

Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?

John.


+   xa_lock_irq(&guc->tlb_lookup);
+   xa_for_each(&guc->tlb_lookup, i, wait)
+   wake_up(&wait->wq);
+   xa_unlock_irq(&guc->tlb_lookup);
+   }
+}
+
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t 
stalled)
+{
struct intel_context *ce;
unsigned long index;
unsigned long flags;
-   unsigned long i;
  
  	if (unlikely(!guc_submission_initialized(guc))) {

/* Reset called during driver load? GuC not yet initialised! */
@@ -1833,12 +1844,7 @@ void intel_guc_submission_reset(struct intel_guc *guc, 
intel_engine_mask_t stall
 * The full GT reset will have cleared the TLB caches and flushed the
 * G2H message queue; we can release all the blocked waiters.
 */
-   if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
-   xa_lock_irq(&guc->tlb_lookup);
-   xa_for_each(&guc->tlb_lookup, i, wait)
-   wake_up(&wait->wq);
-   xa_unlock_irq(&guc->tlb_lookup);
-   }
+   wake_up_all_tlb_invalidate(guc);
  }
  
  static void guc_cancel_context_requests(struct intel_context *ce)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 98b103375b7ab..27f6561dd7319 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -688,6 +688,8 @@ void intel_uc_suspend(struct intel_uc *uc)
/* flush the GSC worker */
intel_gsc_uc_flush_work(>gsc);
  
+	wake_up_all_tlb_invalidate(guc);

+
if (!intel_guc_is_ready(guc)) {
guc->interrupts.enabled = false;
return;
@@ -736,6 +738,11 @@ static int __uc_resume(struct intel_uc *uc, bool 
enable_communication)
  
  	intel_gsc_uc_resume(>gsc);
  
+	if (intel_guc_tlb_invalidation_is_available(guc)) {

+   intel_guc_invalidate_tlb_engines(guc);
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+
return 0;
  }
  




Re: [Intel-gfx] ✗ Fi.CI.BAT: failure for drm/i915: Define and use GuC and CTB TLB invalidation routines (rev2)

2023-10-12 Thread John Harrison

On 10/12/2023 06:45, Patchwork wrote:

Project List - Patchwork *Patch Details*
*Series:* 	drm/i915: Define and use GuC and CTB TLB invalidation 
routines (rev2)

*URL:*  https://patchwork.freedesktop.org/series/125002/
*State:*failure
*Details:* 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_125002v2/index.html



  CI Bug Log - changes from CI_DRM_13746 -> Patchwork_125002v2


Summary

*FAILURE*

Serious unknown changes coming with Patchwork_125002v2 absolutely need 
to be

verified manually.

If you think the reported changes have nothing to do with the changes
introduced in Patchwork_125002v2, please notify your bug team 
(lgci.bug.fil...@intel.com) to allow them
to document this new failure mode, which will reduce false positives 
in CI.


External URL: 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_125002v2/index.html



Participating hosts (38 -> 38)

Additional (1): fi-bsw-n3050
Missing (1): fi-snb-2520m


Possible new issues

Here are the unknown changes that may have been introduced in 
Patchwork_125002v2:



  IGT changes


Possible regressions

 *

igt@i915_selftest@live@gt_mocs:

  o bat-mtlp-8: PASS


-> DMESG-WARN


+1 other test dmesg-warn
 *

igt@i915_selftest@live@guc:

 o

bat-mtlp-6: PASS


-> DMESG-WARN


+3 other tests dmesg-warn

 o

bat-mtlp-8: NOTRUN -> DMESG-WARN


+1 other test dmesg-warn

These failures all appear to be reset related. However, they are full GT 
reset not individual engine resets. So not sure they can be explained by 
the worker thread vs interrupt handler processing issue of the 
gem_exec_capture failures.


I would definitely recommend running these locally when you have the 
next version of the patch set ready.


John.


Re: [Intel-gfx] [PATCH] drm/i915/guc: Suppress 'ignoring reset notification' message

2023-10-12 Thread John Harrison

On 10/12/2023 03:21, Tvrtko Ursulin wrote:

On 21/09/2023 19:20, john.c.harri...@intel.com wrote:

From: John Harrison 

If an active context has been banned (e.g. Ctrl+C killed) then it is
likely to be reset as part of evicting it from the hardware. That
results in a 'ignoring context reset notification: banned = 1'
message at info level. This confuses/concerns people and makes them
think something has gone wrong when it hasn't.


Noticed the "confuses/concerns people" part while preparing the 6.7 
pull request, and the fact there is no Fixes: tag. Is this something 
that would be worth sending to stable (manually)? If yes, could you 
do that please? Only if there were actual user bugs filed, I guess.


No upstream bugs that I am aware of. There were very occasional 
concerned emails from internal test teams (E2E and such rather than 
kernel) and I think one internal bug was logged about it being seen when 
running some automated user interaction stress test thing (monkey runner 
or similar). So not sure that it is worth the effort of a backport to 
older trees. And you can't really call it a bug with an older patch. The 
message was never an error or even a warning, just an info level.


John.



Regards,

Tvrtko


There is already a debug level message with essentially the same
information. So drop the 'ignore' info level one and just add the
'ignore' flag to the debug level one instead (which will therefore not
appear by default but will still show up in CI runs).

Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 10 +-
  1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index cabdc645fcddb..da7331346df1f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4770,19 +4770,19 @@ static void guc_context_replay(struct 
intel_context *ce)

  static void guc_handle_context_reset(struct intel_guc *guc,
   struct intel_context *ce)
  {
+    bool capture = intel_context_is_schedulable(ce);
+
  trace_intel_context_reset(ce);
  -    guc_dbg(guc, "Got context reset notification: 0x%04X on %s, 
exiting = %s, banned = %s\n",
+    guc_dbg(guc, "%s context reset notification: 0x%04X on %s, 
exiting = %s, banned = %s\n",

+    capture ? "Got" : "Ignoring",
  ce->guc_id.id, ce->engine->name,
  str_yes_no(intel_context_is_exiting(ce)),
  str_yes_no(intel_context_is_banned(ce)));
  -    if (likely(intel_context_is_schedulable(ce))) {
+    if (capture) {
  capture_error_state(guc, ce);
  guc_context_replay(ce);
-    } else {
-    guc_info(guc, "Ignoring context reset notification of 
exiting context 0x%04X on %s",

- ce->guc_id.id, ce->engine->name);
  }
  }




Re: [Intel-gfx] [PATCH v12 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-11 Thread John Harrison

On 10/11/2023 13:52, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
Reviewed-by: Andi Shyti 
Acked-by: Tvrtko Ursulin 
Acked-by: Nirmoy Das 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  33 ++-
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  16 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  11 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 191 +-
  7 files changed, 295 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..7d145b2d3cb17 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,37 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
  }
  
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+}
+
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
  
-	gen8_ggtt_invalidate(ggtt);

-
-   if (GRAPHICS_VER(i915) >= 12) {
-   struct intel_gt *gt;
+   if (!HAS_GUC_TLB_INVALIDATION(i915))
+   gen8_ggtt_invalidate(ggtt);
  
-		list_for_each_entry(gt, >gt_list, ggtt_link)

+   list_for_each_entry(gt, >gt_list, ggtt_link) {
+   if (intel_guc_tlb_invalidation_is_available(>uc.guc)) {
+   guc_ggtt_ct_invalidate(gt);
+   } else if (GRAPHICS_VER(i915) >= 12) {
intel_uncore_write_fw(gt->uncore,
  GEN12_GUC_TLB_INV_CR,
  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-   } else {
-   intel_uncore_write_fw(ggtt->vm.gt->uncore,
- GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   } else {
+   intel_uncore_write_fw(gt->uncore,
+ GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   }
Is the logic here correct for the case of a MTL prior to GuC start / 
during reset?


Specifically, on a device where HAS_ is not set (i.e. not MTL) then all 
TLB invals will call gen8_ggtt_invalidate() followed by a direct poke of 
either GEN8_GTCR or GEN12_GUC_TLB_INV_CR as appropriate. But on MTL 
during GuC downtime, only the register poke happens. The call to 
gen8_ggtt_invalidate() is skipped. Is that correct? Or am I just 
mis-reading the diffs?




}
  }
  
@@ -1243,7 +1258,7 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)

ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
}
  
-	if (intel_uc_wants_guc(>vm.gt->uc))

+   if (intel_uc_wants_guc_submission(>vm.gt->uc))
ggtt->invalidate = guc_ggtt_invalidate;
else
ggtt->invalidate = gen8_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c 
b/drivers/gpu/drm/i915/gt/intel_tlb.c
index 139608c30d978..4bb13d1890e37 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -12,6 +12,7 @@
  #include 

Re: [Intel-gfx] [PATCH v11 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-11 Thread John Harrison

On 10/10/2023 17:02, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
Reviewed-by: Andi Shyti 

Reviewed-by: John Harrison 



Re: [Intel-gfx] [PATCH v11 2/7] drm/i915/guc: Add CT size delay helper

2023-10-11 Thread John Harrison

On 10/10/2023 17:02, Jonathan Cavitt wrote:

Add a helper function to the GuC CT buffer that reports the expected
time to process all outstanding requests.  As of now, there is no
functionality to check number of requests in the buffer, so the helper
function just reports 2 seconds, or 1ms per request up to the maximum
number of requests the CT buffer can store.

This comment is inaccurate.

The buffer is 4K bytes. If it was only 1ms per request then a 2s total 
means 2000 requests in the buffer, or 2 bytes per request. The smallest 
request possible is 2 words or 8 bytes (and that would be a request with 
no data at all). The average request size is more likely 4 words at 
least. Which means only 250 requests per queue and therefore a maximum 
time of 8ms per request to hit a 2s total.


It would be better to simply say "As of now, there is no mechanism for 
tracking a given request's progress through the queue. Instead, add a 
helper that returns an estimated maximum time the queue should take to 
drain if completely full.". The description in the code itself gives the 
full details. No need to repeat all that in the commit message.


With that updated:
Reviewed-by: John Harrison 



Suggested-by: John Harrison 
Signed-off-by: Jonathan Cavitt 
Reviewed-by: Andi Shyti 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 27 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h |  2 ++
  2 files changed, 29 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index c33210ead1ef7..03b616ba4ebb7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -103,6 +103,33 @@ enum { CTB_SEND = 0, CTB_RECV = 1 };
  
  enum { CTB_OWNER_HOST = 0 };
  
+/*

+ * Some H2G commands involve a synchronous response that the driver needs
+ * to wait for. In such cases, a timeout is required to prevent the driver
+ * from waiting forever in the case of an error (either no error response
+ * is defined in the protocol or something has died and requires a reset).
+ * The specific command may be defined as having a time bound response but
+ * the CT is a queue and that time guarantee only starts from the point
+ * when the command reaches the head of the queue and is processed by GuC.
+ *
+ * Ideally there would be a helper to report the progress of a given
+ * command through the CT. However, that would require a significant
+ * amount of work in the CT layer. In the meantime, provide a reasonable
+ * estimation of the worst case latency it should take for the entire
+ * queue to drain. And therefore, how long a caller should wait before
+ * giving up on their request. The current estimate is based on empirical
+ * measurement of a test that fills the buffer with context creation and
+ * destruction requests as they seem to be the slowest operation.
+ */
+long intel_guc_ct_max_queue_time_jiffies(void)
+{
+   /*
+* A 4KB buffer full of context destroy commands takes a little
+* over a second to process so bump that to 2s to be super safe.
+*/
+   return (CTB_H2G_BUFFER_SIZE * HZ) / SZ_2K;
+}
+
  static void ct_receive_tasklet_func(struct tasklet_struct *t);
  static void ct_incoming_request_worker_func(struct work_struct *w);
  
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h

index 58e42901ff498..2c4bb9a941be6 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -104,6 +104,8 @@ struct intel_guc_ct {
  #endif
  };
  
+long intel_guc_ct_max_queue_time_jiffies(void);

+
  void intel_guc_ct_init_early(struct intel_guc_ct *ct);
  int intel_guc_ct_init(struct intel_guc_ct *ct);
  void intel_guc_ct_fini(struct intel_guc_ct *ct);




Re: [Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c

2023-10-11 Thread John Harrison




On 10/11/2023 09:38, Jani Nikula wrote:

Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt
conditional compilation, as i915_gpu_error.c is only built with
DRM_I915_CAPTURE_ERROR=y.

With this, we can also make i915_first_error_state() static.

Signed-off-by: Jani Nikula 
---
  drivers/gpu/drm/i915/i915_gpu_error.c | 75 -
  drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++---
  drivers/gpu/drm/i915/i915_sysfs.c | 79 +--
  3 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index b4c8459deb7b..f9e750217f18 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -57,6 +57,7 @@
  #include "i915_memcpy.h"
  #include "i915_reg.h"
  #include "i915_scatterlist.h"
+#include "i915_sysfs.h"
  #include "i915_utils.h"
  
  #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)

@@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt,
i915_gpu_coredump_put(error);
  }
  
-struct i915_gpu_coredump *

+static struct i915_gpu_coredump *
  i915_first_error_state(struct drm_i915_private *i915)
  {
struct i915_gpu_coredump *error;
@@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct 
drm_i915_private *i915)
debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
_gpu_info_fops);
  }
+
+static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *attr, char *buf,
+   loff_t off, size_t count)
+{
+
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
+   struct i915_gpu_coredump *gpu;
+   ssize_t ret = 0;
+
+   /*
+* FIXME: Concurrent clients triggering resets and reading + clearing
+* dumps can cause inconsistent sysfs reads when a user calls in with a
+* non-zero offset to complete a prior partial read but the
+* gpu_coredump has been cleared or replaced.
+*/
+
+   gpu = i915_first_error_state(i915);
+   if (IS_ERR(gpu)) {
+   ret = PTR_ERR(gpu);
+   } else if (gpu) {
+   ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
+   i915_gpu_coredump_put(gpu);
+   } else {
+   const char *str = "No error state collected\n";
+   size_t len = strlen(str);
+
+   if (off < len) {
+   ret = min_t(size_t, count, len - off);
+   memcpy(buf, str + off, ret);
+   }
+   }
Can this and the debugfs equivalent not be common code? It seems like 
the implementations are conceptually the same even if the code currently 
looks quite different.


John.


+
+   return ret;
+}
+
+static ssize_t error_state_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *attr, char *buf,
+loff_t off, size_t count)
+{
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
+
+   drm_dbg(_priv->drm, "Resetting error state\n");
+   i915_reset_error_state(dev_priv);
+
+   return count;
+}
+
+static const struct bin_attribute error_state_attr = {
+   .attr.name = "error",
+   .attr.mode = S_IRUSR | S_IWUSR,
+   .size = 0,
+   .read = error_state_read,
+   .write = error_state_write,
+};
+
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   if (sysfs_create_bin_file(>kobj, _state_attr))
+   drm_err(>drm, "error_state sysfs setup failed\n");
+}
+
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   sysfs_remove_bin_file(>kobj, _state_attr);
+}
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index a6f2a7518cf0..68c964d6720a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -323,11 +323,12 @@ static inline void i915_gpu_coredump_put(struct 
i915_gpu_coredump *gpu)
kref_put(&gpu->ref, __i915_gpu_coredump_free);
  }
  
-struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);

  void i915_reset_error_state(struct drm_i915_private *i915);
  void i915_disable_error_state(struct drm_i915_private *i915, int err);
  
  void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);

+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915);
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915);
  
  #else
  
@@ -396,12 +397,6 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)

  {
  

Re: [Intel-gfx] [PATCH v10 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-10 Thread John Harrison

On 10/10/2023 15:30, Cavitt, Jonathan wrote:

-Original Message-
From: Harrison, John C 
Sent: Tuesday, October 10, 2023 2:51 PM
To: Cavitt, Jonathan ; 
intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg ; chris.p.wil...@linux.intel.com; Iddamsetty, Aravind 
; Yang, Fei ; Shyti, Andi ; 
Das, Nirmoy ; Krzysztofik, Janusz ; Roper, Matthew D 
; tvrtko.ursu...@linux.intel.com; jani.nik...@linux.intel.com
Subject: Re: [PATCH v10 3/7] drm/i915: Define and use GuC and CTB TLB 
invalidation routines

On 10/10/2023 08:02, Jonathan Cavitt wrote:

...

+static void fini_tlb_lookup(struct intel_guc *guc)
+{
+   struct intel_guc_tlb_wait *wait;
+
+   if (!HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915))
+   return;
+
+   wait = xa_load(&guc->tlb_lookup, guc->serial_slot);
+   kfree(wait);

There was originally an error being printed if wait->busy was still set,
i.e. someone was still waiting on the object that is about to be
destroyed. There were review comments about that being broken in an
intermediate patch set. I don't recall seeing any explanation as to why
the error message should be completely removed.


The GEM_BUG_ON was downgraded to a debug message in an intermediate step
at the request of one of the reviewers (this was a version 8 change, IIRC).
We concluded that if the execution of the system was not impacted by the debug
path, we shouldn't bother with the debug message at all.  So we removed it.
I think it was Fei or Andi that suggested it?
-Jonathan Cavitt
I recall it was me that said it should be an error message rather than a 
BUG_ON. And my point is that I don't see how this is a 'debug path'. If 
a waiter is still waiting on the wait object that is about to be freed 
then that is a potential dangling pointer dereference. That totally has 
the possibility to impact execution of the system.


John.



Re: [Intel-gfx] [PATCH v10 6/7] drm/i915/gt: Increase sleep in gt_tlb selftest sanitycheck

2023-10-10 Thread John Harrison

On 10/10/2023 08:02, Jonathan Cavitt wrote:

For the gt_tlb live selftest, when operating on the GSC engine,
increase the timeout from 10 ms to 200 ms because the GSC
engine is a bit slower than the rest.

Additionally, increase the default timeout from 10 ms to 20 ms
because msleep < 20ms can sleep for up to 20ms.
I'm not seeing why that is a reason to make it always sleep for 20ms. 
msleep is not guaranteed to have any kind of high accuracy. It just 
vaguely guarantees to sleep for at least the time requested. The point 
of warning if used for small values is to check against the case where a 
larger sleep is a problem. E.g. if you must sleep for at least 1ms but 
no more than 5ms then you need to use a different function because 
msleep might violate that requirement. But if your requirement is simply 
to sleep for at least 10ms and who cares if it is longer (as 
demonstrated by the bump to 200ms for GSC), then it is fine to use 
msleep(10). Maybe it will waste time and sleep for 20ms, maybe it won't. 
But it's not a problem if it does. And if it doesn't then you haven't 
wasted the time.


John.



Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/gt/selftest_tlb.c | 11 +--
  1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_tlb.c 
b/drivers/gpu/drm/i915/gt/selftest_tlb.c
index 7e41f69fc818f..24beb94aa7a37 100644
--- a/drivers/gpu/drm/i915/gt/selftest_tlb.c
+++ b/drivers/gpu/drm/i915/gt/selftest_tlb.c
@@ -136,8 +136,15 @@ pte_tlbinv(struct intel_context *ce,
i915_request_get(rq);
i915_request_add(rq);
  
-	/* Short sleep to sanitycheck the batch is spinning before we begin */

-   msleep(10);
+   /*
+* Short sleep to sanitycheck the batch is spinning before we begin.
+* FIXME: Why is GSC so slow?
+*/
+   if (ce->engine->class == OTHER_CLASS)
+   msleep(200);
+   else
+   msleep(20);
+
if (va == vb) {
if (!i915_request_completed(rq)) {
pr_err("%s(%s): Semaphore sanitycheck failed %llx, with 
alignment %llx, using PTE size %x (phys %x, sg %x)\n",




Re: [Intel-gfx] [PATCH v10 4/7] drm/i915: No TLB invalidation on suspended GT

2023-10-10 Thread John Harrison

On 10/10/2023 08:02, Jonathan Cavitt wrote:

In case the GT is suspended, don't allow submission of new TLB invalidation
requests and cancel all pending requests. The TLB entries will be
invalidated either during GuC reload or on system resume.

Signed-off-by: Fei Yang 
Signed-off-by: Jonathan Cavitt 
CC: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 21 +--
  drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 +++
  3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 06c44f5c28776..ff7e7b90fd49b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -536,4 +536,5 @@ void intel_guc_dump_time_info(struct intel_guc *guc, struct 
drm_printer *p);
  
  int intel_guc_sched_disable_gucid_threshold_max(struct intel_guc *guc);
  
+void wake_up_all_tlb_invalidate(struct intel_guc *guc);

  #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index e9854652c2b52..b9c168ea57270 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1796,13 +1796,25 @@ static void __guc_reset_context(struct intel_context 
*ce, intel_engine_mask_t st
intel_context_put(parent);
  }
  
-void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)

+void wake_up_all_tlb_invalidate(struct intel_guc *guc)
  {
struct intel_guc_tlb_wait *wait;
+   unsigned long i;
+
+   if (!HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915))
+   return;
+
+   xa_lock_irq(&guc->tlb_lookup);
+   xa_for_each(&guc->tlb_lookup, i, wait)
+   wake_up(&wait->wq);
+   xa_unlock_irq(&guc->tlb_lookup);
+}
+
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t 
stalled)
What is changed on this line? Or is it just diff being confused and 
seeing the move of the 'wait' declaration as being the anchor point 
rather than the function declaration?


John.



+{
struct intel_context *ce;
unsigned long index;
unsigned long flags;
-   unsigned long i;
  
  	if (unlikely(!guc_submission_initialized(guc))) {

/* Reset called during driver load? GuC not yet initialised! */
@@ -1833,10 +1845,7 @@ void intel_guc_submission_reset(struct intel_guc *guc, 
intel_engine_mask_t stall
 * The full GT reset will have cleared the TLB caches and flushed the
 * G2H message queue; we can release all the blocked waiters.
 */
-   xa_lock_irq(&guc->tlb_lookup);
-   xa_for_each(&guc->tlb_lookup, i, wait)
-   wake_up(&wait->wq);
-   xa_unlock_irq(&guc->tlb_lookup);
+   wake_up_all_tlb_invalidate(guc);
  }
  
  static void guc_cancel_context_requests(struct intel_context *ce)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 98b103375b7ab..750cb63503dd7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -688,6 +688,8 @@ void intel_uc_suspend(struct intel_uc *uc)
/* flush the GSC worker */
intel_gsc_uc_flush_work(&uc->gsc);
  
+	wake_up_all_tlb_invalidate(guc);

+
if (!intel_guc_is_ready(guc)) {
guc->interrupts.enabled = false;
return;
@@ -736,6 +738,11 @@ static int __uc_resume(struct intel_uc *uc, bool 
enable_communication)
  
  	intel_gsc_uc_resume(>gsc);
  
+	if (HAS_GUC_TLB_INVALIDATION(gt->i915)) {

+   intel_guc_invalidate_tlb_engines(guc);
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+
return 0;
  }
  




Re: [Intel-gfx] [PATCH v10 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-10 Thread John Harrison

On 10/10/2023 08:02, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

The additional complexity incurred in this patch will be necessary for
range-based tlb invalidations, which will be platformed in the future.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  34 +++-
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  16 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |   4 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 186 +-
  7 files changed, 284 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..a1f7bdc602996 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,38 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
  }
  
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb_guc(guc);
+   }
+}
+
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
  
-	gen8_ggtt_invalidate(ggtt);

-
-   if (GRAPHICS_VER(i915) >= 12) {
-   struct intel_gt *gt;
+   if (!HAS_GUC_TLB_INVALIDATION(i915))
+   gen8_ggtt_invalidate(ggtt);
  
-		list_for_each_entry(gt, >gt_list, ggtt_link)

+   list_for_each_entry(gt, &ggtt->gt_list, ggtt_link) {
+   if (HAS_GUC_TLB_INVALIDATION(i915) &&
+   intel_guc_is_ready(&gt->uc.guc)) {
+   guc_ggtt_ct_invalidate(gt);
+   } else if (GRAPHICS_VER(i915) >= 12) {
intel_uncore_write_fw(gt->uncore,
  GEN12_GUC_TLB_INV_CR,
  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-   } else {
-   intel_uncore_write_fw(ggtt->vm.gt->uncore,
- GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   } else {
+   intel_uncore_write_fw(gt->uncore,
+ GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   }
}
  }
  
@@ -1243,7 +1259,7 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)

ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
}
  
-	if (intel_uc_wants_guc(>vm.gt->uc))

+   if (intel_uc_wants_guc_submission(&ggtt->vm.gt->uc))
ggtt->invalidate = guc_ggtt_invalidate;
else
ggtt->invalidate = gen8_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c 
b/drivers/gpu/drm/i915/gt/intel_tlb.c
index 139608c30d978..4bb13d1890e37 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -12,6 +12,7 @@
  #include "intel_gt_print.h"
  #include "intel_gt_regs.h"
  #include "intel_tlb.h"
+#include "uc/intel_guc.h"
  
  /*

   * HW architecture suggest typical invalidation time at 40us,
@@ -131,11 +132,24 @@ void intel_gt_invalidate_tlb_full(struct intel_gt *gt, 
u32 seqno)
return;
  
  	with_intel_gt_pm_if_awake(gt, wakeref) {

+   struct intel_guc *guc = &gt->uc.guc;
+
mutex_lock(&gt->tlb.invalidate_lock);
if (tlb_seqno_passed(gt, seqno))
goto 

Re: [Intel-gfx] [PATCH v10 2/7] drm/i915/guc: Add CT size delay helper

2023-10-10 Thread John Harrison

On 10/10/2023 08:02, Jonathan Cavitt wrote:

Add a helper function to the GuC CT buffer that reports the expected
time to process all outstanding requests.  As of now, there is no
functionality to check number of requests in the buffer, so the helper
function just reports 2 seconds, or 1ms per request up to the maximum
number of requests the CT buffer can store.

Suggested-by: John Harrison
Signed-off-by: Jonathan Cavitt
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 13 +
  1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
index 58e42901ff498..36afc1ce9fabd 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
@@ -120,6 +120,19 @@ static inline bool intel_guc_ct_enabled(struct 
intel_guc_ct *ct)
return ct->enabled;
  }
  
+/*

+ * GuC has a timeout of 1ms for a TLB invalidation response from GAM.  On a
+ * timeout GuC drops the request and has no mechanism to notify the host about
+ * the timeout.  There is also no mechanism for determining the number of
+ * outstanding requests in the CT buffer.  Ergo, keep a larger timeout that 
accounts
+ * for this individual timeout and the max number of outstanding requests that
+ * can be queued in CT buffer.
+ */
This feels like the wrong wording. TLB invalidations aren't even close 
to the slowest thing that goes through the CT buffer. And the 
description about dropping failed requests and such is irrelevant to the 
implementation/purpose of this helper. That is specific detail about one 
single use case of the helper. That might be the only user at this point 
but the intention is that other parts of the driver will be updated to 
call this as well rather than hard coding their own timeouts as they 
currently do.


I would suggest:

   Some H2G commands involve a synchronous response that the driver
   needs to wait for. In such cases, a timeout is required to prevent
   the driver from waiting forever in the case of an error (either no
   error response is defined in the protocol or something has died and
   requires a reset). The specific command may be defined as having a
   time bound response but the CT is a queue and that time guarantee
   only starts from the point when the command reaches the head of the
   queue and is processed by GuC.

   Ideally there would be a helper to report the progress of a given
   command through the CT. However, that would require a significant
   amount of work in the CT layer. In the meantime, provide a
   reasonable estimation of the worst case latency it should take for
   the entire queue to drain. And therefore, how long a caller should
   wait before giving up on their request. The current estimate is
   based on empirical measurement of a test that fills the buffer with
   context creation and destruction requests as they seem to be the
   slowest operation.



+static inline long intel_guc_ct_expected_delay(struct intel_guc_ct *ct)
This is not the 'expected' delay but the worst case maximum delay. Also, 
no need to force the callers to know about ct structures. They 
presumably have an intel_guc structure if they are sending H2G messages, 
and that is all you should need to know about. Having said that, the 
implementation isn't currently accessing any stored data, so why bother 
with a parameter at all?



+{
+   return HZ * 2;
Also, this needs to be based on the buffer size so that if the size were 
to increase then the time would as well.


My thought would be:

   long intel_guc_ct_max_queue_time_jiffies(void) {
    /*
     * A 4KB buffer full of context destroy commands takes a little
   over a second to process
 * so bump that to 2s to be super safe.
 */
    return (CTB_H2G_BUFFER_SIZE * HZ) / SZ_2K;
   }

John.



+}
+
  #define INTEL_GUC_CT_SEND_NB  BIT(31)
  #define INTEL_GUC_CT_SEND_G2H_DW_SHIFT0
  #define INTEL_GUC_CT_SEND_G2H_DW_MASK (0xff << INTEL_GUC_CT_SEND_G2H_DW_SHIFT)


Re: [Intel-gfx] [PATCH] drm/i915/gt: Temporarily force MTL into uncached mode

2023-10-10 Thread John Harrison

On 10/10/2023 09:44, Matt Roper wrote:

On Tue, Oct 10, 2023 at 05:42:28PM +0100, Tvrtko Ursulin wrote:

On 10/10/2023 17:17, Andi Shyti wrote:

Hi Matt,


FIXME: CAT errors are cropping up on MTL.  This removes them,
but the real root cause must still be diagnosed.

Do you have a link to specific IGT test(s) that illustrate the CAT
errors so that we can ensure that they now appear fixed in CI?

this one:

https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_124599v1/bat-mtlp-8/igt@i915_selftest@l...@hugepages.html

Andi

Wait, now I'm confused.  That's a failure caused by a different patch
series (one that we won't be moving forward with).  The live@hugepages
test is always passing on drm-tip today:
https://intel-gfx-ci.01.org/tree/drm-tip/igt@i915_selftest@l...@hugepages.html

yes, true, but that patch allows us to move forward with the
testing and hit the CAT error.

(it was the most reachable link I found :))


Is there a test that's giving CAT errors on drm-tip itself (even
sporadically) that we can monitor to see the impact of Jonathan's patch
here?

Otherwise this one:

https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13667/re-mtlp-3/igt@gem_exec_fe...@parallel.html#dmesg-warnings11

Parachuting in on a tangent - please do not mix CAT and CT errors. CAT, for me 
at least, associates with CATastrophic faults reported over CT channel, like 
GuC page faulting IIRC.

For CT errors maybe GuC folks can shed some light on what they mean.

0x6000 is GUC_ACTION_GUC2HOST_NOTIFY_MEMORY_CAT_ERROR so this actually
is a CAT error, delivered via the CT channel.
The history is that catastrophic memory errors (CAT is an abbreviation 
not an acronym) are never meant to happen in the upstream driver because 
we map all invalid addresses to a scratch page and silently hide such 
accesses. Hence there has been push back on adding support for an error 
channel which is officially impossible to hit. The problem is that we 
keep hitting it due to hardware and/or software bugs.


Because there is no official support for handling this notification, the 
CT layer reports it as an unexpected notification and barfs. As far as 
the CT layer is concerned, it is a corrupted packet from GuC. And thus 
the error reporting looks totally weird for what is just an illegal 
address access from some random part of the GPU. And note that it is 
very unlikely that GuC itself caused the page fault. It is much more 
plausible to be coming from an engine/EU/batch buffer instruction. 
Although as noted, the fundamental cause is believed to be broken page 
table updates due to cache coherency issues.


John.




Matt


Regards,

Tvrtko




Re: [Intel-gfx] [PATCH v2] drm/i915/gt: Temporarily force MTL into uncached mode

2023-10-10 Thread John Harrison

On 10/10/2023 07:36, Jonathan Cavitt wrote:

FIXME: CAT errors are cropping up on MTL.  This removes them,
but the real root cause must still be diagnosed.
I think 'hides' would be more accurate than 'removes'. At least until we 
have a better understanding of the issue.


Also, is there any performance penalty with this change? If we are going 
from fully cached to write combined then one assumes that something, 
somewhere is going to notice? Do we have any benchmark results or other 
tests that show an impact?


John.




Signed-off-by: Jonathan Cavitt 
---

v2: Apply FIXME to shmem_utils as well.

  drivers/gpu/drm/i915/gt/intel_gt.c | 6 +-
  drivers/gpu/drm/i915/gt/intel_lrc.c| 5 -
  drivers/gpu/drm/i915/gt/shmem_utils.c  | 8 +++-
  drivers/gpu/drm/i915/gt/uc/intel_guc.c | 5 -
  4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index ed32bf5b15464..b52c8eb0b033f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -1026,8 +1026,12 @@ enum i915_map_type intel_gt_coherent_map_type(struct 
intel_gt *gt,
/*
 * Wa_22016122933: always return I915_MAP_WC for Media
 * version 13.0 when the object is on the Media GT
+*
+* FIXME: CAT errors are cropping up on MTL.  This removes them,
+* but the real root cause must still be diagnosed.
 */
-   if (i915_gem_object_is_lmem(obj) || intel_gt_needs_wa_22016122933(gt))
+   if (i915_gem_object_is_lmem(obj) || intel_gt_needs_wa_22016122933(gt) ||
+   IS_METEORLAKE(gt->i915))
return I915_MAP_WC;
if (HAS_LLC(gt->i915) || always_coherent)
return I915_MAP_WB;
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index eaf66d9031665..8aaa4df84cb3e 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1124,8 +1124,11 @@ __lrc_alloc_state(struct intel_context *ce, struct 
intel_engine_cs *engine)
 * Wa_22016122933: For Media version 13.0, all Media GT shared
 * memory needs to be mapped as WC on CPU side and UC (PAT
 * index 2) on GPU side.
+*
+* FIXME: CAT errors are cropping up on MTL.  This removes them,
+* but the real root cause must still be diagnosed.
 */
-   if (intel_gt_needs_wa_22016122933(engine->gt))
+   if (intel_gt_needs_wa_22016122933(engine->gt) || 
IS_METEORLAKE(engine->i915))
i915_gem_object_set_cache_coherency(obj, 
I915_CACHE_NONE);
}
  
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c

index bccc3a1200bc6..a026c216fd286 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -43,7 +43,13 @@ struct file *shmem_create_from_object(struct 
drm_i915_gem_object *obj)
return file;
}
  
-	map_type = i915_gem_object_is_lmem(obj) ? I915_MAP_WC : I915_MAP_WB;

+   /*
+* FIXME: CAT errors are cropping up on MTL.  This removes them,
+* but the real root cause must still be diagnosed.
+*/
+   map_type = i915_gem_object_is_lmem(obj) ||
+  IS_METEORLAKE(to_i915(obj->base.dev)) ?
+  I915_MAP_WC : I915_MAP_WB;
ptr = i915_gem_object_pin_map_unlocked(obj, map_type);
if (IS_ERR(ptr))
return ERR_CAST(ptr);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 27df41c53b890..e3a7d61506188 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -774,8 +774,11 @@ struct i915_vma *intel_guc_allocate_vma(struct intel_guc 
*guc, u32 size)
 * Wa_22016122933: For Media version 13.0, all Media GT shared
 * memory needs to be mapped as WC on CPU side and UC (PAT
 * index 2) on GPU side.
+*
+* FIXME: CAT errors are cropping up on MTL.  This removes them,
+* but the real root cause must still be diagnosed.
 */
-   if (intel_gt_needs_wa_22016122933(gt))
+   if (intel_gt_needs_wa_22016122933(gt) || IS_METEORLAKE(gt->i915))
i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
  
  	vma = i915_vma_instance(obj, >ggtt->vm, NULL);




Re: [Intel-gfx] ✗ Fi.CI.BAT: failure for More print message helper updates

2023-10-10 Thread John Harrison

On 10/9/2023 19:26, Patchwork wrote:

Project List - Patchwork *Patch Details*
*Series:*   More print message helper updates
*URL:*  https://patchwork.freedesktop.org/series/124853/
*State:*failure
*Details:* 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_124853v1/index.html



  CI Bug Log - changes from CI_DRM_13732 -> Patchwork_124853v1


Summary

*FAILURE*

Serious unknown changes coming with Patchwork_124853v1 absolutely need 
to be

verified manually.

If you think the reported changes have nothing to do with the changes
introduced in Patchwork_124853v1, please notify your bug team 
(lgci.bug.fil...@intel.com) to allow them
to document this new failure mode, which will reduce false positives 
in CI.


External URL: 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_124853v1/index.html



Participating hosts (36 -> 34)

Missing (2): fi-kbl-soraka fi-snb-2520m


Possible new issues

Here are the unknown changes that may have been introduced in 
Patchwork_124853v1:



  CI changes


Possible regressions

  * boot:
  o fi-ilk-650: PASS


-> FAIL



System failed to boot properly. Didn't even get as far as loading the 
i915 module according to the logs. So definitely not caused by tweaking 
some debug prints within the i915 module. Also, system booted and ran 
just fine on the re-test.


John.


 *


Known issues

Here are the changes found in Patchwork_124853v1 that come from known 
issues:



  CI changes


Issues hit

  * boot:
  o fi-skl-guc: PASS


-> FAIL


(i915#8293 )


  IGT changes


Issues hit

 *

igt@gem_exec_suspend@basic-s0@lmem0:

  o bat-dg2-9: PASS


-> INCOMPLETE


(i915#9275 )
 *

igt@i915_selftest@live@requests:

  o bat-mtlp-8: PASS


-> ABORT


(i915#9414 )


Possible fixes

  * igt@kms_hdmi_inject@inject-audio:
  o fi-kbl-guc: FAIL


(IGT#3
)
-> PASS




Build changes

  * Linux: CI_DRM_13732 -> Patchwork_124853v1

CI-20190529: 20190529
CI_DRM_13732: 7c57bbfe2c6194cc4d4edf50466b057d7b191251 @ 
git://anongit.freedesktop.org/gfx-ci/linux
IGT_7523: 361c2f92f1fe5641090f2fc59951fcaba15387f5 @ 
https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
Patchwork_124853v1: 7c57bbfe2c6194cc4d4edf50466b057d7b191251 @ 
git://anongit.freedesktop.org/gfx-ci/linux



  Linux commits

6eb131b16d85 drm/i915: More use of GT specific print helpers
decb307d48d5 drm/i915/gt: More use of GT specific print helpers



Re: [Intel-gfx] [PATCH RESEND v2 0/2] Add drm_dbg_ratelimited()

2023-10-10 Thread John Harrison

On 10/10/2023 05:15, Andi Shyti wrote:

Hi,

I might have picked up the wrong series and missed some reviews
and the extra patch from Nirmoy with a real use of the
drm_dbg_ratelimited() that John was looking for.

Thanks,
Andi
I just found the original post of this from back in January 
(https://patchwork.freedesktop.org/series/112925/). Is there a reason 
why it was never merged? As noted, it appears to have a whole bunch of 
r-b's on it.


John.


v2:
pick the right patch with the following changes:
  - add more r-b's
  - add a patch 2 where the drm_dbg_ratelimited is actually used.

Nirmoy Das (2):
   drm/print: Add drm_dbg_ratelimited
   drm/i915: Ratelimit debug log in vm_fault_ttm

  drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 5 +++--
  include/drm/drm_print.h | 3 +++
  2 files changed, 6 insertions(+), 2 deletions(-)





Re: [Intel-gfx] [PATCH 1/2] drm/i915/gt: More use of GT specific print helpers

2023-10-09 Thread John Harrison

On 10/9/2023 13:02, Andi Shyti wrote:

Hi John,

...


if (intf_id >= INTEL_GSC_NUM_INTERFACES) {
-   drm_warn_once(>i915->drm, "GSC irq: intf_id %d is out of 
range", intf_id);
+   gt_warn_once(gt, "GSC irq: intf_id %d is out of range", 
intf_id);
return;
}
if (!HAS_HECI_GSC(gt->i915)) {
-   drm_warn_once(&gt->i915->drm, "GSC irq: not supported");
+   gt_warn_once(gt, "GSC irq: not supported");
return;
}
@@ -300,7 +301,7 @@ static void gsc_irq_handler(struct intel_gt *gt, unsigned 
int intf_id)
ret = generic_handle_irq(gt->gsc.intf[intf_id].irq);
if (ret)
-   drm_err_ratelimited(>i915->drm, "error handling GSC irq: 
%d\n", ret);
+   gt_err_ratelimited(gt, "error handling GSC irq: %d\n", ret);
   }
   void intel_gsc_irq_handler(struct intel_gt *gt, u32 iir)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_print.h 
b/drivers/gpu/drm/i915/gt/intel_gt_print.h
index 55a336a9ff061..7fdc78c79273e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_print.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_print.h
@@ -16,6 +16,9 @@
   #define gt_warn(_gt, _fmt, ...) \
drm_warn(&(_gt)->i915->drm, "GT%u: " _fmt, (_gt)->info.id, 
##__VA_ARGS__)
+#define gt_warn_once(_gt, _fmt, ...) \
+   drm_warn_once(&(_gt)->i915->drm, "GT%u: " _fmt, (_gt)->info.id, 
##__VA_ARGS__)
+

I would add the gt_warn_once() part in a different patch.

But this is the patch that uses it. You should not add dead code. The only
exception being if it is something large and complex that needs to be added
in stages for ease of code review. But this really doesn't count as large or
complex!

I wouldn't call it dead code if it's used right after... you
could also put all the *_warn_* changes in different patch.
I did start splitting it into errors vs debugs but then decided it 
wasn't worth the effort ;)




Anyway, I don't have a strong opinion for such a straight forward
patch, so that I'm fine with it as it is:

Reviewed-by: Andi Shyti 

Thanks :).



Andi




Re: [Intel-gfx] [PATCH 2/2] drm/i915: More use of GT specific print helpers

2023-10-09 Thread John Harrison

On 10/9/2023 12:54, Andi Shyti wrote:

Hi John,

...


--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -71,6 +71,7 @@
  #include "gem/i915_gem_pm.h"
  #include "gt/intel_gt.h"
  #include "gt/intel_gt_pm.h"
+#include "gt/intel_gt_print.h"
  #include "gt/intel_rc6.h"
  
  #include "pxp/intel_pxp.h"

@@ -429,7 +430,7 @@ static int i915_pcode_init(struct drm_i915_private *i915)
for_each_gt(gt, i915, id) {
ret = intel_pcode_init(gt->uncore);
if (ret) {
-   drm_err(>i915->drm, "gt%d: intel_pcode_init failed 
%d\n", id, ret);
+   gt_err(gt, "intel_pcode_init failed %d\n", ret);

using gt_*() print functions in the upper layers looks a bit
wrong to me. If we need GT printing, the prints need to be done
inside the function called, in this case would be
intel_pcode_init().
It is less wrong than using gt->i915->drm as a parameter and 'gt%d' in 
the format string. That is the whole point of the helper. The code has 
access to a gt object so it should use the gt helper to make use of that 
object rather than unrolling it and diving in to the gt internals.


As for moving the error message inside the init function itself. That is 
maybe a valid change but that potentially counts as a functional change 
and should be done by someone who actually knows the code. All I'm doing 
is improving the code layering by using the correct helper to hide the 
internal details of an object this layer should not know about.


John.



Andi




Re: [Intel-gfx] [PATCH 1/2] drm/i915/gt: More use of GT specific print helpers

2023-10-09 Thread John Harrison

On 10/9/2023 12:50, Andi Shyti wrote:

Hi John,

...


if (intf_id >= INTEL_GSC_NUM_INTERFACES) {
-   drm_warn_once(>i915->drm, "GSC irq: intf_id %d is out of 
range", intf_id);
+   gt_warn_once(gt, "GSC irq: intf_id %d is out of range", 
intf_id);
return;
}
  
  	if (!HAS_HECI_GSC(gt->i915)) {

-   drm_warn_once(>i915->drm, "GSC irq: not supported");
+   gt_warn_once(gt, "GSC irq: not supported");
return;
}
  
@@ -300,7 +301,7 @@ static void gsc_irq_handler(struct intel_gt *gt, unsigned int intf_id)
  
  	ret = generic_handle_irq(gt->gsc.intf[intf_id].irq);

if (ret)
-   drm_err_ratelimited(>i915->drm, "error handling GSC irq: 
%d\n", ret);
+   gt_err_ratelimited(gt, "error handling GSC irq: %d\n", ret);
  }
  
  void intel_gsc_irq_handler(struct intel_gt *gt, u32 iir)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_print.h 
b/drivers/gpu/drm/i915/gt/intel_gt_print.h
index 55a336a9ff061..7fdc78c79273e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_print.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_print.h
@@ -16,6 +16,9 @@
  #define gt_warn(_gt, _fmt, ...) \
drm_warn(&(_gt)->i915->drm, "GT%u: " _fmt, (_gt)->info.id, 
##__VA_ARGS__)
  
+#define gt_warn_once(_gt, _fmt, ...) \

+   drm_warn_once(&(_gt)->i915->drm, "GT%u: " _fmt, (_gt)->info.id, 
##__VA_ARGS__)
+

I would add the gt_warn_once() part in a different patch.
But this is the patch that uses it. You should not add dead code. The 
only exception being if it is something large and complex that needs to 
be added in stages for ease of code review. But this really doesn't 
count as large or complex!


John.



Andi




Re: [Intel-gfx] [PATCH] drm/print: Add drm_dbg_ratelimited

2023-10-09 Thread John Harrison

On 10/9/2023 12:43, Andi Shyti wrote:

Hi John,


From: Nirmoy Das 

Add a function for ratelimited debug print.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Reviewed-by: Matthew Auld 
Reviewed-by: Andi Shyti 
Signed-off-by: Nirmoy Das 
Signed-off-by: Andi Shyti 

Just a kind reminder!

This is the second time this patch has been sent and we have seen
some potential use of the drm_dbg_ratelimited().

But this patch does not actually add a user. So it is dead code at this
point, which is not allowed.

If you have code that wants to use such a helper then the helper should be
part of the patch that adds that code.

this is the kind of "Argh! If I had it!" patch. At the 3/4th
"Argh! If I had it!" then you decide to send it but don't
remember for which case you needed it.

But you should have sent it on the 1st patch set that wanted it.

And if you have code already merged that would benefit from it, then 
update that code to use it and post that as a patch together with this 
implementation.


And note that if your code is GT related, then it should use 
gt_dbg_ratelimited not drm_dbg_ratelimited.


John.




I'm sure that once this goes in it won't be long until people will
start using it. In any case, if it doesn't go in I will keep it
ready until the case comes.

Andi




Re: [Intel-gfx] [PATCH v8 3/7] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-09 Thread John Harrison

On 10/9/2023 01:56, Tvrtko Ursulin wrote:

On 06/10/2023 19:20, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

Purpose of xarray:
The tlb_lookup table is allocated as an xarray because the set of
pending TLB invalidations may have no upper bound.  The consequence of
this is that all actions interfacing with this table need to use the
xarray functions, such as xa_alloc_cyclic_irq for array insertion.

Purpose of must_wait_woken:
Our wait for the G2H ack for the completion of a TLB invalidation is
mandatory; we must wait for the HW to confirm that the physical
addresses are no longer accessible before we return those to the system.

On switching to using the wait_woken() convenience routine, we
introduced ourselves to an issue where wait_woken() may complete early
under a kthread that is stopped. Since we send a TLB invalidation when
we try to release pages from the shrinker, we can be called from any
process; including kthreads.

Using wait_woken() from any process context causes another issue. The
use of is_kthread_should_stop() assumes that any task with PF_KTHREAD
set was made by kthread_create() and has called set_kthread_struct().
This is not true for the raw kernel_thread():


This explanation misses the main point of my ask - which is to explain 
why a simpler scheme isn't sufficient. Simpler scheme aka not needing 
the xarray or any flavour of wait_woken().


In other words it is obvious we have to wait for the invalidation ack, 
but not obvious why we need a complicated scheme.
The alternative being to simply serialise all TLB invalidation requests? 
Thus, no complex tracking required as there is only one in flight at a 
time? That seems inefficient and a potential performance impact if a 
bunch of invalidations are required back to back. But given that the 
current scheme is global invalidation only (no support for ranges / per 
page invalidation yet), is it possible to get multiple back-to-back 
requests?





BUG: kernel NULL pointer dereference, address: 
[ 3089.759660] Call Trace:
[ 3089.762110]  wait_woken+0x4f/0x80
[ 3089.765496]  guc_send_invalidate_tlb+0x1fe/0x310 [i915]
[ 3089.770725]  ? syscall_return_via_sysret+0xf/0x7f
[ 3089.775426]  ? do_wait_intr_irq+0xb0/0xb0
[ 3089.779430]  ? __switch_to_asm+0x40/0x70
[ 3089.783349]  ? __switch_to_asm+0x34/0x70
[ 3089.787273]  ? __switch_to+0x7a/0x3e0
[ 3089.790930]  ? __switch_to_asm+0x34/0x70
[ 3089.794883]  intel_guc_invalidate_tlb_full+0x92/0xa0 [i915]
[ 3089.800487]  intel_invalidate_tlb_full+0x94/0x190 [i915]
[ 3089.805824]  intel_invalidate_tlb_full_sync+0x1b/0x30 [i915]
[ 3089.811508]  __i915_gem_object_unset_pages+0x138/0x150 [i915]
[ 3089.817279]  __i915_gem_object_put_pages+0x25/0x90 [i915]
[ 3089.822706]  i915_gem_shrink+0x532/0x7e0 [i915]
[ 3089.827264]  i915_gem_shrinker_scan+0x3d/0xd0 [i915]
[ 3089.832230]  do_shrink_slab+0x12c/0x2a0
[ 3089.836065]  shrink_slab+0xad/0x2b0
[ 3089.839550]  shrink_node+0xcc/0x410
[ 3089.843035]  do_try_to_free_pages+0xc6/0x380
[ 3089.847306]  try_to_free_pages+0xec/0x1c0
[ 3089.851312]  __alloc_pages_slowpath+0x3ad/0xd10
[ 3089.855845]  ? update_sd_lb_stats+0x636/0x710
[ 3089.860204]  __alloc_pages_nodemask+0x2d5/0x310
[ 3089.864737]  new_slab+0x265/0xa80
[ 3089.868053]  ___slab_alloc+0y_to_free_pages+0xec/0x1c0
[ 3089.871798]  ? copy_process+0x1e5/0x1a00
[ 3089.875717]  ? load_balance+0x165/0xb20
[ 3089.879555]  __slab_alloc+0x1c/0x30
[ 3089.883047]  kmem_cache_alloc_node+0x9f/0x240
[ 3089.887397]  ? copy_process+0x1e5/0x1a00
[ 3089.891314]  copy_process+0x1e5/0x1a00
[ 3089.895058]  ? __switch_to_asm+0x40/0x70
[ 3089.879555]  __slab_alloc+0x1c/0x30
[ 3089.883047]  kmem_cache_alloc_node+0x9f/0x240
[ 3089.887397]  ? copy_process+0x1e5/0x1a00
[ 3089.891314]  copy_process+0x1e5/0x1a00
[ 3089.895058]  ? __switch_to_asm+0x40/0x70
[ 3089.898977]  ? __switch_to_asm+0x34/0x70
[ 3089.902903]  ? __switch_to_asm+0x40/0x70
[ 3089.906828]  ? __switch_to_asm+0x34/0x70
[ 3089.910745]  _do_fork+0x83/0x350
[ 

Re: [Intel-gfx] [PATCH] drm/print: Add drm_dbg_ratelimited

2023-10-09 Thread John Harrison

On 10/9/2023 09:52, Andi Shyti wrote:

Hi,


From: Nirmoy Das 

Add a function for ratelimited debug print.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Reviewed-by: Matthew Auld 
Reviewed-by: Andi Shyti 
Signed-off-by: Nirmoy Das 
Signed-off-by: Andi Shyti 

Just a kind reminder!

This is the second time this patch has been sent and we have seen
some potential use of the drm_dbg_ratelimited().
But this patch does not actually add a user. So it is dead code at this 
point, which is not allowed.


If you have code that wants to use such a helper then the helper should 
be part of the patch that adds that code.


John.



Any feedback?

Thanks,
Andi




Re: [Intel-gfx] [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

2023-10-06 Thread John Harrison

On 10/3/2023 13:58, Umesh Nerlige Ramappa wrote:
On Fri, Sep 22, 2023 at 03:25:08PM -0700, john.c.harri...@intel.com 
wrote:

From: John Harrison 

The GuC has been extended to support a much more friendly engine
busyness interface. So partition the old interface into a 'busy_v1'
space and add 'busy_v2' support alongside. And if v2 is available, use
that in preference to v1. Note that v2 provides extra features over
and above v1 which will be exposed via PMU in subsequent patches.


Since we are thinking of using the existing busyness counter to expose 
the v2 values, we can drop the last sentence from above.




Signed-off-by: John Harrison 
---
drivers/gpu/drm/i915/gt/intel_engine_types.h  |   4 +-
.../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   4 +-
drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  82 ++--
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  55 ++-
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   9 +-
drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++
7 files changed, 427 insertions(+), 131 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
b/drivers/gpu/drm/i915/gt/intel_engine_types.h

index a7e6775980043..40fd8f984d64b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
ktime_t start;
};

-struct intel_engine_guc_stats {
+struct intel_engine_guc_stats_v1 {
/**
 * @running: Active state of the engine when busyness was last 
sampled.

 */
@@ -603,7 +603,7 @@ struct intel_engine_cs {
struct {
    union {
    struct intel_engine_execlists_stats execlists;
-    struct intel_engine_guc_stats guc;
+    struct intel_engine_guc_stats_v1 guc_v1;
    };


Overall, I would suggest having the renames as a separate patch. Would 
make the review easier.




    /**
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h

index f359bef046e0b..c190a99a36c38 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -137,7 +137,9 @@ enum intel_guc_action {
INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
-    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
+    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
+    INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
+    INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h

index 6c392bad29c19..e6502ab5f049f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -226,45 +226,61 @@ struct intel_guc {
struct mutex send_mutex;

/**
- * @timestamp: GT timestamp object that stores a copy of the 
timestamp

- * and adjusts it for overflow using a worker.
+ * @busy: Data used by the different versions of engine busyness 
implementations.

 */
-    struct {
-    /**
- * @lock: Lock protecting the below fields and the engine 
stats.

- */
-    spinlock_t lock;
-
-    /**
- * @gt_stamp: 64 bit extended value of the GT timestamp.
- */
-    u64 gt_stamp;
-
-    /**
- * @ping_delay: Period for polling the GT timestamp for
- * overflow.
- */
-    unsigned long ping_delay;
-
-    /**
- * @work: Periodic work to adjust GT timestamp, engine and
- * context usage for overflows.
- */
-    struct delayed_work work;
-
+    union {
    /**
- * @shift: Right shift value for the gpm timestamp
+ * @v1: Data used by v1 engine busyness implementation. 
Mostly a copy
+ * of the GT timestamp extended to 64 bits and the worker 
for maintaining it.

 */
-    u32 shift;
+    struct {
+    /**
+ * @lock: Lock protecting the below fields and the 
engine stats.

+ */
+    spinlock_t lock;
+
+    /**
+ * @gt_stamp: 64 bit extended value of the GT timestamp.
+ */
+    u64 gt_stamp;
+
+    /**
+ * @ping_delay: Period for polling the GT timestamp for
+ * overflow.
+ */
+    unsigned long ping_delay;
+
+    /**
+ * @work: Periodic work to adjust GT timestamp, engine and
+ * context usage for overflows.
+ */
+    struct delayed_work work;
+
+    /**
+ * @shift: Right shift value for the gpm timestamp

Re: [Intel-gfx] [bug report] drm/i915: Move submission tasklet to i915_sched_engine

2023-10-06 Thread John Harrison

Tvrtko, would you have any thoughts on this one?

John.


On 10/4/2023 02:57, Dan Carpenter wrote:

Hello Matthew Brost,

This is a semi-automatic email about new static checker warnings.

The patch 22916bad07a5: "drm/i915: Move submission tasklet to
i915_sched_engine" from Jun 17, 2021, leads to the following Smatch
complaint:

 drivers/gpu/drm/i915/gt/intel_execlists_submission.c:3659 
rcu_virtual_context_destroy()
 warn: variable dereferenced before check 've->base.sched_engine' (see line 
3633)

drivers/gpu/drm/i915/gt/intel_execlists_submission.c
   3632  */
   3633 tasklet_kill(&ve->base.sched_engine->tasklet);
  ^^^
The patch introduced a new dereference here

   3634 
   3635 /* Decouple ourselves from the siblings, no more access 
allowed. */
   3636 for (n = 0; n < ve->num_siblings; n++) {
   3637 struct intel_engine_cs *sibling = ve->siblings[n];
   3638 struct rb_node *node = >nodes[sibling->id].rb;
   3639 
   3640 if (RB_EMPTY_NODE(node))
   3641 continue;
   3642 
   3643 spin_lock_irq(>sched_engine->lock);
   3644 
   3645 /* Detachment is lazily performed in the 
sched_engine->tasklet */
   3646 if (!RB_EMPTY_NODE(node))
   3647 rb_erase_cached(node, 
&sibling->execlists.virtual);
   3648 
   3649 spin_unlock_irq(&sibling->sched_engine->lock);
   3650 }
   3651 
GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.sched_engine->tasklet));
   3652 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
   3653 
   3654 lrc_fini(&ve->context);
   3655 intel_context_fini(&ve->context);
   3656 
   3657 if (ve->base.breadcrumbs)
   3658 intel_breadcrumbs_put(ve->base.breadcrumbs);
   3659 if (ve->base.sched_engine)
 ^
But previous code had assumed the sched_engine could be NULL.

   3660 i915_sched_engine_put(ve->base.sched_engine);
   3661 intel_engine_free_request_pool(&ve->base);

regards,
dan carpenter




Re: [Intel-gfx] [PATCH v7 4/5] drm/i915/gt: Increase sleep in gt_tlb selftest sanitycheck

2023-10-06 Thread John Harrison

On 10/5/2023 12:35, Jonathan Cavitt wrote:

For the gt_tlb live selftest, increase the timeout from 10 ms to 200 ms.
200 ms should be more than enough time, and 10 ms was too aggressive.
This is simply waiting for a request to begin executing on an idle 
system? How can 10ms possibly be too aggressive?


John.



Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/gt/selftest_tlb.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_tlb.c 
b/drivers/gpu/drm/i915/gt/selftest_tlb.c
index 7e41f69fc818f..46e0a1dbecc8d 100644
--- a/drivers/gpu/drm/i915/gt/selftest_tlb.c
+++ b/drivers/gpu/drm/i915/gt/selftest_tlb.c
@@ -137,7 +137,7 @@ pte_tlbinv(struct intel_context *ce,
i915_request_add(rq);
  
  	/* Short sleep to sanitycheck the batch is spinning before we begin */

-   msleep(10);
+   msleep(200);
if (va == vb) {
if (!i915_request_completed(rq)) {
pr_err("%s(%s): Semaphore sanitycheck failed %llx, with 
alignment %llx, using PTE size %x (phys %x, sg %x)\n",




Re: [Intel-gfx] [PATCH v7 3/5] drm/i915: No TLB invalidation on wedged or suspended GT

2023-10-06 Thread John Harrison




On 10/6/2023 03:23, Tvrtko Ursulin wrote:



On 05/10/2023 20:35, Jonathan Cavitt wrote:

...
+static bool intel_gt_is_enabled(const struct intel_gt *gt)
+{
+    /* Check if GT is wedged or suspended */
+    if (intel_gt_is_wedged(gt) || !intel_irqs_enabled(gt->i915))
+    return false;
+    return true;
+}
+
  static int guc_send_invalidate_tlb(struct intel_guc *guc, u32 type)
  {
  struct intel_guc_tlb_wait _wq, *wq = &_wq;
@@ -4763,7 +4786,8 @@ static int guc_send_invalidate_tlb(struct 
intel_guc *guc, u32 type)

  };
  u32 size = ARRAY_SIZE(action);
  -    if (!intel_guc_ct_enabled(&guc->ct))
+    if (!intel_gt_is_enabled(gt) ||
+    !intel_guc_ct_enabled(&guc->ct))


IMO this reads confused but I leave it to the GuC experts to decide 
what makes sense. Not only that it reads confused but it does not inspire 
confidence that it closes any race, since this state can still change 
just after this check, and then the invalidation request gets 
submitted (contrary to what the commit says?). Only thing it does 
below is skip the wait and the time out error message. Again, I leave 
it for people who know the GuC state transition flows to bless this part.


Regards,

Tvrtko
Regarding confused naming, I personally still think that 
intel_gt_is_enabled() is a bad name. Even the comment inside the 
function does not mention 'enable', it says 'wedged or suspended'. One 
could certainly argue that the GT is also not currently enabled if GuC 
is in use but the CT channel is down.


Regarding race conditions, the only things that can take the CT channel 
down are driver shutdown, suspend and GT reset. On the submission side, 
the assumption is that the scheduling levels of the driver are not going 
to call in to the submission backend without suitable locking held to 
ensure those operations cannot occur concurrently. Is the same not true 
here? We have to guard against the situation where the call starts from 
a 'bad' state, e.g. wedged. But the lowest level of code can't be 
expected to take higher level locks. From all the way down here, we have 
no idea what the upper levels may or may not be doing and what locks may 
or may not have been acquired, and therefore what locks may or may not 
be safe to acquire.


John.




  return -EINVAL;
    init_waitqueue_head(&_wq.wq);
@@ -4806,7 +4830,8 @@ static int guc_send_invalidate_tlb(struct 
intel_guc *guc, u32 type)

   * can be queued in CT buffer.
   */
  #define OUTSTANDING_GUC_TIMEOUT_PERIOD  (HZ * 2)
-    if (!must_wait_woken(, OUTSTANDING_GUC_TIMEOUT_PERIOD)) {
+    if (intel_gt_is_enabled(gt) &&
+    !must_wait_woken(, OUTSTANDING_GUC_TIMEOUT_PERIOD)) {
  gt_err(gt,
 "TLB invalidation response timed out for seqno 
%u\n", seqno);

  err = -ETIME;
diff --git a/drivers/gpu/drm/i915/i915_driver.c 
b/drivers/gpu/drm/i915/i915_driver.c

index ccbb2834cde07..0c9d9826d2f41 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -72,6 +72,7 @@
  #include "gt/intel_gt.h"
  #include "gt/intel_gt_pm.h"
  #include "gt/intel_rc6.h"
+#include "gt/intel_tlb.h"
    #include "pxp/intel_pxp.h"
  #include "pxp/intel_pxp_debugfs.h"
@@ -1093,6 +1094,9 @@ static int i915_drm_suspend(struct drm_device 
*dev)

  intel_dp_mst_suspend(dev_priv);
    intel_runtime_pm_disable_interrupts(dev_priv);
+
+    intel_gt_tlb_suspend_all(dev_priv);
+
  intel_hpd_cancel_work(dev_priv);
    intel_suspend_encoders(dev_priv);
@@ -1264,6 +1268,8 @@ static int i915_drm_resume(struct drm_device *dev)
    intel_gvt_resume(dev_priv);
  +    intel_gt_tlb_resume_all(dev_priv);
+
  enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
    return 0;




Re: [Intel-gfx] [PATCH v7 2/5] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-06 Thread John Harrison

On 10/6/2023 09:18, John Harrison wrote:

On 10/6/2023 03:20, Nirmoy Das wrote:


On 10/6/2023 12:11 PM, Tvrtko Ursulin wrote:


Hi,


Andi asked me to summarize what I think is unaddressed review 
feedback so far in order to consolidate and enable hopefully things 
to move forward. So I will try to re-iterate the comments and 
questions below.


But also note that there is a bunch of new valid comments from John 
against v7 which I will not repeat.


On 05/10/2023 20:35, Jonathan Cavitt wrote:

...
+enum intel_guc_tlb_invalidation_type {
+    INTEL_GUC_TLB_INVAL_FULL = 0x0,
+    INTEL_GUC_TLB_INVAL_GUC = 0x3,


New question - are these names coming from the GuC iface? I find it 
confusing that full does not include GuC but maybe it is just me. So 
maybe full should be called GT or something? Although then again it 
wouldn't be clear GT does not include the GuC..  bummer. GPU? Dunno. 
Minor confusion I guess so can keep as is.


I agree this is a bit of a confusing name. We are using 
INTEL_GUC_TLB_INVAL_GUC to invalidate ggtt, how about 
INTEL_GUC_TLB_INVAL_GGTT ?



The GuC interface spec says:

GUC_TLB_INV_TYPE_TLB_INV_FULL_INTRA_VF = 0x00
Full TLB invalidation within a VF. Invalidates VF’s TLBs only if
that VF is active, will invalidate across all engines.

GUC_TLB_INV_TYPE_TLB_INV_GUC = 0x03
Guc TLB Invalidation.


So the 'GUC' type is not GGTT, it is the TLBs internal to GuC itself 
is how I would read the above. Whereas 'FULL' is everything that is 
per-VF, aka everything in the GT that is beyond the GuC level - i.e. 
the engines, EUs and everything from there on.


So I think the INVAL_GUC name is correct. But maybe INVAL_FULL should 
be called INVAL_VF? Or INVAL_ENGINES if you don't like using the VF 
term in a non-SRIOV capable driver?


John.



PS: The function names should also match the type name.

Currently the functions are:
    int intel_guc_invalidate_tlb_full(struct intel_guc *guc);
    int intel_guc_invalidate_tlb(struct intel_guc *guc);

The second should have a suffix to say what is being invalidated - e.g. 
intel_guc_invalidate_tlb_guc(). The 'guc' in the prefix is just 
describing the mechanism not the target. So I would read the above as 
'full' being a subset of 'blank'.


John.


Re: [Intel-gfx] [PATCH v7 2/5] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-06 Thread John Harrison

On 10/6/2023 03:20, Nirmoy Das wrote:


On 10/6/2023 12:11 PM, Tvrtko Ursulin wrote:


Hi,


Andi asked me to summarize what I think is unaddressed review 
feedback so far in order to consolidate and enable hopefully things 
to move forward. So I will try to re-iterate the comments and 
questions below.


But also note that there is a bunch of new valid comments from John 
against v7 which I will not repeat.


On 05/10/2023 20:35, Jonathan Cavitt wrote:

...
+enum intel_guc_tlb_invalidation_type {
+    INTEL_GUC_TLB_INVAL_FULL = 0x0,
+    INTEL_GUC_TLB_INVAL_GUC = 0x3,


New question - are these names coming from the GuC iface? I find it 
confusing that full does not include GuC but maybe it is just me. So 
maybe full should be called GT or something? Although then again it 
wouldn't be clear GT does not include the GuC.. bummer. GPU? Dunno. 
Minor confusion I guess so can keep as is.


I agree this is a bit of a confusing name. We are using 
INTEL_GUC_TLB_INVAL_GUC to invalidate ggtt, how about 
INTEL_GUC_TLB_INVAL_GGTT ?



The GuC interface spec says:

   GUC_TLB_INV_TYPE_TLB_INV_FULL_INTRA_VF = 0x00
   Full TLB invalidation within a VF. Invalidates VF’s TLBs only if
   that VF is active, will invalidate across all engines.

   GUC_TLB_INV_TYPE_TLB_INV_GUC = 0x03
   Guc TLB Invalidation.


So the 'GUC' type is not GGTT, it is the TLBs internal to GuC itself is 
how I would read the above. Whereas 'FULL' is everything that is per-VF, 
aka everything in the GT that is beyond the GuC level - i.e. the 
engines, EUs and everything from there on.


So I think the INVAL_GUC name is correct. But maybe INVAL_FULL should be 
called INVAL_VF? Or INVAL_ENGINES if you don't like using the VF term in 
a non-SRIOV capable driver?


John.


Re: [Intel-gfx] [PATCH v7 1/5] drm/i915: Add GuC TLB Invalidation device info flags

2023-10-05 Thread John Harrison

On 10/5/2023 12:35, Jonathan Cavitt wrote:

Add device info flags for if GuC TLB Invalidation is enabled.

Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/i915_drv.h  | 1 +
  drivers/gpu/drm/i915/intel_device_info.h | 3 ++-
  2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cb60fc9cf8737..c53c5586c40c8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -801,4 +801,5 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
  #define HAS_LMEMBAR_SMEM_STOLEN(i915) (!HAS_LMEM(i915) && \
   GRAPHICS_VER_FULL(i915) >= IP_VER(12, 
70))
  
+#define HAS_GUC_TLB_INVALIDATION(i915)	(INTEL_INFO(i915)->has_guc_tlb_invalidation)
These defines seem to be in completely random order, but it is probably 
still better to put this one next to HAS_GUC_DEPRIV for at least a 
little bit of consistency!



  #endif
diff --git a/drivers/gpu/drm/i915/intel_device_info.h 
b/drivers/gpu/drm/i915/intel_device_info.h
index 39817490b13fd..1cb2beff51835 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -153,6 +153,7 @@ enum intel_ppgtt_type {
func(has_heci_pxp); \
func(has_heci_gscfi); \
func(has_guc_deprivilege); \
+   func(has_guc_tlb_invalidation); \
func(has_l3_ccs_read); \
func(has_l3_dpf); \
func(has_llc); \
@@ -173,7 +174,7 @@ enum intel_ppgtt_type {
func(has_coherent_ggtt); \
func(tuning_thread_rr_after_dep); \
func(unfenced_needs_alignment); \
-   func(hws_needs_physical);
+   func(hws_needs_physical); \

Why this?

John.

  
  struct intel_ip_version {

u8 ver;




Re: [Intel-gfx] [PATCH v7 2/5] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-05 Thread John Harrison

On 10/5/2023 12:35, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.
The tlb_lookup table is allocated as an xarray because the set of
pending TLB invalidations may have no upper bound.  The consequence of
this is that all actions interfacing with this table need to use the
xarray functions, such as xa_alloc_cyclic_irq for array insertion.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  GuC-based GGTT invalidation is incompatible with
MMIO invalidation so we should not perform MMIO invalidation when
GuC-based GGTT invalidation is expected.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  34 ++-
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  14 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|  22 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |   4 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   1 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 211 +-
  7 files changed, 307 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 4d7d88b92632b..18f23f27f1572 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,38 @@ static void gen8_ggtt_invalidate(struct i915_ggtt *ggtt)
intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN);
  }
  
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)

+{
+   struct intel_uncore *uncore = gt->uncore;
+   intel_wakeref_t wakeref;
+
+   with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+   struct intel_guc *guc = &gt->uc.guc;
+
+   intel_guc_invalidate_tlb(guc);
+   }
+}
+
  static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
  {
struct drm_i915_private *i915 = ggtt->vm.i915;
+   struct intel_gt *gt;
  
-	gen8_ggtt_invalidate(ggtt);

-
-   if (GRAPHICS_VER(i915) >= 12) {
-   struct intel_gt *gt;
+   if (!HAS_GUC_TLB_INVALIDATION(i915))
+   gen8_ggtt_invalidate(ggtt);
  
-		list_for_each_entry(gt, &ggtt->gt_list, ggtt_link)

+   list_for_each_entry(gt, &ggtt->gt_list, ggtt_link) {
+   if (HAS_GUC_TLB_INVALIDATION(i915) &&
+   intel_guc_is_ready(&gt->uc.guc)) {
+   guc_ggtt_ct_invalidate(gt);
+   } else if (GRAPHICS_VER(i915) >= 12) {
intel_uncore_write_fw(gt->uncore,
  GEN12_GUC_TLB_INV_CR,
  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-   } else {
-   intel_uncore_write_fw(ggtt->vm.gt->uncore,
- GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   } else {
+   intel_uncore_write_fw(gt->uncore,
+ GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+   }
}
  }
  
@@ -1243,7 +1259,7 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)

ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
}
  
-	if (intel_uc_wants_guc(&ggtt->vm.gt->uc))

+   if (intel_uc_wants_guc_submission(&ggtt->vm.gt->uc))
ggtt->invalidate = guc_ggtt_invalidate;
else
ggtt->invalidate = gen8_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/gt/intel_tlb.c 
b/drivers/gpu/drm/i915/gt/intel_tlb.c
index 139608c30d978..a84563c178bc6 100644
--- a/drivers/gpu/drm/i915/gt/intel_tlb.c
+++ b/drivers/gpu/drm/i915/gt/intel_tlb.c
@@ -12,6 +12,7 @@
  #include "intel_gt_print.h"
  #include "intel_gt_regs.h"
  #include "intel_tlb.h"
+#include "uc/intel_guc.h"
  
  /*

   * HW architecture suggest typical invalidation time at 40us,
@@ -131,11 +132,22 @@ void intel_gt_invalidate_tlb_full(struct intel_gt *gt, 
u32 seqno)
return;
  
  	with_intel_gt_pm_if_awake(gt, wakeref) {

+   struct intel_guc *guc = &gt->uc.guc;
+
 

Re: [Intel-gfx] [RFC PATCH] drm/i915/gt: Do not treat MCR locking timeouts as errors

2023-10-04 Thread John Harrison

On 10/4/2023 13:58, Andi Shyti wrote:

Hi Matt,


The MCR steering semaphore is a shared lock entry between i915
and various firmware components.

Getting the lock might synchronize on some shared resources.
Sometimes though, it might happen that the firmware forgets to
unlock causing unnecessary noise in the driver which keeps doing
what was supposed to do, ignoring the problem.

Do not consider this failure as an error, but just print a debug
message stating that the MCR locking has been skipped.

On the driver side we still have spinlocks that make sure that
the access to the resources is serialized.

Signed-off-by: Andi Shyti 
Cc: Jonathan Cavitt 
Cc: Matt Roper 
Cc: Nirmoy Das 
---
drivers/gpu/drm/i915/gt/intel_gt_mcr.c | 6 ++
1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c 
b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
index 326c2ed1d99b..51eb693df39b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
@@ -395,10 +395,8 @@ void intel_gt_mcr_lock(struct intel_gt *gt, unsigned long 
*flags)
 * would indicate some hardware/firmware is misbehaving and not
 * releasing it properly.
 */
-   if (err == -ETIMEDOUT) {
-   gt_err_ratelimited(gt, "hardware MCR steering semaphore timed 
out");
-   add_taint_for_CI(gt->i915, TAINT_WARN);  /* CI is now 
unreliable */
-   }
+   if (err == -ETIMEDOUT)
+   gt_dbg(gt, "hardware MCR steering semaphore timed out");
}
/**

Are we sure this does not warrant a level higher than dbg, such as
notice/warn?

We might make it a warn, but this doesn't change much the economy
of the driver as we will keep doing what we were supposed to do.


Because how can we be sure the two entities will not stomp on
each other toes if we failed to obtain lock?

So far, in all the research I've done, no one looks like using
MCR lock, but yet someone is stuck in it.

If someone has the lock then that someone thinks they are using it. Just
because you can't see what some piece of IFWI is doing doesn't mean it
isn't doing it. And if it is a genuinely missing unlock then it needs to be
tracked down and fixed with an IFWI update otherwise the system is going to
be unstable from that point on.

But I'm not changing here the behavior of the driver. The driver
will keep doing what was doing before.

Because this most probably won't be noticed by the user, then I
don't see why it should shout out loud that the system is
unusable when most probably it is.

That's like saying that any random race condition isn't likely to be
noticed by the user so it's not a big deal if we're missing a few
mutexes or spinlocks somewhere...even though there's likely to be no
user-visible impact to any race condition 99% of the time, it's the 1%
that winds up being absolutely catastrophic.

Not really... normally if you hit a spinlock/mutex race
condition, you end up in a deadlock and stall the system. In this
case, I agree that the lock should be sorted out by the hardware,
but in the meantime i915 is *already* ignoring it.
Um, "a deadlock and stall the system" is exactly what is happening here. 
To prevent a total hang, we are ignoring the deadlock and proceeding 
anyway. Essentially moving to the situation of having a critical section 
which is not protected by the mutex at all.  No matter how you phrase 
it, that is a critical section failure and you do not know how the 1% 
failure might manifest.




I'm not making any behavioral change with this patch.

What I'm trying to say is that if the system doesn't crash, then
let it go... don't crash it on purpose just because there is a
locking situation and we think it has a catastrophic effect, but
in reality is not producing anything (like practically in our
case, you can check the CI results[*]).
We are not going to 'crash it on purpose'. We are printing out an error 
message to say that an error has occurred. Which it has. And as above, 
just because you haven't noticed hitting a catastrophic race condition 
yet doesn't mean that it isn't there.


John.



[*] https://patchwork.freedesktop.org/patch/560733/?series=124599=1


If we're not obtaining the MCR lock as expected and are simply moving
forward to force our own steering (possibly at the same time firmware is
programming steering to a different value), you probably won't actually
see a problem either because the operations won't wind up interleaving
in a problematic order, or because the driver and the firmware both
happen to be trying to steer to the same instance (e.g., instance #0 is
a quite common target).  But even if they're hard to hit, the
possibility for a major problem is still there and basically we need to
consider the whole platform to be in an unknown, unstable state once
we've detected one of these issues.

Based on some earlier experiments, it sounds like the problem at the
moment is that we've just been too 

Re: [Intel-gfx] [RFC PATCH] drm/i915/gt: Do not treat MCR locking timeouts as errors

2023-10-04 Thread John Harrison

On 10/4/2023 13:09, Andi Shyti wrote:

Hi John,


The MCR steering semaphore is a shared lock entry between i915
and various firmware components.

Getting the lock might synchronize on some shared resources.
Sometimes though, it might happen that the firmware forgets to
unlock causing unnecessary noise in the driver which keeps doing
what was supposed to do, ignoring the problem.

Do not consider this failure as an error, but just print a debug
message stating that the MCR locking has been skipped.

On the driver side we still have spinlocks that make sure that
the access to the resources is serialized.

Signed-off-by: Andi Shyti 
Cc: Jonathan Cavitt 
Cc: Matt Roper 
Cc: Nirmoy Das 
---
 drivers/gpu/drm/i915/gt/intel_gt_mcr.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c 
b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
index 326c2ed1d99b..51eb693df39b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
@@ -395,10 +395,8 @@ void intel_gt_mcr_lock(struct intel_gt *gt, unsigned long 
*flags)
 * would indicate some hardware/firmware is misbehaving and not
 * releasing it properly.
 */
-   if (err == -ETIMEDOUT) {
-   gt_err_ratelimited(gt, "hardware MCR steering semaphore timed 
out");
-   add_taint_for_CI(gt->i915, TAINT_WARN);  /* CI is now 
unreliable */
-   }
+   if (err == -ETIMEDOUT)
+   gt_dbg(gt, "hardware MCR steering semaphore timed out");
 }
 /**

Are we sure this does not warrant a level higher than dbg, such as
notice/warn?

We might make it a warn, but this doesn't change much the economy
of the driver as we will keep doing what we were supposed to do.


Because how can we be sure the two entities will not stomp on
each other toes if we failed to obtain lock?

So far, in all the research I've done, no one looks like using
MCR lock, but yet someone is stuck in it.

If someone has the lock then that someone thinks they are using it. Just
because you can't see what some piece of IFWI is doing doesn't mean it
isn't doing it. And if it is a genuinely missing unlock then it needs to be
tracked down and fixed with an IFWI update otherwise the system is going to
be unstable from that point on.

But I'm not changing here the behavior of the driver. The driver
will keep doing what was doing before.

And what it is doing is dangerous and can lead to unpredictable results
because a critical section resource access is no longer wrapped with a
critical section lock. Hence there is an error message to say Here Be
Dragons.

hehe!

What are you suggesting, then? Should we reset the GT after
hitting the MCR lock issue?
No. I'm suggesting that you don't hide the issue by removing the error 
message. It is there for a reason. Just because that reason is being hit 
doesn't mean you should remove the message.




We could, but I rather prefer to work with what's available.


Because this most probably won't be noticed by the user, then I
don't see why it should shout out loud that the system is
unusable when most probably it is.

Just because a race condition is hard to hit doesn't mean it won't be hit.

We are hitting it, constantly, but it's not producing any effect.
Even when raising the MCR wait timeout. Which means that most
probably someone is having a nap on the lock.
No. You are hitting the lock contention problem constantly and so far 
are not seeing any *visible* effect.


The point is that there is still a potential race condition which you 
haven't hit yet which could lead to data corruption, page faults, 
crashes, etc. (because a TLB invalidation access went to the wrong 
target) or the CPU/GPU melting itself off the board (because a power 
management access went to the wrong target).





The point of shouting out loud is that we know for a fact a problem has
occurred. That problem might lead to nothing or it might lead to subtle data
corruption, gross crashes or anything in between.

yes, agree... the message still stays, though, with a bit of a
lower catastrophe.


BTW, to my understanding this is not an IFWI problem. We checked
with different versions of IFWI and the issue is the same.

Which implies it is a driver bug after all? In which case you absolutely
should not be papering over it in the driver.

This section is serialized by a spinlock and besides I haven't
found any place else except for the TLB invalidation and the
resume where we can incur a locking situation.
There is a bug somewhere. Either it is IFWI or it is KMD. You can't say 
"I can't find a problem therefore there is no problem".


John.




Thanks a lot for your inputs and discussion!
Andi




Re: [Intel-gfx] [RFC PATCH] drm/i915/gt: Do not treat MCR locking timeouts as errors

2023-10-04 Thread John Harrison

On 10/4/2023 12:35, Andi Shyti wrote:

Hi John,


The MCR steering semaphore is a shared lock entry between i915
and various firmware components.

Getting the lock might synchronize on some shared resources.
Sometimes though, it might happen that the firmware forgets to
unlock causing unnecessary noise in the driver which keeps doing
what was supposed to do, ignoring the problem.

Do not consider this failure as an error, but just print a debug
message stating that the MCR locking has been skipped.

On the driver side we still have spinlocks that make sure that
the access to the resources is serialized.

Signed-off-by: Andi Shyti 
Cc: Jonathan Cavitt 
Cc: Matt Roper 
Cc: Nirmoy Das 
---
drivers/gpu/drm/i915/gt/intel_gt_mcr.c | 6 ++
1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c 
b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
index 326c2ed1d99b..51eb693df39b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
@@ -395,10 +395,8 @@ void intel_gt_mcr_lock(struct intel_gt *gt, unsigned long 
*flags)
 * would indicate some hardware/firmware is misbehaving and not
 * releasing it properly.
 */
-   if (err == -ETIMEDOUT) {
-   gt_err_ratelimited(gt, "hardware MCR steering semaphore timed 
out");
-   add_taint_for_CI(gt->i915, TAINT_WARN);  /* CI is now 
unreliable */
-   }
+   if (err == -ETIMEDOUT)
+   gt_dbg(gt, "hardware MCR steering semaphore timed out");
}
/**

Are we sure this does not warrant a level higher than dbg, such as
notice/warn?

We might make it a warn, but this doesn't change much the economy
of the driver as we will keep doing what we were supposed to do.


Because how can we be sure the two entities will not stomp on
each other toes if we failed to obtain lock?

So far, in all the research I've done, no one looks like using
MCR lock, but yet someone is stuck in it.

If someone has the lock then that someone thinks they are using it. Just
because you can't see what some piece of IFWI is doing doesn't mean it
isn't doing it. And if it is a genuinely missing unlock then it needs to be
tracked down and fixed with an IFWI update otherwise the system is going to
be unstable from that point on.

But I'm not changing here the behavior of the driver. The driver
will keep doing what was doing before.
And what it is doing is dangerous and can lead to unpredictable results 
because a critical section resource access is no longer wrapped with a 
critical section lock. Hence there is an error message to say Here Be 
Dragons.





Because this most probably won't be noticed by the user, then I
don't see why it should shout out loud that the system is
unusable when most probably it is.

Just because a race condition is hard to hit doesn't mean it won't be hit.

The point of shouting out loud is that we know for a fact a problem has 
occurred. That problem might lead to nothing or it might lead to subtle 
data corruption, gross crashes or anything in between.




BTW, to my understanding this is not an IFWI problem. We checked
with different versions of IFWI and the issue is the same.
Which implies it is a driver bug after all? In which case you absolutely 
should not be papering over it in the driver.




Besides we received reports also from systems that are not using
IFWI at all.
There is no system that does not use IFWI. Integrated or discrete, all 
systems have an IFWI. It's just part of the main BIOS on an integrated 
platform.


John.




(How can we be sure about
"forgot to unlock" vs "in prolonged active use"?

There is a patch from Jonathan that is testing a different
timeout.


Or if we can be sure, can
we force unlock and take the lock for the driver explicitly?)

I sent a patch with this solution and Matt turned it down.

Presumably because both forcing a lock and ignoring a failed lock are Bad
Things to be doing!
Just because some entity out of our control isn't playing friendly doesn't
mean we can ignore the problem. The lock is there for a reason. If someone
else owns the lock then any steered access by i915 is potentially going to
be routed to the wrong register as the other entity re-directs the steering
behind our back. That is why there is an error message being printed.
Because things are quite possibly going to fail in some unknown manner.

Yes, agree. This has been already discussed here[*] where I sent
such RFC for the sole purpose of receiving some opinions and
check how CI would behave.

BTW, we are already doing this during the GT resume[**]... it's a
bit of a different context, but it still forces the release of
the lock.

This patch, anyway, is not doing this.

Thanks a lot,
Andi

[*] https://patchwork.freedesktop.org/series/124402/
[**] 37280ef5c1c4 ("drm/i915: Clean steer semaphore on resume")




Re: [Intel-gfx] [RFC PATCH] drm/i915/gt: Do not treat MCR locking timeouts as errors

2023-10-04 Thread John Harrison

On 10/4/2023 07:08, Andi Shyti wrote:

Hi Tvrtko,


The MCR steering semaphore is a shared lock entry between i915
and various firmware components.

Getting the lock might synchronize on some shared resources.
Sometimes though, it might happen that the firmware forgets to
unlock causing unnecessary noise in the driver which keeps doing
what was supposed to do, ignoring the problem.

Do not consider this failure as an error, but just print a debug
message stating that the MCR locking has been skipped.

On the driver side we still have spinlocks that make sure that
the access to the resources is serialized.

Signed-off-by: Andi Shyti 
Cc: Jonathan Cavitt 
Cc: Matt Roper 
Cc: Nirmoy Das 
---
   drivers/gpu/drm/i915/gt/intel_gt_mcr.c | 6 ++
   1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c 
b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
index 326c2ed1d99b..51eb693df39b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
@@ -395,10 +395,8 @@ void intel_gt_mcr_lock(struct intel_gt *gt, unsigned long 
*flags)
 * would indicate some hardware/firmware is misbehaving and not
 * releasing it properly.
 */
-   if (err == -ETIMEDOUT) {
-   gt_err_ratelimited(gt, "hardware MCR steering semaphore timed 
out");
-   add_taint_for_CI(gt->i915, TAINT_WARN);  /* CI is now 
unreliable */
-   }
+   if (err == -ETIMEDOUT)
+   gt_dbg(gt, "hardware MCR steering semaphore timed out");
   }
   /**

Are we sure this does not warrant a level higher than dbg, such as
notice/warn?

We might make it a warn, but this doesn't change much the economy
of the driver as we will keep doing what we were supposed to do.


Because how can we be sure the two entities will not stomp on
each other toes if we failed to obtain lock?

So far, in all the research I've done, no one looks like using
MCR lock, but yet someone is stuck in it.
If someone has the lock then that someone thinks they are using it. Just 
because you can't see what some piece of IFWI is doing doesn't mean 
it isn't doing it. And if it is a genuinely missing unlock then it needs 
to be tracked down and fixed with an IFWI update otherwise the system is 
going to be unstable from that point on.





(How can we be sure about
"forgot to unlock" vs "in prolonged active use"?

There is a patch from Jonathan that is testing a different
timeout.


Or if we can be sure, can
we force unlock and take the lock for the driver explicitly?)

I sent a patch with this solution and Matt turned it down.
Presumably because both forcing a lock and ignoring a failed lock are 
Bad Things to be doing!


Just because some entity out of our control isn't playing friendly 
doesn't mean we can ignore the problem. The lock is there for a reason. 
If someone else owns the lock then any steered access by i915 is 
potentially going to be routed to the wrong register as the other entity 
re-directs the steering behind our back. That is why there is an error 
message being printed. Because things are quite possibly going to fail 
in some unknown manner.


John.



Andi




Re: [Intel-gfx] [PATCH v5 1/4] drm/i915: Add GuC TLB Invalidation pci tags

2023-10-04 Thread John Harrison

On 10/4/2023 12:03, Andi Shyti wrote:

Hi Jonathan,

On Wed, Oct 04, 2023 at 11:36:22AM -0700, Jonathan Cavitt wrote:

Add pci (device info) tags for if GuC TLB Invalidation is enabled.
Since GuC based TLB invalidation is only strictly necessary for MTL
presently, only enable GuC based TLB invalidations for MTL.

Signed-off-by: Jonathan Cavitt 

Jani was mentioning that the pci tags is not a proper title.

No need to resend, I think I will merge this series, so that, if
you agree, I can change /pci tags/pci flag/ before pushing.
Have all the review comments been addressed? Surely it can't be pushed 
until it has at least an ack from everyone who has expressed concerns 
about the changes?


John.



In any case.

Reviewed-by: Andi Shyti 

Andi


---
  drivers/gpu/drm/i915/i915_drv.h  | 1 +
  drivers/gpu/drm/i915/i915_pci.c  | 1 +
  drivers/gpu/drm/i915/intel_device_info.h | 3 ++-
  3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2b7a6db4d0d44..1e25cc1e3dba1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -807,4 +807,5 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
  #define HAS_LMEMBAR_SMEM_STOLEN(i915) (!HAS_LMEM(i915) && \
   GRAPHICS_VER_FULL(i915) >= IP_VER(12, 
70))
  
+#define HAS_GUC_TLB_INVALIDATION(i915)	(INTEL_INFO(i915)->has_guc_tlb_invalidation)

  #endif
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index df7c261410f79..c3a5d5efb45d1 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -837,6 +837,7 @@ static const struct intel_device_info mtl_info = {
.memory_regions = REGION_SMEM | REGION_STOLEN_LMEM,
.platform_engine_mask = BIT(RCS0) | BIT(BCS0) | BIT(CCS0),
.require_force_probe = 1,
+   .has_guc_tlb_invalidation = 1,
MTL_CACHELEVEL,
  };
  
diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h

index 39817490b13fd..ad54db0a22470 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -173,7 +173,8 @@ enum intel_ppgtt_type {
func(has_coherent_ggtt); \
func(tuning_thread_rr_after_dep); \
func(unfenced_needs_alignment); \
-   func(hws_needs_physical);
+   func(hws_needs_physical); \
+   func(has_guc_tlb_invalidation);
  
  struct intel_ip_version {

u8 ver;
--
2.25.1




Re: [Intel-gfx] [PATCH v5 1/4] drm/i915: Add GuC TLB Invalidation pci tags

2023-10-04 Thread John Harrison

Why is there no cover letter for this patch series?

It is at v5 but there is no history of what has changed from one version 
to the next. That makes it much harder to review.


John.


On 10/4/2023 11:36, Jonathan Cavitt wrote:

Add pci (device info) tags for if GuC TLB Invalidation is enabled.
Since GuC based TLB invalidation is only strictly necessary for MTL
presently, only enable GuC based TLB invalidations for MTL.

Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/i915_drv.h  | 1 +
  drivers/gpu/drm/i915/i915_pci.c  | 1 +
  drivers/gpu/drm/i915/intel_device_info.h | 3 ++-
  3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2b7a6db4d0d44..1e25cc1e3dba1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -807,4 +807,5 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
  #define HAS_LMEMBAR_SMEM_STOLEN(i915) (!HAS_LMEM(i915) && \
   GRAPHICS_VER_FULL(i915) >= IP_VER(12, 
70))
  
+#define HAS_GUC_TLB_INVALIDATION(i915)	(INTEL_INFO(i915)->has_guc_tlb_invalidation)

  #endif
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index df7c261410f79..c3a5d5efb45d1 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -837,6 +837,7 @@ static const struct intel_device_info mtl_info = {
.memory_regions = REGION_SMEM | REGION_STOLEN_LMEM,
.platform_engine_mask = BIT(RCS0) | BIT(BCS0) | BIT(CCS0),
.require_force_probe = 1,
+   .has_guc_tlb_invalidation = 1,
MTL_CACHELEVEL,
  };
  
diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h

index 39817490b13fd..ad54db0a22470 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -173,7 +173,8 @@ enum intel_ppgtt_type {
func(has_coherent_ggtt); \
func(tuning_thread_rr_after_dep); \
func(unfenced_needs_alignment); \
-   func(hws_needs_physical);
+   func(hws_needs_physical); \
+   func(has_guc_tlb_invalidation);
  
  struct intel_ip_version {

u8 ver;




Re: [Intel-gfx] [PATCH v3 1/4] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-03 Thread John Harrison

On 10/3/2023 09:41, Andi Shyti wrote:

[...]


-   mmio_invalidate_full(gt);
+   if (INTEL_GUC_SUPPORTS_TLB_INVALIDATION(guc)) {
+   if (intel_guc_is_ready(guc))
+   intel_guc_invalidate_tlb_full(guc);
+   } else {
+   /*
+* Fall back to old path if GuC is disabled.
+* This is safe because GuC is not enabled and not 
writing to MMIO.
+*/

Is it safe for intel_guc_is_ready() transitioning from false to true during GuC 
init? No way for some path to start issuing invalidations as that is happening?


+   mmio_invalidate_full(gt);
+   }

supernitpick: as we are at this, brackets are not required.
Braces are required on the first half of the 'if' because it is a double 
if and the else applies to the top level not the inner level. And my 
understanding of the style guide is that lop-sided bracing is incorrect. 
i.e. never have "} else". Plus while it might be syntactically valid to 
not have braces around the five line else clause because it is only one 
actual code statement, it massively helps readability of the code to 
have the braces present.


John.



Andi




Re: [Intel-gfx] [PATCH v3 1/4] drm/i915: Define and use GuC and CTB TLB invalidation routines

2023-10-03 Thread John Harrison

On 10/3/2023 03:28, Tvrtko Ursulin wrote:

On 02/10/2023 18:24, Jonathan Cavitt wrote:

From: Prathap Kumar Valsan 

The GuC firmware had defined the interface for Translation Look-Aside
Buffer (TLB) invalidation.  We should use this interface when
invalidating the engine and GuC TLBs.
Add additional functionality to intel_gt_invalidate_tlb, invalidating
the GuC TLBs and falling back to GT invalidation when the GuC is
disabled.
The invalidation is done by sending a request directly to the GuC
tlb_lookup that invalidates the table.  The invalidation is submitted as
a wait request and is performed in the CT event handler.  This means we
cannot perform this TLB invalidation path if the CT is not enabled.
If the request isn't fulfilled in two seconds, this would constitute
an error in the invalidation as that would constitute either a lost
request or a severe GuC overload.

With this new invalidation routine, we can perform GuC-based GGTT
invalidations.  We should only do this when GuC is enabled and fall
back to the original path when GuC is disabled to prevent concurrent
issuance between GuC and KMD.

Signed-off-by: Prathap Kumar Valsan 
Signed-off-by: Bruce Chang 
Signed-off-by: Chris Wilson 
Signed-off-by: Umesh Nerlige Ramappa 
Signed-off-by: Jonathan Cavitt 
Signed-off-by: Aravind Iddamsetty 
Signed-off-by: Fei Yang 
CC: Andi Shyti 
---
  drivers/gpu/drm/i915/gt/intel_ggtt.c  |  43 ++--
  drivers/gpu/drm/i915/gt/intel_tlb.c   |  14 +-
  .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  33 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h    |  22 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |   9 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   5 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 212 +-
  7 files changed, 322 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c

index 4d7d88b92632b..db5644b0146ca 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -206,22 +206,38 @@ static void gen8_ggtt_invalidate(struct 
i915_ggtt *ggtt)
  intel_uncore_write_fw(uncore, GFX_FLSH_CNTL_GEN6, 
GFX_FLSH_CNTL_EN);

  }
  -static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
+static void guc_ggtt_ct_invalidate(struct intel_gt *gt)
  {
-    struct drm_i915_private *i915 = ggtt->vm.i915;
+    struct intel_uncore *uncore = gt->uncore;
+    intel_wakeref_t wakeref;
  -    gen8_ggtt_invalidate(ggtt);
+    with_intel_runtime_pm_if_active(uncore->rpm, wakeref) {
+    struct intel_guc *guc = >uc.guc;
  -    if (GRAPHICS_VER(i915) >= 12) {
-    struct intel_gt *gt;
+    intel_guc_invalidate_tlb(guc);
+    }
+}
  -    list_for_each_entry(gt, >gt_list, ggtt_link)
-    intel_uncore_write_fw(gt->uncore,
-  GEN12_GUC_TLB_INV_CR,
-  GEN12_GUC_TLB_INV_CR_INVALIDATE);
-    } else {
-    intel_uncore_write_fw(ggtt->vm.gt->uncore,
-  GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+static void guc_ggtt_invalidate(struct i915_ggtt *ggtt)
+{
+    struct drm_i915_private *i915 = ggtt->vm.i915;
+    struct intel_gt *gt;
+
+    if (!IS_GEN9_LP(i915) && GRAPHICS_VER(i915) < 11)
+    gen8_ggtt_invalidate(ggtt);
+
+    list_for_each_entry(gt, >gt_list, ggtt_link) {
+    if (INTEL_GUC_SUPPORTS_TLB_INVALIDATION(>uc.guc) &&
+    intel_guc_is_ready(>uc.guc)) {


The condition here expands to a relatively heavy one:

+#define INTEL_GUC_SUPPORTS_TLB_INVALIDATION(guc) \
+    ((intel_guc_ct_enabled(&(guc)->ct)) && \
+ (intel_guc_submission_is_used(guc)) && \
+ (GRAPHICS_VER(guc_to_gt((guc))->i915) >= 12))


&&

static inline bool intel_guc_is_ready(struct intel_guc *guc)
{
return intel_guc_is_fw_running(guc) && 
intel_guc_ct_enabled(>ct);

}

intel_guc_ct_enabled is even duplicated.

Is there scope to consolidate the parts which are platform invariant, 
or even runtime invariant, or at least guaranteed not to transition 
back and forth but one way only?


In other words, if we know during init we will want it, mark it as a 
flag in intel_guc or somewhere, and then at runtime do only those 
conditions which can transition back and forth due driver flows.


I am not saying this is performance sensitive, but in terms of 
elegance, readability and self-documentation the proposed version 
looks a bit sub-optimal to me.



+    guc_ggtt_ct_invalidate(gt);
+    } else if (GRAPHICS_VER(i915) >= 12) {
+    intel_uncore_write(gt->uncore,
+   GEN12_GUC_TLB_INV_CR,
+   GEN12_GUC_TLB_INV_CR_INVALIDATE);
+    } else {
+    intel_uncore_write(gt->uncore,
+   GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+    }
  }
  }
  @@ -1243,7 +1259,8 @@ static int gen8_gmch_probe(struct i915_ggtt 
*ggtt)

  ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
  }
  -    if (intel_uc_wants_guc(>vm.gt->uc))

Re: [Intel-gfx] [PATCH v3 3/4] drm/i915: Perform TLB invalidation on all GTs during suspend/resume

2023-10-03 Thread John Harrison

On 10/3/2023 08:59, Andi Shyti wrote:

Hi Jani,


Consider multi-gt support when cancelling all tlb invalidations on
suspend, and when submitting tlb invalidations on resume.

Suggested-by: Tvrtko Ursulin 
Signed-off-by: Fei Yang 
Signed-off-by: Jonathan Cavitt 
CC: John Harrison 

I guess I'm wondering why the top level suspend hook needs to iterate
gts instead of some lower level thing. We should aim to reduce
gem/gt/display details from the top level.

I'm not sure I am understanding the question.

The TLB invalidation details are kept under the GT. But when
suspend is called, then the GT invalidation has to be triggered
by the top levels for each GT. Right?

I think Jani's point is that the top level should be:
i915_drm_suspend(...) {
   ...
   intel_tlb_suspend(dev_priv);
}

Then the TLB suspend helper function calls into the GT / UC layers as 
appropriate. But none of that internal only detail is exposed at the top 
level.


John.



Thanks,
Andi




Re: [Intel-gfx] [PATCH v2] drm/i915/huc: silence injected failure in the load via GSC path

2023-09-07 Thread John Harrison

On 8/16/2023 16:13, Daniele Ceraolo Spurio wrote:

If we can't load the HuC due to an injected failure, we don't want
to throw an error and trip CI. Using the gt_probe_error macro for
logging ensures that the error is only printed if it wasn't explicitly
injected.

v2: keep the line to less than 100 characters (checkpatch).

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/7061
Signed-off-by: Daniele Ceraolo Spurio 
Reviewed-by: Andi Shyti  #v1

Reviewed-by: John Harrison 

Although aren't we supposed to be using %pe / ERR_PTR(ret) these days? 
Not a blocker but for future reference.


John.


---
  drivers/gpu/drm/i915/pxp/intel_pxp_tee.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/pxp/intel_pxp_tee.c 
b/drivers/gpu/drm/i915/pxp/intel_pxp_tee.c
index f89a1f80f50e..bb58fa9579b8 100644
--- a/drivers/gpu/drm/i915/pxp/intel_pxp_tee.c
+++ b/drivers/gpu/drm/i915/pxp/intel_pxp_tee.c
@@ -9,6 +9,7 @@
  #include 
  
  #include "gem/i915_gem_lmem.h"

+#include "gt/intel_gt_print.h"
  
  #include "i915_drv.h"

  #include "gt/intel_gt.h"
@@ -156,7 +157,8 @@ static int i915_pxp_tee_component_bind(struct device 
*i915_kdev,
  {
struct drm_i915_private *i915 = kdev_to_i915(i915_kdev);
struct intel_pxp *pxp = i915->pxp;
-   struct intel_uc *uc = >ctrl_gt->uc;
+   struct intel_gt *gt = pxp->ctrl_gt;
+   struct intel_uc *uc = >uc;
intel_wakeref_t wakeref;
int ret = 0;
  
@@ -176,7 +178,7 @@ static int i915_pxp_tee_component_bind(struct device *i915_kdev,

/* load huc via pxp */
ret = intel_huc_fw_load_and_auth_via_gsc(&uc->huc);
if (ret < 0)
-   drm_err(&i915->drm, "failed to load huc via gsc 
%d\n", ret);
+   gt_probe_error(gt, "failed to load huc via gsc 
%d\n", ret);
}
}
  




Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-09-06 Thread John Harrison

On 9/6/2023 02:17, Andi Shyti wrote:

Hi John,


 static void guc_cancel_busyness_worker(struct intel_guc *guc)
 {
-   cancel_delayed_work_sync(&guc->timestamp.work);
+   /*
+* When intel_gt_reset was called, task will hold a lock.
+* To cancel delayed work here, the _sync version will also acquire a 
lock, which might
+* trigger the possible circular locking dependency warning.
+* Check the reset_in_progress flag, call async version if reset is in 
progress.
+*/

This needs to explain in much more detail what is going on and why it is not
a problem. E.g.:

  The busyness worker needs to be cancelled. In general that means
  using the synchronous cancel version to ensure that an in-progress
  worker will not keep executing beyond whatever is happening that
  needs the cancel. E.g. suspend, driver unload, etc. However, in the
  case of a reset, the synchronous version is not required and can
  trigger a false deadlock detection warning.

  The busyness worker takes the reset mutex to protect against resets
  interfering with it. However, it does a trylock and bails out if the
  reset lock is already acquired. Thus there is no actual deadlock or
  other concern with the worker running concurrently with a reset. So
  an asynchronous cancel is safe in the case of a reset rather than a
  driver unload or suspend type operation. On the other hand, if the
  cancel_sync version is used when a reset is in progress then the
  mutex deadlock detection sees the mutex being acquired through
  multiple paths and complains.

  So just don't bother. That keeps the detection code happy and is
  safe because of the trylock code described above.

So why do we even need to cancel anything if it doesn't do anything while
the reset is in progress?

It still needs to be cancelled. The worker only aborts if it is actively
executing concurrently with the reset. It might not start to execute until
after the reset has completed. And there is presumably a reason why the
cancel is being called, a reason not necessarily related to resets at all.
Leaving the worker to run arbitrarily after the driver is expecting it to be
stopped will lead to much worse things than a fake lockdep splat, e.g. a use
after free pointer deref.

I was actually thinking why not leave things as they are and just
disable lockdep from CI. This doesn't look like a relevant report
to me.

Andi

Disable lockdep? The whole of lockdep? We absolutely do not want to disable
an extremely important deadlock testing infrastructure in our test
framework. That would be defeating the whole point of CI.

Potentially we could annotate this one particular scenario to suppress this
one particular error.  But it seems simpler and safer to just update the
code to not hit that scenario in the first place.

yes... lockdep is a debug tool and might provide false reports...
We need to have a great willingness to start fixing and hunting
debug lockdep's false positives (like this one, for instance).
That is how lockdep works. It's like a compiler warning. You have to fix 
them even if you think they don't matter. Because otherwise, when 
someone tries to turn warnings on, they drown in a sea of other people's 
unrelated garbage that they did not bother to fix. If lockdep is to be 
of any use at all then it must be run regularly as part of a CI type 
system and any issues it finds must be fixed up by the developers that 
own the relevant code. Where fixing means either fixing genuine bugs, 
re-working the code to not hit a false positive or annotating the code 
to explain to lockdep why it is a safe operation.




It's even more annoying to reduce our CI pass rates, especially
when in BAT tests, with such false deadlocks.
Maybe. But it is even more annoying when you have a genuine locking 
issue that you don't notice because you have disabled lockdep and just 
have some random hang issue that is impossible to reproduce or debug.




It's the developer's responsibility to test its code with
debug_lockdep and fix all the potential deadlocks and ignore the
false ones.
You seem to have this backwards. Developers are not expected to run 
every possible test on every possible platform in every possible 
configuration. That is the job of CI.


John.


I sent a patch for this[*] already.

Andi

[*] https://gitlab.freedesktop.org/gfx-ci/i915-infra/-/merge_requests/128




Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-09-06 Thread John Harrison

On 9/5/2023 23:50, Daniel Vetter wrote:

On Mon, Aug 28, 2023 at 04:01:38PM -0700, John Harrison wrote:

On 8/23/2023 10:37, John Harrison wrote:

On 8/23/2023 09:00, Daniel Vetter wrote:

On Tue, Aug 22, 2023 at 11:53:24AM -0700, John Harrison wrote:

On 8/11/2023 11:20, Zhanjun Dong wrote:

This attempts to avoid circular locking dependency between
flush delayed
work and intel_gt_reset.
When intel_gt_reset was called, task will hold a lock.
To cancel delayed work here, the _sync version will also
acquire a lock,
which might trigger the possible circular locking dependency warning.
When intel_gt_reset called, reset_in_progress flag will be
set, add code
to check the flag, call async version if reset is in progress.

Signed-off-by: Zhanjun Dong
Cc: John Harrison
Cc: Andi Shyti
Cc: Daniel Vetter
---
    drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 ++-
    1 file changed, 10 insertions(+), 1 deletion(-)

diff --git
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..600388c849f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,16 @@ static void
guc_enable_busyness_worker(struct intel_guc *guc)
    static void guc_cancel_busyness_worker(struct intel_guc *guc)
    {
-    cancel_delayed_work_sync(&guc->timestamp.work);
+    /*
+ * When intel_gt_reset was called, task will hold a lock.
+ * To cancel delayed work here, the _sync version will
also acquire a lock, which might
+ * trigger the possible circular locking dependency warning.
+ * Check the reset_in_progress flag, call async version
if reset is in progress.
+ */

This needs to explain in much more detail what is going on and
why it is not
a problem. E.g.:

     The busyness worker needs to be cancelled. In general that means
     using the synchronous cancel version to ensure that an in-progress
     worker will not keep executing beyond whatever is happening that
     needs the cancel. E.g. suspend, driver unload, etc. However, in the
     case of a reset, the synchronous version is not required and can
     trigger a false deadlock detection warning.

     The busyness worker takes the reset mutex to protect against resets
     interfering with it. However, it does a trylock and bails
out if the
     reset lock is already acquired. Thus there is no actual deadlock or
     other concern with the worker running concurrently with a reset. So
     an asynchronous cancel is safe in the case of a reset rather than a
     driver unload or suspend type operation. On the other hand, if the
     cancel_sync version is used when a reset is in progress then the
     mutex deadlock detection sees the mutex being acquired through
     multiple paths and complains.

     So just don't bother. That keeps the detection code happy and is
     safe because of the trylock code described above.

So why do we even need to cancel anything if it doesn't do anything
while
the reset is in progress?

It still needs to be cancelled. The worker only aborts if it is actively
executing concurrently with the reset. It might not start to execute
until after the reset has completed. And there is presumably a reason
why the cancel is being called, a reason not necessarily related to
resets at all. Leaving the worker to run arbitrarily after the driver is
expecting it to be stopped will lead to much worse things than a fake
lockdep splat, e.g. a use after free pointer deref.

John.

@Daniel Vetter - ping? Is this explanation sufficient? Are you okay with
this change now?

Sorry for the late reply, I'm constantly behind on mails :-/ Ping me on
irc next time around if I don't reply, that's quicker.

"presumably" isn't good enough for locking design. Either you know, and
can prove it all, or you shouldn't touch the code and its locking design
before you've figured this out.

Again, either this is a deadlock, race condition, or the cancel isn't
necessary. And this argument works in full generality. All this patch does
is replace the deadlock with one of the other two, and that's not good
enough if you don't even know which one it is.

- if you need the cancel, you have a race condition

- if you don't have a race condition, you don't need the cancel
In the case of a reset in progress, we do not strictly need the cancel. 
The worker thread will take care of avoiding a deadlock by itself. But 
it is more efficient to do the cancel and avoid unnecessary code 
execution if possible. It is also more logically correct - the worker is 
being stopped, therefore we should cancel any pending execution of the 
worker.


In the case of a reset not being in progress, we absolutely do need the 
cancel as there are multiple race conditions.




- currently you have the deadlock

No, we do not. There is no deadlock.

The worker thread explicitly does a trylock and reschedules itself for 
later if it could not

Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-08-31 Thread John Harrison

On 8/31/2023 07:00, Andi Shyti wrote:

Hi,


diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..600388c849f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,16 @@ static void guc_enable_busyness_worker(struct intel_guc 
*guc)
static void guc_cancel_busyness_worker(struct intel_guc *guc)
{
-   cancel_delayed_work_sync(&guc->timestamp.work);
+   /*
+* When intel_gt_reset was called, task will hold a lock.
+* To cancel delayed work here, the _sync version will also acquire a 
lock, which might
+* trigger the possible circular locking dependency warning.
+* Check the reset_in_progress flag, call async version if reset is in 
progress.
+*/

This needs to explain in much more detail what is going on and why it is not
a problem. E.g.:

 The busyness worker needs to be cancelled. In general that means
 using the synchronous cancel version to ensure that an in-progress
 worker will not keep executing beyond whatever is happening that
 needs the cancel. E.g. suspend, driver unload, etc. However, in the
 case of a reset, the synchronous version is not required and can
 trigger a false deadlock detection warning.

 The busyness worker takes the reset mutex to protect against resets
 interfering with it. However, it does a trylock and bails out if the
 reset lock is already acquired. Thus there is no actual deadlock or
 other concern with the worker running concurrently with a reset. So
 an asynchronous cancel is safe in the case of a reset rather than a
 driver unload or suspend type operation. On the other hand, if the
 cancel_sync version is used when a reset is in progress then the
 mutex deadlock detection sees the mutex being acquired through
 multiple paths and complains.

 So just don't bother. That keeps the detection code happy and is
 safe because of the trylock code described above.

So why do we even need to cancel anything if it doesn't do anything while
the reset is in progress?

It still needs to be cancelled. The worker only aborts if it is actively
executing concurrently with the reset. It might not start to execute until
after the reset has completed. And there is presumably a reason why the
cancel is being called, a reason not necessarily related to resets at all.
Leaving the worker to run arbitrarily after the driver is expecting it to be
stopped will lead to much worse things than a fake lockdep splat, e.g. a use
after free pointer deref.

I was actually thinking why not leave things as they are and just
disable lockdep from CI. This doesn't look like a relevant report
to me.

Andi
Disable lockdep? The whole of lockdep? We absolutely do not want to 
disable an extremely important deadlock testing infrastructure in our 
test framework. That would be defeating the whole point of CI.


Potentially we could annotate this one particular scenario to suppress 
this one particular error.  But it seems simpler and safer to just 
update the code to not hit that scenario in the first place.


John.



Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-08-28 Thread John Harrison

On 8/23/2023 10:37, John Harrison wrote:

On 8/23/2023 09:00, Daniel Vetter wrote:

On Tue, Aug 22, 2023 at 11:53:24AM -0700, John Harrison wrote:

On 8/11/2023 11:20, Zhanjun Dong wrote:
This attempts to avoid circular locking dependency between flush 
delayed

work and intel_gt_reset.
When intel_gt_reset was called, task will hold a lock.
To cancel delayed work here, the _sync version will also acquire a 
lock,

which might trigger the possible circular locking dependency warning.
When intel_gt_reset called, reset_in_progress flag will be set, add 
code

to check the flag, call async version if reset is in progress.

Signed-off-by: Zhanjun Dong
Cc: John Harrison
Cc: Andi Shyti
Cc: Daniel Vetter
---
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 ++-
   1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index a0e3ef1c65d2..600388c849f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,16 @@ static void 
guc_enable_busyness_worker(struct intel_guc *guc)

   static void guc_cancel_busyness_worker(struct intel_guc *guc)
   {
-    cancel_delayed_work_sync(&guc->timestamp.work);
+    /*
+ * When intel_gt_reset was called, task will hold a lock.
+ * To cancel delayed work here, the _sync version will also 
acquire a lock, which might

+ * trigger the possible circular locking dependency warning.
+ * Check the reset_in_progress flag, call async version if 
reset is in progress.

+ */
This needs to explain in much more detail what is going on and why 
it is not

a problem. E.g.:

    The busyness worker needs to be cancelled. In general that means
    using the synchronous cancel version to ensure that an in-progress
    worker will not keep executing beyond whatever is happening that
    needs the cancel. E.g. suspend, driver unload, etc. However, in the
    case of a reset, the synchronous version is not required and can
    trigger a false deadlock detection warning.

    The busyness worker takes the reset mutex to protect against resets
    interfering with it. However, it does a trylock and bails out if 
the

    reset lock is already acquired. Thus there is no actual deadlock or
    other concern with the worker running concurrently with a reset. So
    an asynchronous cancel is safe in the case of a reset rather than a
    driver unload or suspend type operation. On the other hand, if the
    cancel_sync version is used when a reset is in progress then the
    mutex deadlock detection sees the mutex being acquired through
    multiple paths and complains.

    So just don't bother. That keeps the detection code happy and is
    safe because of the trylock code described above.
So why do we even need to cancel anything if it doesn't do anything 
while

the reset is in progress?
It still needs to be cancelled. The worker only aborts if it is 
actively executing concurrently with the reset. It might not start to 
execute until after the reset has completed. And there is presumably a 
reason why the cancel is being called, a reason not necessarily 
related to resets at all. Leaving the worker to run arbitrarily after 
the driver is expecting it to be stopped will lead to much worse 
things than a fake lockdep splat, e.g. a use after free pointer deref.


John.
@Daniel Vetter - ping? Is this explanation sufficient? Are you okay with 
this change now?


John.





Just remove the cancel from the reset path as unneeded instead, and 
explain

why that's ok? Because that's de facto what the cancel_work with a
potential deadlock scenario for cancel_work_sync does, you either don't
need it at all, or the replacement creates a bug.
-Daniel



John.



+    if (guc_to_gt(guc)->uc.reset_in_progress)
+    cancel_delayed_work(&guc->timestamp.work);
+    else
+ cancel_delayed_work_sync(&guc->timestamp.work);
   }
   static void __reset_guc_busyness_stats(struct intel_guc *guc)






Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-08-23 Thread John Harrison

On 8/23/2023 09:00, Daniel Vetter wrote:

On Tue, Aug 22, 2023 at 11:53:24AM -0700, John Harrison wrote:

On 8/11/2023 11:20, Zhanjun Dong wrote:

This attempts to avoid circular locking dependency between flush delayed
work and intel_gt_reset.
When intel_gt_reset was called, task will hold a lock.
To cancel delayed work here, the _sync version will also acquire a lock,
which might trigger the possible circular locking dependency warning.
When intel_gt_reset called, reset_in_progress flag will be set, add code
to check the flag, call async version if reset is in progress.

Signed-off-by: Zhanjun Dong
Cc: John Harrison
Cc: Andi Shyti
Cc: Daniel Vetter
---
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 ++-
   1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..600388c849f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,16 @@ static void guc_enable_busyness_worker(struct intel_guc 
*guc)
   static void guc_cancel_busyness_worker(struct intel_guc *guc)
   {
-   cancel_delayed_work_sync(&guc->timestamp.work);
+   /*
+* When intel_gt_reset was called, task will hold a lock.
+* To cancel delayed work here, the _sync version will also acquire a 
lock, which might
+* trigger the possible circular locking dependency warning.
+* Check the reset_in_progress flag, call async version if reset is in 
progress.
+*/

This needs to explain in much more detail what is going on and why it is not
a problem. E.g.:

The busyness worker needs to be cancelled. In general that means
using the synchronous cancel version to ensure that an in-progress
worker will not keep executing beyond whatever is happening that
needs the cancel. E.g. suspend, driver unload, etc. However, in the
case of a reset, the synchronous version is not required and can
trigger a false deadlock detection warning.

The busyness worker takes the reset mutex to protect against resets
interfering with it. However, it does a trylock and bails out if the
reset lock is already acquired. Thus there is no actual deadlock or
other concern with the worker running concurrently with a reset. So
an asynchronous cancel is safe in the case of a reset rather than a
driver unload or suspend type operation. On the other hand, if the
cancel_sync version is used when a reset is in progress then the
mutex deadlock detection sees the mutex being acquired through
multiple paths and complains.

So just don't bother. That keeps the detection code happy and is
safe because of the trylock code described above.

So why do we even need to cancel anything if it doesn't do anything while
the reset is in progress?
It still needs to be cancelled. The worker only aborts if it is actively 
executing concurrently with the reset. It might not start to execute 
until after the reset has completed. And there is presumably a reason 
why the cancel is being called, a reason not necessarily related to 
resets at all. Leaving the worker to run arbitrarily after the driver is 
expecting it to be stopped will lead to much worse things than a fake 
lockdep splat, e.g. a use after free pointer deref.


John.



Just remove the cancel from the reset path as unneeded instead, and explain
why that's ok? Because that's de facto what the cancel_work with a
potential deadlock scenario for cancel_work_sync does, you either don't
need it at all, or the replacement creates a bug.
-Daniel



John.



+   if (guc_to_gt(guc)->uc.reset_in_progress)
+   cancel_delayed_work(&guc->timestamp.work);
+   else
+   cancel_delayed_work_sync(&guc->timestamp.work);
   }
   static void __reset_guc_busyness_stats(struct intel_guc *guc)




Re: [Intel-gfx] [PATCH v5] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-08-22 Thread John Harrison

On 8/11/2023 11:20, Zhanjun Dong wrote:

This attempts to avoid circular locking dependency between flush delayed
work and intel_gt_reset.
When intel_gt_reset was called, task will hold a lock.
To cancel delayed work here, the _sync version will also acquire a lock,
which might trigger the possible circular locking dependency warning.
When intel_gt_reset called, reset_in_progress flag will be set, add code
to check the flag, call async version if reset is in progress.

Signed-off-by: Zhanjun Dong
Cc: John Harrison
Cc: Andi Shyti
Cc: Daniel Vetter
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..600388c849f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,16 @@ static void guc_enable_busyness_worker(struct intel_guc 
*guc)
  
  static void guc_cancel_busyness_worker(struct intel_guc *guc)

  {
-   cancel_delayed_work_sync(&guc->timestamp.work);
+   /*
+* When intel_gt_reset was called, task will hold a lock.
+* To cancel delayed work here, the _sync version will also acquire a 
lock, which might
+* trigger the possible circular locking dependency warning.
+* Check the reset_in_progress flag, call async version if reset is in 
progress.
+*/
This needs to explain in much more detail what is going on and why it is 
not a problem. E.g.:


   The busyness worker needs to be cancelled. In general that means
   using the synchronous cancel version to ensure that an in-progress
   worker will not keep executing beyond whatever is happening that
   needs the cancel. E.g. suspend, driver unload, etc. However, in the
   case of a reset, the synchronous version is not required and can
   trigger a false deadlock detection warning.

   The busyness worker takes the reset mutex to protect against resets
   interfering with it. However, it does a trylock and bails out if the
   reset lock is already acquired. Thus there is no actual deadlock or
   other concern with the worker running concurrently with a reset. So
   an asynchronous cancel is safe in the case of a reset rather than a
   driver unload or suspend type operation. On the other hand, if the
   cancel_sync version is used when a reset is in progress then the
   mutex deadlock detection sees the mutex being acquired through
   multiple paths and complains.

   So just don't bother. That keeps the detection code happy and is
   safe because of the trylock code described above.


John.



+   if (guc_to_gt(guc)->uc.reset_in_progress)
+   cancel_delayed_work(&guc->timestamp.work);
+   else
+   cancel_delayed_work_sync(&guc->timestamp.work);
  }
  
  static void __reset_guc_busyness_stats(struct intel_guc *guc)


Re: [Intel-gfx] ✗ Fi.CI.IGT: failure for drm/i915/guc: Force a reset on internal GuC error (rev2)

2023-08-22 Thread John Harrison



On 8/15/2023 23:38, Patchwork wrote:

Project List - Patchwork *Patch Details*
*Series:*   drm/i915/guc: Force a reset on internal GuC error (rev2)
*URL:*  https://patchwork.freedesktop.org/series/118890/
*State:*failure
*Details:* 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v2/index.html



  CI Bug Log - changes from CI_DRM_13524_full -> Patchwork_118890v2_full


Summary

*FAILURE*

Serious unknown changes coming with Patchwork_118890v2_full absolutely 
need to be

verified manually.

If you think the reported changes have nothing to do with the changes
introduced in Patchwork_118890v2_full, please notify your bug team to 
allow them
to document this new failure mode, which will reduce false positives 
in CI.



Participating hosts (9 -> 9)

No changes in participating hosts


Possible new issues

Here are the unknown changes that may have been introduced in 
Patchwork_118890v2_full:



  IGT changes


Possible regressions

  * igt@kms_draw_crc@draw-method-blt@xrgb-ytiled:
  o shard-glk: PASS


-> DMESG-WARN



TLB invalidation timeout on a non-GuC platform. Not related to a patch 
about handling internal GuC errors.


John.


 *


Known issues

Here are the changes found in Patchwork_118890v2_full that come from 
known issues:



  IGT changes


Issues hit

 *

igt@drm_fdinfo@busy-hang@bcs0:

  o shard-dg2: NOTRUN -> SKIP


(i915#8414
) +20
similar issues
 *

igt@drm_fdinfo@most-busy-check-all@rcs0:

  o shard-rkl: PASS


-> FAIL


(i915#7742
) +1
similar issue
 *

igt@feature_discovery@chamelium:

  o shard-dg2: NOTRUN -> SKIP


(i915#4854 )
 *

igt@gem_close_race@multigpu-basic-threads:

  o shard-dg2: NOTRUN -> SKIP


(i915#7697 )
 *

igt@gem_ctx_persistence@heartbeat-hang:

  o shard-dg2: NOTRUN -> SKIP


(i915#8555
) +1
similar issue
 *

igt@gem_ctx_persistence@hostile:

  o shard-snb: NOTRUN -> SKIP


(fdo#109271
 /
i915#1099
) +1
similar issue
 *

igt@gem_ctx_persistence@saturated-hostile-nopreempt@ccs0:

  o shard-dg2: NOTRUN -> SKIP


(i915#5882
) +9
similar issues
 *

igt@gem_exec_balancer@bonded-pair:

  o shard-dg2: NOTRUN -> SKIP


(i915#4771 )
 *

igt@gem_exec_capture@pi@vcs0:

  o shard-mtlp: PASS


-> FAIL


(i915#4475 )
 *

igt@gem_exec_endless@dispatch@rcs0:

  o shard-dg2: PASS


-> TIMEOUT


  

Re: [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/guc: Force a reset on internal GuC error (rev2)

2023-08-22 Thread John Harrison

On 8/15/2023 18:35, Patchwork wrote:

Project List - Patchwork *Patch Details*
*Series:*   drm/i915/guc: Force a reset on internal GuC error (rev2)
*URL:*  https://patchwork.freedesktop.org/series/118890/
*State:*success
*Details:* 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v2/index.html



  CI Bug Log - changes from CI_DRM_13524 -> Patchwork_118890v2


Summary

*WARNING*

Minor unknown changes coming with Patchwork_118890v2 need to be verified
manually.

If you think the reported changes have nothing to do with the changes
introduced in Patchwork_118890v2, please notify your bug team to allow 
them
to document this new failure mode, which will reduce false positives 
in CI.


External URL: 
https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118890v2/index.html



Participating hosts (40 -> 39)

Missing (1): fi-snb-2520m


Possible new issues

Here are the unknown changes that may have been introduced in 
Patchwork_118890v2:



  IGT changes


Warnings

  * igt@i915_module_load@load:
  o bat-adlp-11: ABORT


(i915#4423
) ->
DMESG-WARN



Some kind of display PHY failure. Not related to a patch about the 
handling of internal GuC errors.


John.


 *


Known issues

Here are the changes found in Patchwork_118890v2 that come from known 
issues:



  IGT changes


Issues hit

 *

igt@debugfs_test@basic-hwmon:

  o bat-adlp-11: NOTRUN -> SKIP


(i915#7456 )
 *

igt@gem_tiled_pread_basic:

  o bat-adlp-11: NOTRUN -> SKIP


(i915#3282 )
 *

igt@i915_selftest@live@gt_mocs:

  o bat-mtlp-8: PASS


-> DMESG-FAIL


(i915#7059 )
 *

igt@i915_selftest@live@requests:

  o bat-rpls-1: PASS


-> ABORT


(i915#4983
 /
i915#7911
 /
i915#7920 )
 *

igt@i915_selftest@live@slpc:

  o bat-mtlp-8: NOTRUN -> DMESG-WARN


(i915#6367 )
 *

igt@i915_suspend@basic-s3-without-i915:

  o bat-mtlp-8: NOTRUN -> SKIP


(i915#6645 )
 *

igt@kms_chamelium_frames@hdmi-crc-fast:

  o bat-adlp-11: NOTRUN -> SKIP


(i915#7828
) +7
similar issues
 *

igt@kms_chamelium_hpd@common-hpd-after-suspend:

  o bat-mtlp-8: NOTRUN -> SKIP


(i915#7828 )
 *

igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic:

  o bat-adlp-11: NOTRUN -> ABORT


(i915#4423 )


Possible fixes

  * igt@i915_selftest@live@slpc:
  o bat-mtlp-6: DMESG-WARN


(i915#6367
) ->
PASS


Re: [Intel-gfx] [CI] drm/i915/gt: Refactor hangcheck selftest to use igt_spinner

2023-08-22 Thread John Harrison

On 8/19/2023 15:50, Andi Shyti wrote:

From: Jonathan Cavitt 

The hangcheck live selftest contains duplicate declarations of some
functions that already exist in igt_spinner.c, such as the creation and
deconstruction of a spinning batch buffer (spinner) that hangs an engine.
It's undesirable to have such code duplicated, as the requirements for
the spinner may change with hardware updates, necessitating both
execution paths be updated.  To avoid this, have the hangcheck live
selftest use the declaration from igt_spinner.  This eliminates the need
for the declarations in the selftest itself, as well as the associated
local helper structures, so we can erase those.

Suggested-by: Matt Roper 
Signed-off-by: Jonathan Cavitt 
---
  drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 457 ++-
  drivers/gpu/drm/i915/selftests/igt_spinner.c |  15 +-
  drivers/gpu/drm/i915/selftests/igt_spinner.h |   9 +
  3 files changed, 155 insertions(+), 326 deletions(-)

[snip]


-   pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
+   pr_err("[%s] Spinner init failed: %d!\n", engine->name, err);
If this code is being touched, can you also change it to use gt_err 
instead of pr_err? And gt_info instead of pr_info, etc. The pr_err 
functions are the worst of the worst for message prints, they don't even 
tag the output with 'i915' let alone anything useful like which GT it 
was or which card in a multi-card system.


John.



Re: [Intel-gfx] [PATCH] drm/i915/guc: Fix potential null pointer deref in GuC 'steal id' test

2023-08-07 Thread John Harrison

On 8/3/2023 06:28, Andi Shyti wrote:

Hi John,

On Wed, Aug 02, 2023 at 11:49:40AM -0700, john.c.harri...@intel.com wrote:

From: John Harrison 

It was noticed that if the very first 'stealing' request failed to
create for some reason then the 'steal all ids' loop would immediately
exit with 'last' still being NULL. The test would attempt to continue
but using a null pointer. Fix that by aborting the test if it fails to
create any requests at all.

Signed-off-by: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
index 1fd760539f77b..bfb72143566f6 100644
--- a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
@@ -204,9 +204,9 @@ static int intel_guc_steal_guc_ids(void *arg)
if (IS_ERR(rq)) {
ret = PTR_ERR(rq);
rq = NULL;
-   if (ret != -EAGAIN) {
-   guc_err(guc, "Failed to create request %d: 
%pe\n",
-   context_index, ERR_PTR(ret));
+   if ((ret != -EAGAIN) || !last) {

isn't last always NULL here?

Andi
No, only on the first pass around the loop. When a request is 
successfully created, the else clause below assigns last to that new 
request. So if the failure to create only happens on pass 2 or later, 
last will be non-null. Which is the whole point of the code. It keeps 
creating all the contexts/requests that it can until it runs out of 
resources and gets an EAGAIN failure. At which point, last will be 
pointing to the last successful creation and the test continues to the 
next part of actually stealing an id.


But if the EAGAIN failure happens on the first pass then last will be 
null and it is not safe/valid to proceed so it needs to abort. And if 
anything other than EAGAIN is returned then something has gone wrong and 
it doesn't matter what last is set to, it needs to abort regardless.


John.





+   guc_err(guc, "Failed to create %srequest %d: 
%pe\n",
+   last ? "" : "first ", context_index, 
ERR_PTR(ret));
goto err_spin_rq;
}
} else {
--
2.39.1




Re: [Intel-gfx] [PATCH v3] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-07-24 Thread John Harrison

On 6/15/2023 14:15, Zhanjun Dong wrote:

This attempts to avoid a circular locking dependency between flushing the 
delayed work and intel_gt_reset.
Switch from cancel_delayed_work_sync to cancel_delayed_work, the non-sync 
version, on the reset path. This is safe because the worker has trylock code 
to handle the lock. Meanwhile, keep the sync version for park/fini to ensure 
the worker is not still running during suspend or shutdown.

WARNING: possible circular locking dependency detected
6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
--
kms_pipe_crc_ba/6415 is trying to acquire lock:
88813e6cc640 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
__flush_work+0x42/0x530

but task is already holding lock:
88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 
[i915]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&gt->reset.mutex){+.+.}-{3:3}:
 lock_acquire+0xd8/0x2d0
 i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
 intel_gt_init_reset+0x65/0x80 [i915]
 intel_gt_common_init_early+0xe1/0x170 [i915]
 intel_root_gt_init_early+0x48/0x60 [i915]
 i915_driver_probe+0x671/0xcb0 [i915]
 i915_pci_probe+0xdc/0x210 [i915]
 pci_device_probe+0x95/0x120
 really_probe+0x164/0x3c0
 __driver_probe_device+0x73/0x160
 driver_probe_device+0x19/0xa0
 __driver_attach+0xb6/0x180
 bus_for_each_dev+0x77/0xd0
 bus_add_driver+0x114/0x210
 driver_register+0x5b/0x110
 __pfx_vgem_open+0x3/0x10 [vgem]
 do_one_initcall+0x57/0x270
 do_init_module+0x5f/0x220
 load_module+0x1ca4/0x1f00
 __do_sys_finit_module+0xb4/0x130
 do_syscall_64+0x3c/0x90
 entry_SYSCALL_64_after_hwframe+0x72/0xdc

-> #2 (fs_reclaim){+.+.}-{0:0}:
 lock_acquire+0xd8/0x2d0
 fs_reclaim_acquire+0xac/0xe0
 kmem_cache_alloc+0x32/0x260
 i915_vma_instance+0xb2/0xc60 [i915]
 i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
 vm_fault_gtt+0x22d/0xf60 [i915]
 __do_fault+0x2f/0x1d0
 do_pte_missing+0x4a/0xd20
 __handle_mm_fault+0x5b0/0x790
 handle_mm_fault+0xa2/0x230
 do_user_addr_fault+0x3ea/0xa10
 exc_page_fault+0x68/0x1a0
 asm_exc_page_fault+0x26/0x30

-> #1 (&gt->reset.backoff_srcu){}-{0:0}:
 lock_acquire+0xd8/0x2d0
 _intel_gt_reset_lock+0x57/0x330 [i915]
 guc_timestamp_ping+0x35/0x130 [i915]
 process_one_work+0x250/0x510
 worker_thread+0x4f/0x3a0
 kthread+0xff/0x130
 ret_from_fork+0x29/0x50

-> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
 check_prev_add+0x90/0xc60
 __lock_acquire+0x1998/0x2590
 lock_acquire+0xd8/0x2d0
 __flush_work+0x74/0x530
 __cancel_work_timer+0x14f/0x1f0
 intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
 intel_uc_reset_prepare+0x9c/0x120 [i915]
 reset_prepare+0x21/0x60 [i915]
 intel_gt_reset+0x1dd/0x470 [i915]
 intel_gt_reset_global+0xfb/0x170 [i915]
 intel_gt_handle_error+0x368/0x420 [i915]
 intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
 i915_wedged_set+0x29/0x40 [i915]
 simple_attr_write_xsigned.constprop.0+0xb4/0x110
 full_proxy_write+0x52/0x80
 vfs_write+0xc5/0x4f0
 ksys_write+0x64/0xe0
 do_syscall_64+0x3c/0x90
 entry_SYSCALL_64_after_hwframe+0x72/0xdc

other info that might help us debug this:
  Chain exists of:
   (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> 
&gt->reset.mutex
   Possible unsafe locking scenario:
        CPU0                    CPU1
        ----                    ----
   lock(&gt->reset.mutex);
                                lock(fs_reclaim);
                                lock(&gt->reset.mutex);
   lock((work_completion)(&(&guc->timestamp.work)->work));

  *** DEADLOCK ***
  3 locks held by kms_pipe_crc_ba/6415:
   #0: 888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
   #1: 888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: 
simple_attr_write_xsigned.constprop.0+0x47/0x110
   #2: 88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: 
intel_gt_reset+0x19e/0x470 [i915]

v2: Add sync flag to guc_cancel_busyness_worker to ensure reset path calls 
asynchronous cancel.
v3: Add sync flag to intel_guc_submission_disable to ensure reset path calls 
asynchronous cancel.

Signed-off-by: Zhanjun Dong 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c   | 17 ++---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h   |  2 +-
  drivers/gpu/drm/i915/gt/uc/intel_uc.c   |  4 ++--
  3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..ef4300246ce1 

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: update request engine before removing virtual GuC engine

2023-07-24 Thread John Harrison

On 7/19/2023 05:43, Tvrtko Ursulin wrote:

On 19/07/2023 11:41, Andrzej Hajda wrote:

On 18.07.2023 17:48, Tvrtko Ursulin wrote:

On 17/07/2023 19:03, John Harrison wrote:

On 7/13/2023 05:11, Tvrtko Ursulin wrote:

On 13/07/2023 12:09, Andrzej Hajda wrote:

Hi,

On 13.07.2023 09:39, Tvrtko Ursulin wrote:

On 12/07/2023 19:54, John Harrison wrote:

On 7/12/2023 09:27, Andrzej Hajda wrote:

On 12.07.2023 14:35, Tvrtko Ursulin wrote:

On 12/07/2023 13:18, Andrzej Hajda wrote:

On 11.07.2023 17:27, Tvrtko Ursulin wrote:

On 11/07/2023 14:58, Andrzej Hajda wrote:

On 11.07.2023 13:34, Andi Shyti wrote:

Hi Andrzej,

drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 
+++

  1 file changed, 11 insertions(+)

 diff --git 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

 index a0e3ef1c65d246..2c877ea5eda6f0 100644
 --- 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 +++ 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 @@ -3461,6 +3461,8 @@ static void 
guc_prio_fini(struct i915_request *rq, struct 
intel_context *ce)
  static void remove_from_context(struct 
i915_request *rq)

  {
 struct intel_context *ce = 
request_to_scheduling_context(rq);

 +   struct intel_engine_cs *engine;
 +   intel_engine_mask_t tmp;

GEM_BUG_ON(intel_context_is_child(ce));

 @@ -3478,6 +3480,15 @@ static void 
remove_from_context(struct i915_request *rq)


atomic_dec(&ce->guc_id.ref);
i915_request_notify_execute_cb_imm(rq);
 +
 +   /*
 +    * GuC virtual engine can disappear 
after this call, so let's assign
 +    * something valid, as driver expects 
this to be always valid pointer.

 +    */
 + for_each_engine_masked(engine, 
rq->engine->gt, rq->execution_mask, tmp) {

 +   rq->engine = engine;

 yes... here the context might lose the virtual 
engine... I wonder
 whether this is the right solution, though. Maybe 
we should set

 rq->engine = NULL; and check for NULL? Don't know.


Setting NULL causes occasional null page de-reference in

i915_request_wait_timeout:

mutex_release(&rq->engine->gt->reset.mutex.dep_map, 
_THIS_IP_)


rq->engine after removing rq from context is (IMHO) used 
as a set of aliases
for gt and i915 (despite rq itself contains the alias to 
i915).
without investigating further, but maybe that code is not 
even
supposed to be executed, at this point, if the request's 
assigned

virtual engine is removed.


Real tests show it is executed and the function 
i915_request_wait_timeout is quite generic
I guess it is quite typical use-case, the only question is 
about timings - what happens earlier -

finalization of i915_request_wait_timeout or context removal.

The other point rq->engine is accessed after context 
removal is i915_fence_release -
there is long comment there regarding virtual context and 
reuse retired rq.
Anyway calling there "intel_engine_is_virtual(rq->engine)" 
is risky without this patch and KASAN complains clearly 
about it:
http://gfx-ci.igk.intel.com/tree/drm-tip/kasan.html?testfilter=gem_exec_balancer 



Looks like a bug introduced in bcb9aa45d5a0 ("Revert 
"drm/i915: Hold reference to intel_context over life of 
i915_request""), which was a partial revert of 1e98d8c52ed5 
("drm/i915: Hold reference to intel_context over life of 
i915_request").


Ie. if 1e98d8c52ed5 recognised the problem with 
disappearing rq->engine, then I am confused how 
bcb9aa45d5a0 left the rq->engine dereference in there after 
removing the extra reference.


Could it be that the intel_engine_is_virtual check simply 
needs to be removed from i915_fence_release, restoring 
things to how they were before 1e98d8c52ed5? Could you try 
it out?



I have already tried something similar [1] and KASAN bugs 
disappeared, or more precisely gem_exec_balance tests 
passed. But I have been warned by Nirmoy guc virtual engines 
can be created for only one real engine (ie. 
is_power_of_2(rq->execution_mask) is true but rq->engine 
points to virtual engine).


[1]: https://patchwork.freedesktop.org/series/118879/


Ugh.. Try involving media umd folks to see if they need a 
single engine virtual engine? Or we could always just not 
create it in the driver, I mean just use the physical one.



In case there is single physical engine 
intel_engine_create_virtual falls back to intel_context_create 
(no virtual engine), but in case of parallel contexts there is 
special KMD flag FORCE_VIRTUAL which enforces virtual engine 
even for single physical engine. So it seems to be KMD concept.


Anyway is it worth investigating how to make 
"is_power_of_2(rq->execution_mask)" indication of dangling 
engine pointer? It will not help in 1st case:

mutex_release(>

Re: [Intel-gfx] [PATCH] drm/i915/huc: check HuC and GuC version compatibility on MTL

2023-07-17 Thread John Harrison

On 7/12/2023 10:03, Ceraolo Spurio, Daniele wrote:

On 7/12/2023 3:03 AM, Andrzej Hajda wrote:

On 11.07.2023 22:31, Daniele Ceraolo Spurio wrote:

Due to a change in the auth flow on MTL, GuC 70.7.0 and newer will only
be able to authenticate HuC 8.5.1 and newer. The plan is to update 
the 2

binaries sinchronously in linux-firmware so that the fw repo always has

synchronously

a matching pair that works; still, it's better to check in the 
kernel so

we can print an error message and abort HuC loading if the binaries are
out of sync instead of failing the authentication.

Signed-off-by: Daniele Ceraolo Spurio 
Cc: John Harrison 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 42 


  1 file changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c

index 08e16017584b..f0cc5bb47fa0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -803,11 +803,53 @@ static int try_firmware_load(struct 
intel_uc_fw *uc_fw, const struct firmware **

  return 0;
  }
  +static int check_mtl_huc_guc_compatibility(struct intel_gt *gt,
+   struct intel_uc_fw_file *huc_selected)
+{
+    struct intel_uc_fw_file *guc_selected = 
&gt->uc.guc.fw.file_selected;

+    struct intel_uc_fw_ver *huc_ver = &huc_selected->ver;
+    struct intel_uc_fw_ver *guc_ver = &guc_selected->ver;
+    bool new_huc;
+    bool new_guc;

Could put both of these bools on a single line.


+
+    /* we can only do this check after having fetched both GuC and 
HuC */

+    GEM_BUG_ON(!huc_selected->path || !guc_selected->path);
+
+    /*
+ * Due to changes in the authentication flow for MTL, HuC 8.5.1 
or newer
+ * requires GuC 70.7.0 or newer. Older HuC binaries will 
instead require

+ * GuC < 70.7.0.
+ */
+    new_huc = huc_ver->major > 8 ||
+  (huc_ver->major == 8 && huc_ver->minor > 5) ||
+  (huc_ver->major == 8 && huc_ver->minor == 5 && 
huc_ver->patch >= 1);

+
+    new_guc = guc_ver->major > 70 ||
+  (guc_ver->major == 70 && guc_ver->minor >= 7);


Wouldn't it be more readable to define something like UC_VER_FULL(v)
and then use UC_VER_FULL(huc_ver) >= IP_VER_FULL(8, 5, 1)?
I am not sure it is worth it for just two checks.


We've been trying to avoid those kind of macros because the version 
would need to be a u64 under the hood (each version number is a u16) 
and therefore type casting would be required to make all the shifting 
work, which makes the macro nasty to look at and as you said IMO not 
worth it for just 2 checks. Note that the GuC is the exception because 
it guarantees its version fits in a u32, so there is some macro use in 
the GuC-specific code.
Pretty sure I did originally try to go the u64 version route but it 
caused a lot more problems than it solved. I forget the details but in 
addition to all the extra casting mentioned above, I vaguely recall 
there being issues with 32bit compilers/architectures or some such. Hence we 
only have the 8bit-per-version-component/32bit-merged macros that are 
for use with the GuC version and only the GuC version.


Given that this is (hopefully) a one off hack to cope with a one off 
bug, I would stick with the unrolled code rather than adding extra 
complications.








+
+    if (new_huc != new_guc) {
+    UNEXPECTED(gt, "HuC %u.%u.%u is incompatible with GuC 
%u.%u.%u\n",

+   huc_ver->major, huc_ver->minor, huc_ver->patch,
+   guc_ver->major, guc_ver->minor, guc_ver->patch);
+    gt_info(gt, "MTL GuC 70.7.0+ and HuC 8.5.1+ don't work with 
older releases\n");

+    return -ENOEXEC;
+    }
+
+    return 0;
+}
+
  int intel_uc_check_file_version(struct intel_uc_fw *uc_fw, bool 
*old_ver)

  {
  struct intel_gt *gt = __uc_fw_to_gt(uc_fw);
  struct intel_uc_fw_file *wanted = &uc_fw->file_wanted;
  struct intel_uc_fw_file *selected = &uc_fw->file_selected;
+    int ret;
+
+    if (IS_METEORLAKE(gt->i915) && uc_fw->type == 
INTEL_UC_FW_TYPE_HUC) {


Moving this check inside check function would make it more generic, 
up to you.


This will hopefully never apply to any other platform. This is a light 
breach of the HuC compatibility contract, so I really don't want to 
have a generic function to handle it. I want it to be clear from a 
higher level that this is an exception for a specific platform. Maybe 
worth adding a comment? Would something like the following make things 
clearer?


/*
 * MTL has some compatibility issues with early GuC/HuC binaries
 * not working with newer ones. This is specific to MTL and we
 * don't expect it to extend to other platforms.
 */

I agree with Daniele about keeping this the exception not the norm. The 
comment works for me.


Typo in commit message and a declaration nit-pick but otherwi

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: update request engine before removing virtual GuC engine

2023-07-17 Thread John Harrison

On 7/13/2023 05:11, Tvrtko Ursulin wrote:

On 13/07/2023 12:09, Andrzej Hajda wrote:

Hi,

On 13.07.2023 09:39, Tvrtko Ursulin wrote:

On 12/07/2023 19:54, John Harrison wrote:

On 7/12/2023 09:27, Andrzej Hajda wrote:

On 12.07.2023 14:35, Tvrtko Ursulin wrote:

On 12/07/2023 13:18, Andrzej Hajda wrote:

On 11.07.2023 17:27, Tvrtko Ursulin wrote:

On 11/07/2023 14:58, Andrzej Hajda wrote:

On 11.07.2023 13:34, Andi Shyti wrote:

Hi Andrzej,

drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 
+++

  1 file changed, 11 insertions(+)

 diff --git 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

 index a0e3ef1c65d246..2c877ea5eda6f0 100644
 --- 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 +++ 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 @@ -3461,6 +3461,8 @@ static void 
guc_prio_fini(struct i915_request *rq, struct intel_context 
*ce)
  static void remove_from_context(struct 
i915_request *rq)

  {
 struct intel_context *ce = 
request_to_scheduling_context(rq);

 +   struct intel_engine_cs *engine;
 +   intel_engine_mask_t tmp;

GEM_BUG_ON(intel_context_is_child(ce));

 @@ -3478,6 +3480,15 @@ static void 
remove_from_context(struct i915_request *rq)


atomic_dec(&ce->guc_id.ref);
i915_request_notify_execute_cb_imm(rq);
 +
 +   /*
 +    * GuC virtual engine can disappear after 
this call, so let's assign
 +    * something valid, as driver expects this 
to be always valid pointer.

 +    */
 + for_each_engine_masked(engine, rq->engine->gt, 
rq->execution_mask, tmp) {

 +   rq->engine = engine;

 yes... here the context might lose the virtual 
engine... I wonder
 whether this is the right solution, though. Maybe we 
should set

 rq->engine = NULL; and check for NULL? Don't know.


Setting NULL causes occasional null page de-reference in

i915_request_wait_timeout:

mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_)

rq->engine after removing rq from context is (IMHO) used as 
a set of aliases

for gt and i915 (despite rq itself contains the alias to i915).

without investigating further, but maybe that code is not even
supposed to be executed, at this point, if the request's 
assigned

virtual engine is removed.


Real tests show it is executed and the function 
i915_request_wait_timeout is quite generic
I guess it is quite typical use-case, the only question is 
about timings - what happens earlier -

finalization of i915_request_wait_timeout or context removal.

The other point rq->engine is accessed after context removal 
is i915_fence_release -
there is long comment there regarding virtual context and 
reuse retired rq.
Anyway calling there "intel_engine_is_virtual(rq->engine)" is 
risky without this patch and KASAN complains clearly about it:
http://gfx-ci.igk.intel.com/tree/drm-tip/kasan.html?testfilter=gem_exec_balancer 



Looks like a bug introduced in bcb9aa45d5a0 ("Revert "drm/i915: 
Hold reference to intel_context over life of i915_request""), 
which was a partial revert of 1e98d8c52ed5 ("drm/i915: Hold 
reference to intel_context over life of i915_request").


Ie. if 1e98d8c52ed5 recognised the problem with disappearing 
rq->engine, then I am confused how bcb9aa45d5a0 left the 
rq->engine dereference in there after removing the extra 
reference.


Could it be that the intel_engine_is_virtual check simply needs 
to be removed from i915_fence_release, restoring things to how 
they were before 1e98d8c52ed5? Could you try it out?



I have already tried something similar [1] and KASAN bugs 
disappeared, or more precisely gem_exec_balance tests passed. 
But I have been warned by Nirmoy guc virtual engines can be 
created for only one real engine (ie. 
is_power_of_2(rq->execution_mask) is true but rq->engine points 
to virtual engine).


[1]: https://patchwork.freedesktop.org/series/118879/


Ugh.. Try involving media umd folks to see if they need a single 
engine virtual engine? Or we could always just not create it in 
the driver, I mean just use the physical one.



In case there is single physical engine 
intel_engine_create_virtual falls back to intel_context_create (no 
virtual engine), but in case of parallel contexts there is special 
KMD flag FORCE_VIRTUAL which enforces virtual engine even for 
single physical engine. So it seems to be KMD concept.


Anyway is it worth investigating how to make 
"is_power_of_2(rq->execution_mask)" indication of dangling engine 
pointer? It will not help in 1st case:

mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_)


There seems to be a fundamental problem here. Object 1 (rq) is 
holding a pointer to a reference counted and transient object 2 
(

Re: [Intel-gfx] [PATCH v2] drm/i915/gt: update request engine before removing virtual GuC engine

2023-07-12 Thread John Harrison

On 7/12/2023 09:27, Andrzej Hajda wrote:

On 12.07.2023 14:35, Tvrtko Ursulin wrote:

On 12/07/2023 13:18, Andrzej Hajda wrote:

On 11.07.2023 17:27, Tvrtko Ursulin wrote:

On 11/07/2023 14:58, Andrzej Hajda wrote:

On 11.07.2023 13:34, Andi Shyti wrote:

Hi Andrzej,


drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 11 +++
  1 file changed, 11 insertions(+)

 diff --git 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

 index a0e3ef1c65d246..2c877ea5eda6f0 100644
 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
 @@ -3461,6 +3461,8 @@ static void guc_prio_fini(struct 
i915_request *rq, struct intel_context *ce)

  static void remove_from_context(struct i915_request *rq)
  {
 struct intel_context *ce = 
request_to_scheduling_context(rq);

 +   struct intel_engine_cs *engine;
 +   intel_engine_mask_t tmp;

GEM_BUG_ON(intel_context_is_child(ce));

 @@ -3478,6 +3480,15 @@ static void 
remove_from_context(struct i915_request *rq)


 atomic_dec(&ce->guc_id.ref);
i915_request_notify_execute_cb_imm(rq);
 +
 +   /*
 +    * GuC virtual engine can disappear after this 
call, so let's assign
 +    * something valid, as driver expects this to 
be always valid pointer.

 +    */
 +   for_each_engine_masked(engine, rq->engine->gt, 
rq->execution_mask, tmp) {

 +   rq->engine = engine;

 yes... here the context might lose the virtual engine... I 
wonder
 whether this is the right solution, though. Maybe we should 
set

 rq->engine = NULL; and check for NULL? Don't know.


Setting NULL causes occasional null page de-reference in

i915_request_wait_timeout:

mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_)

rq->engine after removing rq from context is (IMHO) used as a 
set of aliases

for gt and i915 (despite rq itself contains the alias to i915).

without investigating further, but maybe that code is not even
supposed to be executed, at this point, if the request's assigned
virtual engine is removed.


Real tests show it is executed and the function 
i915_request_wait_timeout is quite generic
I guess it is quite typical use-case, the only question is about 
timings - what happens earlier -

finalization of i915_request_wait_timeout or context removal.

The other point rq->engine is accessed after context removal is 
i915_fence_release -
there is long comment there regarding virtual context and reuse 
retired rq.
Anyway calling there "intel_engine_is_virtual(rq->engine)" is 
risky without this patch and KASAN complains clearly about it:
http://gfx-ci.igk.intel.com/tree/drm-tip/kasan.html?testfilter=gem_exec_balancer 



Looks like a bug introduced in bcb9aa45d5a0 ("Revert "drm/i915: 
Hold reference to intel_context over life of i915_request""), which 
was a partial revert of 1e98d8c52ed5 ("drm/i915: Hold reference to 
intel_context over life of i915_request").


Ie. if 1e98d8c52ed5 recognised the problem with disappearing 
rq->engine, then I am confused how bcb9aa45d5a0 left the rq->engine 
dereference in there after removing the extra reference.


Could it be that the intel_engine_is_virtual check simply needs to 
be removed from i915_fence_release, restoring things to how they 
were before 1e98d8c52ed5? Could you try it out?



I have already tried something similar [1] and KASAN bugs 
disappeared, or more precisely gem_exec_balance tests passed. But I 
have been warned by Nirmoy guc virtual engines can be created for 
only one real engine (ie. is_power_of_2(rq->execution_mask) is true 
but rq->engine points to virtual engine).


[1]: https://patchwork.freedesktop.org/series/118879/


Ugh.. Try involving media umd folks to see if they need a single 
engine virtual engine? Or we could always just not create it in the 
driver, I mean just use the physical one.



In case there is single physical engine intel_engine_create_virtual 
falls back to intel_context_create (no virtual engine), but in case of 
parallel contexts there is special KMD flag FORCE_VIRTUAL which 
enforces virtual engine even for single physical engine. So it seems 
to be KMD concept.


Anyway is it worth investigating how to make 
"is_power_of_2(rq->execution_mask)" indication of dangling engine 
pointer? It will not help in 1st case:

mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_)


There seems to be a fundamental problem here. Object 1 (rq) is holding a 
pointer to a reference counted and transient object 2 (engine) but 
without taking a reference count for itself. That is a Bad Thing(tm). 
I'm not following the description in the revert patch as to why rq can't 
ref count the context/engine. Is there actually a recursive counting 
problem? Or is it just a lifetime issue caused 

Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

2023-06-07 Thread John Harrison

On 6/7/2023 12:03, Zhanjun Dong wrote:

This attempts to avoid a circular locking dependency between flushing the 
delayed work and intel_gt_reset.
Switch from cancel_delayed_work_sync to cancel_delayed_work, the non-sync 
version, on the reset path. This is safe because the worker has trylock code 
to handle the lock. Meanwhile, keep the sync version for park/fini to ensure 
the worker is not still running during suspend or shutdown.

WARNING: possible circular locking dependency detected
6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
--
kms_pipe_crc_ba/6415 is trying to acquire lock:
88813e6cc640 
((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
__flush_work+0x42/0x530

but task is already holding lock:
88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 
[i915]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&gt->reset.mutex){+.+.}-{3:3}:
 lock_acquire+0xd8/0x2d0
 i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
 intel_gt_init_reset+0x65/0x80 [i915]
 intel_gt_common_init_early+0xe1/0x170 [i915]
 intel_root_gt_init_early+0x48/0x60 [i915]
 i915_driver_probe+0x671/0xcb0 [i915]
 i915_pci_probe+0xdc/0x210 [i915]
 pci_device_probe+0x95/0x120
 really_probe+0x164/0x3c0
 __driver_probe_device+0x73/0x160
 driver_probe_device+0x19/0xa0
 __driver_attach+0xb6/0x180
 bus_for_each_dev+0x77/0xd0
 bus_add_driver+0x114/0x210
 driver_register+0x5b/0x110
 __pfx_vgem_open+0x3/0x10 [vgem]
 do_one_initcall+0x57/0x270
 do_init_module+0x5f/0x220
 load_module+0x1ca4/0x1f00
 __do_sys_finit_module+0xb4/0x130
 do_syscall_64+0x3c/0x90
 entry_SYSCALL_64_after_hwframe+0x72/0xdc

-> #2 (fs_reclaim){+.+.}-{0:0}:
 lock_acquire+0xd8/0x2d0
 fs_reclaim_acquire+0xac/0xe0
 kmem_cache_alloc+0x32/0x260
 i915_vma_instance+0xb2/0xc60 [i915]
 i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
 vm_fault_gtt+0x22d/0xf60 [i915]
 __do_fault+0x2f/0x1d0
 do_pte_missing+0x4a/0xd20
 __handle_mm_fault+0x5b0/0x790
 handle_mm_fault+0xa2/0x230
 do_user_addr_fault+0x3ea/0xa10
 exc_page_fault+0x68/0x1a0
 asm_exc_page_fault+0x26/0x30

-> #1 (&gt->reset.backoff_srcu){}-{0:0}:
 lock_acquire+0xd8/0x2d0
 _intel_gt_reset_lock+0x57/0x330 [i915]
 guc_timestamp_ping+0x35/0x130 [i915]
 process_one_work+0x250/0x510
 worker_thread+0x4f/0x3a0
 kthread+0xff/0x130
 ret_from_fork+0x29/0x50

-> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
 check_prev_add+0x90/0xc60
 __lock_acquire+0x1998/0x2590
 lock_acquire+0xd8/0x2d0
 __flush_work+0x74/0x530
 __cancel_work_timer+0x14f/0x1f0
 intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
 intel_uc_reset_prepare+0x9c/0x120 [i915]
 reset_prepare+0x21/0x60 [i915]
 intel_gt_reset+0x1dd/0x470 [i915]
 intel_gt_reset_global+0xfb/0x170 [i915]
 intel_gt_handle_error+0x368/0x420 [i915]
 intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
 i915_wedged_set+0x29/0x40 [i915]
 simple_attr_write_xsigned.constprop.0+0xb4/0x110
 full_proxy_write+0x52/0x80
 vfs_write+0xc5/0x4f0
 ksys_write+0x64/0xe0
 do_syscall_64+0x3c/0x90
 entry_SYSCALL_64_after_hwframe+0x72/0xdc

other info that might help us debug this:
  Chain exists of:
   (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> 
&gt->reset.mutex
   Possible unsafe locking scenario:
        CPU0                    CPU1
        ----                    ----
   lock(&gt->reset.mutex);
                                lock(fs_reclaim);
                                lock(&gt->reset.mutex);
   lock((work_completion)(&(&guc->timestamp.work)->work));

  *** DEADLOCK ***
  3 locks held by kms_pipe_crc_ba/6415:
   #0: 888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
   #1: 888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: 
simple_attr_write_xsigned.constprop.0+0x47/0x110
   #2: 88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: 
intel_gt_reset+0x19e/0x470 [i915]

Signed-off-by: Zhanjun Dong
---
  drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 +--
  1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..cca6960d3490 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1357,9 +1357,12 @@ static void guc_enable_busyness_worker(struct intel_guc 
*guc)
mod_delayed_work(system_highpri_wq, &guc->timestamp.work, 
guc->timestamp.ping_delay);
  }
  
-static void 

  1   2   3   4   5   6   7   8   9   10   >