Re: [Intel-gfx] [PATCH 1/3] drm/i915/guc: Fix missing return code checks in submission init

2022-12-22 Thread Ceraolo Spurio, Daniele




On 12/21/2022 8:30 PM, john.c.harri...@intel.com wrote:

From: John Harrison 

The CI results for the 'fast request' patch set (enables error return
codes for fire-and-forget H2G messages) hit an issue with the KMD
sending context submission requests on an invalid context. That was
caused by a fault injection probe failing the context creation of a
kernel context. However, there was no return code checking on any of
the kernel context registration paths. So the driver kept going and
tried to use the kernel context for the record defaults process.

Fix that by checking for errors and aborting as appropriate when
creating kernel contexts. While at it, clean up some other submission
init related failure cleanup paths. Also, rename guc_init_lrc_mapping
to guc_init_submission as the former name hasn't been valid in a long
time.


IMO we should specify that this isn't a bug in the current code. The 
only way this can fail if we don't inject the failure is if the GuC is 
dead and/or is not seeing our messages, in which case we'll eventually 
correctly wedge. We only need this to get ready to switch to fast 
request messages.




Signed-off-by: John Harrison 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 74 ++-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  2 +-
  drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 +-
  3 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 53f7f599cde3a..4682ec1dbd9c0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1431,21 +1431,25 @@ static int guc_action_enable_usage_stats(struct 
intel_guc *guc)
return intel_guc_send(guc, action, ARRAY_SIZE(action));
  }
  
-static void guc_init_engine_stats(struct intel_guc *guc)

+static int guc_init_engine_stats(struct intel_guc *guc)
  {
struct intel_gt *gt = guc_to_gt(guc);
intel_wakeref_t wakeref;
+   int ret;
  
  	mod_delayed_work(system_highpri_wq, >timestamp.work,

 guc->timestamp.ping_delay);
  
  	with_intel_runtime_pm(>i915->runtime_pm, wakeref) {

-   int ret = guc_action_enable_usage_stats(guc);
+   ret = guc_action_enable_usage_stats(guc);
  
-		if (ret)

-   drm_err(>i915->drm,
-   "Failed to enable usage stats: %d!\n", ret);
+   if (ret) {
+   cancel_delayed_work_sync(>timestamp.work);
+   drm_err(>i915->drm, "Failed to enable usage stats: 
%d!\n", ret);
+   }
}
+
+   return ret;
  }
  
  void intel_guc_busyness_park(struct intel_gt *gt)

@@ -4101,9 +4105,11 @@ static void guc_set_default_submission(struct 
intel_engine_cs *engine)
engine->submit_request = guc_submit_request;
  }
  
-static inline void guc_kernel_context_pin(struct intel_guc *guc,

- struct intel_context *ce)
+static inline int guc_kernel_context_pin(struct intel_guc *guc,
+struct intel_context *ce)
  {
+   int ret;
+
/*
 * Note: we purposefully do not check the returns below because
 * the registration can only fail if a reset is just starting.
@@ -4111,16 +4117,24 @@ static inline void guc_kernel_context_pin(struct 
intel_guc *guc,
 * isn't happening and even it did this code would be run again.
 */
  
-	if (context_guc_id_invalid(ce))

-   pin_guc_id(guc, ce);
+   if (context_guc_id_invalid(ce)) {
+   int ret = pin_guc_id(guc, ce);
+
+   if (ret < 0)
+   return ret;
+   }
  
  	if (!test_bit(CONTEXT_GUC_INIT, >flags))

guc_context_init(ce);
  
-	try_context_registration(ce, true);

+   ret = try_context_registration(ce, true);
+   if (ret)
+   unpin_guc_id(guc, ce);
+
+   return ret;
  }
  
-static inline void guc_init_lrc_mapping(struct intel_guc *guc)

+static inline int guc_init_submission(struct intel_guc *guc)
  {
struct intel_gt *gt = guc_to_gt(guc);
struct intel_engine_cs *engine;
@@ -4147,9 +4161,17 @@ static inline void guc_init_lrc_mapping(struct intel_guc 
*guc)
struct intel_context *ce;
  
  		list_for_each_entry(ce, >pinned_contexts_list,

-   pinned_contexts_link)
-   guc_kernel_context_pin(guc, ce);
+   pinned_contexts_link) {
+   int ret = guc_kernel_context_pin(guc, ce);
+
+   if (ret) {
+   /* No point in trying to clean up as i915 will 
wedge on failure */
+   return ret;
+   }


nit: no need for {} here


+   }
}
+
+   return 0;
  }
  
 

[PATCH 1/3] drm/i915/guc: Fix missing return code checks in submission init

2022-12-21 Thread John . C . Harrison
From: John Harrison 

The CI results for the 'fast request' patch set (enables error return
codes for fire-and-forget H2G messages) hit an issue with the KMD
sending context submission requests on an invalid context. That was
caused by a fault injection probe failing the context creation of a
kernel context. However, there was no return code checking on any of
the kernel context registration paths. So the driver kept going and
tried to use the kernel context for the record defaults process.

Fix that by checking for errors and aborting as appropriate when
creating kernel contexts. While at it, clean up some other submission
init related failure cleanup paths. Also, rename guc_init_lrc_mapping
to guc_init_submission as the former name hasn't been valid in a long
time.

Signed-off-by: John Harrison 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 74 ++-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  2 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 +-
 3 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 53f7f599cde3a..4682ec1dbd9c0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1431,21 +1431,25 @@ static int guc_action_enable_usage_stats(struct 
intel_guc *guc)
return intel_guc_send(guc, action, ARRAY_SIZE(action));
 }
 
-static void guc_init_engine_stats(struct intel_guc *guc)
+static int guc_init_engine_stats(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
intel_wakeref_t wakeref;
+   int ret;
 
mod_delayed_work(system_highpri_wq, >timestamp.work,
 guc->timestamp.ping_delay);
 
with_intel_runtime_pm(>i915->runtime_pm, wakeref) {
-   int ret = guc_action_enable_usage_stats(guc);
+   ret = guc_action_enable_usage_stats(guc);
 
-   if (ret)
-   drm_err(>i915->drm,
-   "Failed to enable usage stats: %d!\n", ret);
+   if (ret) {
+   cancel_delayed_work_sync(>timestamp.work);
+   drm_err(>i915->drm, "Failed to enable usage stats: 
%d!\n", ret);
+   }
}
+
+   return ret;
 }
 
 void intel_guc_busyness_park(struct intel_gt *gt)
@@ -4101,9 +4105,11 @@ static void guc_set_default_submission(struct 
intel_engine_cs *engine)
engine->submit_request = guc_submit_request;
 }
 
-static inline void guc_kernel_context_pin(struct intel_guc *guc,
- struct intel_context *ce)
+static inline int guc_kernel_context_pin(struct intel_guc *guc,
+struct intel_context *ce)
 {
+   int ret;
+
/*
 * Note: we purposefully do not check the returns below because
 * the registration can only fail if a reset is just starting.
@@ -4111,16 +4117,24 @@ static inline void guc_kernel_context_pin(struct 
intel_guc *guc,
 * isn't happening and even it did this code would be run again.
 */
 
-   if (context_guc_id_invalid(ce))
-   pin_guc_id(guc, ce);
+   if (context_guc_id_invalid(ce)) {
+   int ret = pin_guc_id(guc, ce);
+
+   if (ret < 0)
+   return ret;
+   }
 
if (!test_bit(CONTEXT_GUC_INIT, >flags))
guc_context_init(ce);
 
-   try_context_registration(ce, true);
+   ret = try_context_registration(ce, true);
+   if (ret)
+   unpin_guc_id(guc, ce);
+
+   return ret;
 }
 
-static inline void guc_init_lrc_mapping(struct intel_guc *guc)
+static inline int guc_init_submission(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
struct intel_engine_cs *engine;
@@ -4147,9 +4161,17 @@ static inline void guc_init_lrc_mapping(struct intel_guc 
*guc)
struct intel_context *ce;
 
list_for_each_entry(ce, >pinned_contexts_list,
-   pinned_contexts_link)
-   guc_kernel_context_pin(guc, ce);
+   pinned_contexts_link) {
+   int ret = guc_kernel_context_pin(guc, ce);
+
+   if (ret) {
+   /* No point in trying to clean up as i915 will 
wedge on failure */
+   return ret;
+   }
+   }
}
+
+   return 0;
 }
 
 static void guc_release(struct intel_engine_cs *engine)
@@ -4391,9 +4413,10 @@ static int guc_init_global_schedule_policy(struct 
intel_guc *guc)
return ret;
 }
 
-void intel_guc_submission_enable(struct intel_guc *guc)
+int intel_guc_submission_enable(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
+   int ret;
 
/* Enable and 

[PATCH 1/3] drm/i915/guc: Fix missing return code checks in submission init

2022-12-19 Thread John . C . Harrison
From: John Harrison 

The CI results for the 'fast request' patch set (enables error return
codes for fire-and-forget H2G messages) hit an issue with the KMD
sending context submission requests on an invalid context. That was
caused by a fault injection probe failing the context creation of a
kernel context. However, there was no return code checking on any of
the kernel context registration paths. So the driver kept going and
tried to use the kernel context for the record defaults process.

Fix that by checking for errors and aborting as appropriate when
creating kernel contexts. While at it, clean up some other submission
init related failure cleanup paths. Also, rename guc_init_lrc_mapping
to guc_init_submission as the former name hasn't been valid in a long
time.

Signed-off-by: John Harrison 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 74 ++-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  2 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc.c |  7 +-
 3 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 53f7f599cde3a..4682ec1dbd9c0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1431,21 +1431,25 @@ static int guc_action_enable_usage_stats(struct 
intel_guc *guc)
return intel_guc_send(guc, action, ARRAY_SIZE(action));
 }
 
-static void guc_init_engine_stats(struct intel_guc *guc)
+static int guc_init_engine_stats(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
intel_wakeref_t wakeref;
+   int ret;
 
mod_delayed_work(system_highpri_wq, >timestamp.work,
 guc->timestamp.ping_delay);
 
with_intel_runtime_pm(>i915->runtime_pm, wakeref) {
-   int ret = guc_action_enable_usage_stats(guc);
+   ret = guc_action_enable_usage_stats(guc);
 
-   if (ret)
-   drm_err(>i915->drm,
-   "Failed to enable usage stats: %d!\n", ret);
+   if (ret) {
+   cancel_delayed_work_sync(>timestamp.work);
+   drm_err(>i915->drm, "Failed to enable usage stats: 
%d!\n", ret);
+   }
}
+
+   return ret;
 }
 
 void intel_guc_busyness_park(struct intel_gt *gt)
@@ -4101,9 +4105,11 @@ static void guc_set_default_submission(struct 
intel_engine_cs *engine)
engine->submit_request = guc_submit_request;
 }
 
-static inline void guc_kernel_context_pin(struct intel_guc *guc,
- struct intel_context *ce)
+static inline int guc_kernel_context_pin(struct intel_guc *guc,
+struct intel_context *ce)
 {
+   int ret;
+
/*
 * Note: we purposefully do not check the returns below because
 * the registration can only fail if a reset is just starting.
@@ -4111,16 +4117,24 @@ static inline void guc_kernel_context_pin(struct 
intel_guc *guc,
 * isn't happening and even it did this code would be run again.
 */
 
-   if (context_guc_id_invalid(ce))
-   pin_guc_id(guc, ce);
+   if (context_guc_id_invalid(ce)) {
+   int ret = pin_guc_id(guc, ce);
+
+   if (ret < 0)
+   return ret;
+   }
 
if (!test_bit(CONTEXT_GUC_INIT, >flags))
guc_context_init(ce);
 
-   try_context_registration(ce, true);
+   ret = try_context_registration(ce, true);
+   if (ret)
+   unpin_guc_id(guc, ce);
+
+   return ret;
 }
 
-static inline void guc_init_lrc_mapping(struct intel_guc *guc)
+static inline int guc_init_submission(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
struct intel_engine_cs *engine;
@@ -4147,9 +4161,17 @@ static inline void guc_init_lrc_mapping(struct intel_guc 
*guc)
struct intel_context *ce;
 
list_for_each_entry(ce, >pinned_contexts_list,
-   pinned_contexts_link)
-   guc_kernel_context_pin(guc, ce);
+   pinned_contexts_link) {
+   int ret = guc_kernel_context_pin(guc, ce);
+
+   if (ret) {
+   /* No point in trying to clean up as i915 will 
wedge on failure */
+   return ret;
+   }
+   }
}
+
+   return 0;
 }
 
 static void guc_release(struct intel_engine_cs *engine)
@@ -4391,9 +4413,10 @@ static int guc_init_global_schedule_policy(struct 
intel_guc *guc)
return ret;
 }
 
-void intel_guc_submission_enable(struct intel_guc *guc)
+int intel_guc_submission_enable(struct intel_guc *guc)
 {
struct intel_gt *gt = guc_to_gt(guc);
+   int ret;
 
/* Enable and