On Tue, Nov 14, 2023 at 07:52:29AM -0800, Alan Previn wrote:
> If we are at the end of suspend or very early in resume
> its possible an async fence signal (via rcu_call) is triggered
> to free_engines which could lead us to the execution of
> the context destruction worker (after a prior worker flush).
> 
> Thus, when suspending, insert rcu_barriers at the start
> of i915_gem_suspend (part of driver's suspend prepare) and
> again in i915_gem_suspend_late so that all such cases have
> completed and context destruction list isn't missing anything.
> 
> In destroyed_worker_func, close the race against CT-loss
> by checking that CT is enabled before calling into
> deregister_destroyed_contexts.
> 
> Based on testing, guc_lrc_desc_unpin may still race and fail
> as we traverse the GuC's context-destroy list because the
> CT could be disabled right before calling GuC's CT send function.
> 
> We've witnessed this race condition once every ~6000-8000
> suspend-resume cycles while ensuring workloads that render
> something onscreen is continuously started just before
> we suspend (and the workload is small enough to complete
> and trigger the queued engine/context free-up either very
> late in suspend or very early in resume).
> 
> In such a case, we need to unroll the entire process because
> guc-lrc-unpin takes a gt wakeref which only gets released in
> the G2H IRQ reply that never comes through in this corner
> case. Without the unroll, the taken wakeref is leaked and will
> cascade into a kernel hang later at the tail end of suspend in
> this function:
> 
>    intel_wakeref_wait_for_idle(&gt->wakeref)
>    (called by) - intel_gt_pm_wait_for_idle
>    (called by) - wait_for_suspend
> 
> Thus, do an unroll in guc_lrc_desc_unpin and deregister_destroyed_-
> contexts if guc_lrc_desc_unpin fails due to CT send falure.
> When unrolling, keep the context in the GuC's destroy-list so
> it can get picked up on the next destroy worker invocation
> (if suspend aborted) or get fully purged as part of a GuC
> sanitization (end of suspend) or a reset flow.
> 
> Signed-off-by: Alan Previn <alan.previn.teres.ale...@intel.com>
> Signed-off-by: Anshuman Gupta <anshuman.gu...@intel.com>
> Tested-by: Mousumi Jana <mousumi.j...@intel.com>
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_pm.c        | 10 +++
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 81 ++++++++++++++++---
>  2 files changed, 80 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> index 0d812f4d787d..3b27218aabe2 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> @@ -28,6 +28,13 @@ void i915_gem_suspend(struct drm_i915_private *i915)
>       GEM_TRACE("%s\n", dev_name(i915->drm.dev));
>  
>       intel_wakeref_auto(&i915->runtime_pm.userfault_wakeref, 0);
> +     /*
> +      * On rare occasions, we've observed the fence completion triggers
> +      * free_engines asynchronously via rcu_call. Ensure those are done.
> +      * This path is only called on suspend, so it's an acceptable cost.
> +      */
> +     rcu_barrier();
> +
>       flush_workqueue(i915->wq);
>  
>       /*
> @@ -160,6 +167,9 @@ void i915_gem_suspend_late(struct drm_i915_private *i915)
>        * machine in an unusable condition.
>        */
>  
> +     /* Like i915_gem_suspend, flush tasks staged from fence triggers */
> +     rcu_barrier();
> +
>       for_each_gt(gt, i915, i)
>               intel_gt_suspend_late(gt);
>  
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 9d1915482898..225747115f78 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -236,6 +236,13 @@ set_context_destroyed(struct intel_context *ce)
>       ce->guc_state.sched_state |= SCHED_STATE_DESTROYED;
>  }
>  
> +static inline void
> +clr_context_destroyed(struct intel_context *ce)
> +{
> +     lockdep_assert_held(&ce->guc_state.lock);
> +     ce->guc_state.sched_state &= ~SCHED_STATE_DESTROYED;
> +}
> +
>  static inline bool context_pending_disable(struct intel_context *ce)
>  {
>       return ce->guc_state.sched_state & SCHED_STATE_PENDING_DISABLE;
> @@ -613,6 +620,8 @@ static int guc_submission_send_busy_loop(struct intel_guc 
> *guc,
>                                        u32 g2h_len_dw,
>                                        bool loop)
>  {
> +     int ret;
> +
>       /*
>        * We always loop when a send requires a reply (i.e. g2h_len_dw > 0),
>        * so we don't handle the case where we don't get a reply because we
> @@ -623,7 +632,11 @@ static int guc_submission_send_busy_loop(struct 
> intel_guc *guc,
>       if (g2h_len_dw)
>               atomic_inc(&guc->outstanding_submission_g2h);
>  
> -     return intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop);
> +     ret = intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop);
> +     if (ret)
> +             atomic_dec(&guc->outstanding_submission_g2h);
> +
> +     return ret;
>  }
>  
>  int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
> @@ -3286,12 +3299,13 @@ static void guc_context_close(struct intel_context 
> *ce)
>       spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>  }
>  
> -static inline void guc_lrc_desc_unpin(struct intel_context *ce)
> +static inline int guc_lrc_desc_unpin(struct intel_context *ce)
>  {
>       struct intel_guc *guc = ce_to_guc(ce);
>       struct intel_gt *gt = guc_to_gt(guc);
>       unsigned long flags;
>       bool disabled;
> +     int ret;
>  
>       GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
>       GEM_BUG_ON(!ctx_id_mapped(guc, ce->guc_id.id));
> @@ -3301,19 +3315,38 @@ static inline void guc_lrc_desc_unpin(struct 
> intel_context *ce)
>       /* Seal race with Reset */
>       spin_lock_irqsave(&ce->guc_state.lock, flags);
>       disabled = submission_disabled(guc);
> -     if (likely(!disabled)) {
> -             __intel_gt_pm_get(gt);
> -             set_context_destroyed(ce);
> -             clr_context_registered(ce);
> -     }
> -     spin_unlock_irqrestore(&ce->guc_state.lock, flags);

you are now holding this spin lock for too long...

>       if (unlikely(disabled)) {
> +             spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>               release_guc_id(guc, ce);
>               __guc_context_destroy(ce);
> -             return;
> +             return 0;
>       }
>  
> -     deregister_context(ce, ce->guc_id.id);
> +     /* GuC is active, lets destroy this context,

for multi-line comments you need to start with '/*' only
and start the real comment below, like:

/*
 * GuC is active, ...

> +      * but at this point we can still be racing with
> +      * suspend, so we undo everything if the H2G fails
> +      */
> +
> +     /* Change context state to destroyed and get gt-pm */
> +     __intel_gt_pm_get(gt);
> +     set_context_destroyed(ce);
> +     clr_context_registered(ce);
> +
> +     ret = deregister_context(ce, ce->guc_id.id);
> +     if (ret) {
> +             /* Undo the state change and put gt-pm if that failed */
> +             set_context_registered(ce);
> +             clr_context_destroyed(ce);
> +             /*
> +              * Dont use might_sleep / ASYNC verion of put because
> +              * CT loss in deregister_context could mean an ongoing
> +              * reset or suspend flow. Immediately put before the unlock
> +              */
> +             __intel_wakeref_put(&gt->wakeref, 0);

interesting enough you use the '__' version to bypass the might_sleep(),
but also sending 0 as argument what might lead you in the mutex_lock inside
the spin lock area, what is not allowed.

> +     }
> +     spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> +
> +     return ret;
>  }
>  
>  static void __guc_context_destroy(struct intel_context *ce)
> @@ -3381,7 +3414,22 @@ static void deregister_destroyed_contexts(struct 
> intel_guc *guc)
>               if (!ce)
>                       break;
>  
> -             guc_lrc_desc_unpin(ce);
> +             if (guc_lrc_desc_unpin(ce)) {
> +                     /*
> +                      * This means GuC's CT link severed mid-way which could 
> happen

here you got the comment style right.

> +                      * in suspend-resume corner cases. In this case, put the
> +                      * context back into the destroyed_contexts list which 
> will
> +                      * get picked up on the next context deregistration 
> event or
> +                      * purged in a GuC sanitization event 
> (reset/unload/wedged/...).
> +                      */
> +                     spin_lock_irqsave(&guc->submission_state.lock, flags);
> +                     list_add_tail(&ce->destroyed_link,
> +                                   
> &guc->submission_state.destroyed_contexts);
> +                     spin_unlock_irqrestore(&guc->submission_state.lock, 
> flags);
> +                     /* Bail now since the list might never be emptied if 
> h2gs fail */

For this GuC interaction part I'd like to get an ack from John Harrison please.

> +                     break;
> +             }
> +
>       }
>  }
>  
> @@ -3392,6 +3440,17 @@ static void destroyed_worker_func(struct work_struct 
> *w)
>       struct intel_gt *gt = guc_to_gt(guc);
>       int tmp;
>  
> +     /*
> +      * In rare cases we can get here via async context-free fence-signals 
> that
> +      * come very late in suspend flow or very early in resume flows. In 
> these
> +      * cases, GuC won't be ready but just skipping it here is fine as these
> +      * pending-destroy-contexts get destroyed totally at GuC reset time at 
> the
> +      * end of suspend.. OR.. this worker can be picked up later on the next
> +      * context destruction trigger after resume-completes
> +      */
> +     if (!intel_guc_is_ready(guc))
> +             return;

is this racy?

> +
>       with_intel_gt_pm(gt, tmp)
>               deregister_destroyed_contexts(guc);
>  }
> -- 
> 2.39.0
> 

Reply via email to