Chris Wilson <[email protected]> writes:

> Currently, we accumulate each time a context hangs the GPU, offset
> against the number of requests it submits, and if that score exceeds a
> certain threshold, we ban that context from submitting any more requests
> (cancelling any work in flight). In contrast, we use a simple timer on
> the file, that if we see more than a 9 hangs faster than 60s apart in
> total across all of its contexts, we will ban the client from creating
> any more contexts. This leads to a confusing situation where the file
> may be banned before the context, so lets use a simple timer scheme for
> each.
>
> If the context submits 3 hanging requests within a 120s period, declare
> it forbidden to ever send more requests.
>
> This has the advantage of not being easy to repair by simply sending
> empty requests, but has the disadvantage that if the context is idle
> then it is forgiven. However, if the context is idle, it is not
> disrupting the system, but a hog can evade the request counting and
> cause much more severe disruption to the system.
>
> Updating ban_score from request retirement is dubious as the retirement
> is purposely not in sync with request submission (i.e. we try and batch
> retirement to reduce overhead and avoid latency on submission), which
> leads to surprising situations where we can forgive a hang immediately
> due to a backlog of requests from before the hang being retired
> afterwards.
>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |  4 ++++
>  drivers/gpu/drm/i915/i915_gem_context.h |  9 +++----
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 31 +++++++------------------
>  drivers/gpu/drm/i915/i915_gpu_error.h   |  3 ---
>  drivers/gpu/drm/i915/i915_request.c     |  2 --
>  drivers/gpu/drm/i915/i915_reset.c       | 27 ++++++++++++---------
>  6 files changed, 33 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index da21c843fed8..7541c6f961b3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>       struct i915_gem_context *ctx;
>       unsigned int n;
>       int ret;
> +     int i;
>  
>       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
>       if (ctx == NULL)
> @@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>       ctx->desc_template =
>               default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt);
>  
> +     for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
> +             ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
> +
>       return ctx;
>  
>  err_pid:
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
> b/drivers/gpu/drm/i915/i915_gem_context.h
> index 071108d34ae0..dc6c58f38cfa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -209,10 +209,11 @@ struct i915_gem_context {
>        */
>       atomic_t active_count;
>  
> -#define CONTEXT_SCORE_GUILTY         10
> -#define CONTEXT_SCORE_BAN_THRESHOLD  40
> -     /** ban_score: Accumulated score of all hangs caused by this context. */
> -     atomic_t ban_score;
> +     /**
> +      * @hang_timestamp: The last time(s) this context caused a GPU hang
> +      */
> +     unsigned long hang_timestamp[2];
> +#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! 
> */
>  
>       /** remap_slice: Bitmask of cache lines that need remapping */
>       u8 remap_slice;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9a65341fec09..3f6eddb6f6de 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -434,11 +434,6 @@ static void error_print_instdone(struct 
> drm_i915_error_state_buf *m,
>                          ee->instdone.row[slice][subslice]);
>  }
>  
> -static const char *bannable(const struct drm_i915_error_context *ctx)
> -{
> -     return ctx->bannable ? "" : " (unbannable)";
> -}
> -
>  static void error_print_request(struct drm_i915_error_state_buf *m,
>                               const char *prefix,
>                               const struct drm_i915_error_request *erq,
> @@ -447,9 +442,8 @@ static void error_print_request(struct 
> drm_i915_error_state_buf *m,
>       if (!erq->seqno)
>               return;
>  
> -     err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, 
> emitted %dms, start %08x, head %08x, tail %08x\n",
> -                prefix, erq->pid, erq->ban_score,
> -                erq->context, erq->seqno,
> +     err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, 
> start %08x, head %08x, tail %08x\n",
> +                prefix, erq->pid, erq->context, erq->seqno,
>                  test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
>                           &erq->flags) ? "!" : "",
>                  test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> @@ -463,10 +457,9 @@ static void error_print_context(struct 
> drm_i915_error_state_buf *m,
>                               const char *header,
>                               const struct drm_i915_error_context *ctx)
>  {
> -     err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score 
> %d%s guilty %d active %d\n",
> +     err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d 
> active %d\n",
>                  header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -                ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
> -                ctx->guilty, ctx->active);
> +                ctx->sched_attr.priority, ctx->guilty, ctx->active);
>  }
>  
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct 
> drm_i915_error_state_buf *m,
>               if (!error->engine[i].context.pid)
>                       continue;
>  
> -             err_printf(m, "Active process (on ring %s): %s [%d], score 
> %d%s\n",
> +             err_printf(m, "Active process (on ring %s): %s [%d]\n",
>                          engine_name(m->i915, i),
>                          error->engine[i].context.comm,
> -                        error->engine[i].context.pid,
> -                        error->engine[i].context.ban_score,
> -                        bannable(&error->engine[i].context));
> +                        error->engine[i].context.pid);
>       }
>       err_printf(m, "Reset count: %u\n", error->reset_count);
>       err_printf(m, "Suspend count: %u\n", error->suspend_count);
> @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct 
> drm_i915_error_state_buf *m,
>               if (obj) {
>                       err_puts(m, m->i915->engine[i]->name);
>                       if (ee->context.pid)
> -                             err_printf(m, " (submitted by %s [%d], ctx %d 
> [%d], score %d%s)",
> +                             err_printf(m, " (submitted by %s [%d], ctx %d 
> [%d])",
>                                          ee->context.comm,
>                                          ee->context.pid,
>                                          ee->context.handle,
> -                                        ee->context.hw_id,
> -                                        ee->context.ban_score,
> -                                        bannable(&ee->context));
> +                                        ee->context.hw_id);
>                       err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>                                  upper_32_bits(obj->gtt_offset),
>                                  lower_32_bits(obj->gtt_offset));
> @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request,
>       erq->flags = request->fence.flags;
>       erq->context = ctx->hw_id;
>       erq->sched_attr = request->sched.attr;
> -     erq->ban_score = atomic_read(&ctx->ban_score);
> -     erq->seqno = request->global_seqno;
>       erq->jiffies = request->emitted_jiffies;
>       erq->start = i915_ggtt_offset(request->ring->vma);
>       erq->head = request->head;
> @@ -1396,8 +1383,6 @@ static void record_context(struct 
> drm_i915_error_context *e,
>       e->handle = ctx->user_handle;
>       e->hw_id = ctx->hw_id;
>       e->sched_attr = ctx->sched;
> -     e->ban_score = atomic_read(&ctx->ban_score);
> -     e->bannable = i915_gem_context_is_bannable(ctx);
>       e->guilty = atomic_read(&ctx->guilty_count);
>       e->active = atomic_read(&ctx->active_count);
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
> b/drivers/gpu/drm/i915/i915_gpu_error.h
> index afa3adb28f02..94eaf8ab9051 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -122,10 +122,8 @@ struct i915_gpu_state {
>                       pid_t pid;
>                       u32 handle;
>                       u32 hw_id;
> -                     int ban_score;
>                       int active;
>                       int guilty;
> -                     bool bannable;
>                       struct i915_sched_attr sched_attr;
>               } context;
>  
> @@ -149,7 +147,6 @@ struct i915_gpu_state {
>                       long jiffies;
>                       pid_t pid;
>                       u32 context;
> -                     int ban_score;
>                       u32 seqno;
>                       u32 start;
>                       u32 head;
> diff --git a/drivers/gpu/drm/i915/i915_request.c 
> b/drivers/gpu/drm/i915/i915_request.c
> index 5ab4e1c01618..a61e3a4fc9dc 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request 
> *request)
>  
>       i915_request_remove_from_client(request);
>  
> -     /* Retirement decays the ban score as it is a sign of ctx progress */
> -     atomic_dec_if_positive(&request->gem_context->ban_score);
>       intel_context_unpin(request->hw_context);
>  
>       __retire_engine_upto(request->engine, request);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c 
> b/drivers/gpu/drm/i915/i915_reset.c
> index 1911e00d2581..bae88a4ea924 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -59,24 +59,29 @@ static void client_mark_guilty(struct 
> drm_i915_file_private *file_priv,
>  
>  static bool context_mark_guilty(struct i915_gem_context *ctx)
>  {
> -     unsigned int score;
> -     bool banned, bannable;
> +     unsigned long prev_hang;
> +     bool banned;
> +     int i;
>  
>       atomic_inc(&ctx->guilty_count);
>  
> -     bannable = i915_gem_context_is_bannable(ctx);
> -     score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -     banned = (!i915_gem_context_is_recoverable(ctx) ||
> -               score >= CONTEXT_SCORE_BAN_THRESHOLD);
> -
>       /* Cool contexts don't accumulate client ban score */

This comment becomes misleading and can be removed.

> -     if (!bannable)
> +     if (!i915_gem_context_is_bannable(ctx))
>               return false;
>  
> +     /* Record the timestamp for the last N hangs */
> +     prev_hang = ctx->hang_timestamp[0];
> +     for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
> +             ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
> +     ctx->hang_timestamp[i] = jiffies;
> +
> +     /* If we have hung N+1 times in rapid succession, we ban the context! */
> +     banned = !i915_gem_context_is_recoverable(ctx);
> +     if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
> +             banned = true;

Ok, the initialization to jiffies - CONTEXT_FAST_HANG_JIFFIES guarantees
that it cant be banned even if it manages to immediately hang. Which
in itself is difficult feat to accomplish.

>       if (banned) {
> -             DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> -                              ctx->name, atomic_read(&ctx->guilty_count),
> -                              score);
> +             DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
> +                              ctx->name,
> atomic_read(&ctx->guilty_count));

Now when we know when it previously hanged, we could improve the
logging a bit by saying how long ago. But not sure
about if it worth the trouble.

With the misleading comment removed/corrected,
Reviewed-by: Mika Kuoppala <[email protected]>


>               i915_gem_context_set_banned(ctx);
>       }
>  
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to