Chris Wilson <[email protected]> writes:

> Since unbannable contexts are special and are not supposed to cause GPU
> hangs in the first place, make it clear when they are implicated in such
> a hang. In practice, most unbannable contexts are those created by igt
> for the express purpose of throwing untold thousands of hangs at the GPU,
> and which wish to keep doing so to finish the test. Normally they are
> cleaned up, but it is when they, or the other unbannable kernel contexts,
> stay stuck in an erroneous state that we need to worry, and so they need
> highlighting.
>
> Suggested-by: Mika Kuoppala <[email protected]>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Mika Kuoppala <[email protected]>

Well, this should make things obvious if this happens.

Reviewed-by: Mika Kuoppala <[email protected]>

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  1 +
>  drivers/gpu/drm/i915/i915_gpu_error.c | 21 +++++++++++++++------
>  2 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4e158aab36d6..d6b5ac2a563d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -555,6 +555,7 @@ struct i915_gpu_state {
>                       int ban_score;
>                       int active;
>                       int guilty;
> +                     bool bannable;
>               } context;
>  
>               struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a81351d9e3a6..67c902412193 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -396,6 +396,11 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
>                          ee->instdone.row[slice][subslice]);
>  }
>  
> +static const char *bannable(const struct drm_i915_error_context *ctx)
> +{
> +     return ctx->bannable ? "" : " (unbannable)";
> +}
> +
>  static void error_print_request(struct drm_i915_error_state_buf *m,
>                               const char *prefix,
>                               const struct drm_i915_error_request *erq)
> @@ -414,9 +419,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>                               const char *header,
>                               const struct drm_i915_error_context *ctx)
>  {
> -     err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d guilty %d active %d\n",
> +     err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
>                  header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -                ctx->priority, ctx->ban_score, ctx->guilty, ctx->active);
> +                ctx->priority, ctx->ban_score, bannable(ctx),
> +                ctx->guilty, ctx->active);
>  }
>  
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -644,11 +650,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
>               if (error->engine[i].hangcheck_stalled &&
>                   error->engine[i].context.pid) {
> -                     err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
> +                     err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
>                                  engine_name(m->i915, i),
>                                  error->engine[i].context.comm,
>                                  error->engine[i].context.pid,
> -                                error->engine[i].context.ban_score);
> +                                error->engine[i].context.ban_score,
> +                                bannable(&error->engine[i].context));
>               }
>       }
>       err_printf(m, "Reset count: %u\n", error->reset_count);
> @@ -736,12 +743,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>               if (obj) {
>                       err_puts(m, dev_priv->engine[i]->name);
>                       if (ee->context.pid)
> -                             err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
> +                             err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
>                                          ee->context.comm,
>                                          ee->context.pid,
>                                          ee->context.handle,
>                                          ee->context.hw_id,
> -                                        ee->context.ban_score);
> +                                        ee->context.ban_score,
> +                                        bannable(&ee->context));
>                       err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
>                                  upper_32_bits(obj->gtt_offset),
>                                  lower_32_bits(obj->gtt_offset));
> @@ -1383,6 +1391,7 @@ static void record_context(struct drm_i915_error_context *e,
>       e->hw_id = ctx->hw_id;
>       e->priority = ctx->priority;
>       e->ban_score = atomic_read(&ctx->ban_score);
> +     e->bannable = i915_gem_context_is_bannable(ctx);
>       e->guilty = atomic_read(&ctx->guilty_count);
>       e->active = atomic_read(&ctx->active_count);
>  }
> -- 
> 2.15.1
>
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to