Re: [Intel-gfx] [PATCH] drm/i915: Record the current requests queue for execlists upon hang

2016-10-13 Thread Chris Wilson
On Thu, Oct 13, 2016 at 12:51:26PM +0300, Mika Kuoppala wrote:
> Chris Wilson writes:
> > +static void record_request(struct drm_i915_gem_request *request,
> > +  struct drm_i915_error_request *erq)
> > +{
> > +   erq->context = request->ctx->hw_id;
> > +   erq->seqno = request->fence.seqno;
> > +   erq->jiffies = request->emitted_jiffies;
> > +   erq->head = request->head;
> > +   erq->tail = request->tail;
> > +
> > +   rcu_read_lock();
> > +   erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
> 
> This lock is only for the pid_nr and nothing to do with ctx dereference?
> Not that it was added by this patch...

It's for the struct task lookup inside pid_nr.

But...

> > +   for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
> > +   if (engine->execlist_port[n].request)
> > +   record_request(engine->execlist_port[n].request,
> > +  &ee->execlist[n]);
> 
> Ok even if we get interrupt at around here and reset the ports,
> the pointer should stay in request_list and at that part we should be
> safe.

Note that we don't even get interrupts anymore as we completely stop the
machine whilst capturing. So even rcu_read_lock() above is overkill,
mere documentation.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Record the current requests queue for execlists upon hang

2016-10-13 Thread Mika Kuoppala
Chris Wilson writes:

> Mika wanted to know what requests were pending at the time of a hang as
> we now track which requests we have submitted to the hardware.
>
> Signed-off-by: Chris Wilson 
> Cc: Mika Kuoppala 
> ---
>  drivers/gpu/drm/i915/i915_drv.h   |  3 +-
>  drivers/gpu/drm/i915/i915_gpu_error.c | 64 
> ---
>  2 files changed, 47 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bf397b643cc0..6360e807c6ba 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -832,10 +832,11 @@ struct drm_i915_error_state {
>   struct drm_i915_error_request {
>   long jiffies;
>   pid_t pid;
> + u32 context;
>   u32 seqno;
>   u32 head;
>   u32 tail;
> - } *requests;
> + } *requests, execlist[2];
>  
>   struct drm_i915_error_waiter {
>   char comm[TASK_COMM_LEN];
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 78cc13b9b2a5..026b78c66219 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -363,6 +363,20 @@ static void error_print_instdone(struct 
> drm_i915_error_state_buf *m,
>  ee->instdone.row[slice][subslice]);
>  }
>  
> +static void error_print_request(struct drm_i915_error_state_buf *m,
> + const char *prefix,
> + struct drm_i915_error_request *erq)
> +{
> + if (!erq->seqno)
> + return;
> +
> + err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, 
> tail %08x\n",
> +prefix, erq->pid,
> +erq->context, erq->seqno,
> +jiffies_to_msecs(jiffies - erq->jiffies),
> +erq->head, erq->tail);
> +}
> +
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
>  struct drm_i915_error_engine *ee)
>  {
> @@ -434,6 +448,8 @@ static void error_print_engine(struct 
> drm_i915_error_state_buf *m,
>   err_printf(m, "  hangcheck: %s [%d]\n",
>  hangcheck_action_to_str(ee->hangcheck_action),
>  ee->hangcheck_score);
> + error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
> + error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
>  }
>  
>  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, 
> ...)
> @@ -649,14 +665,8 @@ int i915_error_state_to_str(struct 
> drm_i915_error_state_buf *m,
>   err_printf(m, "%s --- %d requests\n",
>  dev_priv->engine[i].name,
>  ee->num_requests);
> - for (j = 0; j < ee->num_requests; j++) {
> - err_printf(m, "  pid %d, seqno 0x%08x, emitted 
> %ld, head 0x%08x, tail 0x%08x\n",
> -ee->requests[j].pid,
> -ee->requests[j].seqno,
> -ee->requests[j].jiffies,
> -ee->requests[j].head,
> -ee->requests[j].tail);
> - }
> + for (j = 0; j < ee->num_requests; j++)
> + error_print_request(m, " ", &ee->requests[j]);
>   }
>  
>   if (IS_ERR(ee->waiters)) {
> @@ -1155,6 +1165,20 @@ static void error_record_engine_registers(struct 
> drm_i915_error_state *error,
>   }
>  }
>  
> +static void record_request(struct drm_i915_gem_request *request,
> +struct drm_i915_error_request *erq)
> +{
> + erq->context = request->ctx->hw_id;
> + erq->seqno = request->fence.seqno;
> + erq->jiffies = request->emitted_jiffies;
> + erq->head = request->head;
> + erq->tail = request->tail;
> +
> + rcu_read_lock();
> + erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;

This lock is only for the pid_nr and nothing to do with ctx dereference?
Not that it was added by this patch...

> + rcu_read_unlock();
> +}
> +
>  static void engine_record_requests(struct intel_engine_cs *engine,
>  struct drm_i915_gem_request *first,
>  struct drm_i915_error_engine *ee)
> @@ -1178,8 +1202,6 @@ static void engine_record_requests(struct 
> intel_engine_cs *engine,
>   count = 0;
>   request = first;
>   list_for_each_entry_from(request, &engine->request_list, link) {
> - struct drm_i915_error_request *erq;
> -
>   if (count >= ee->num_requests) {
>   /*
>