Re: [Mesa-dev] [PATCH] i965: Be resilient in the face of GPU hangs

2019-02-17 Thread Kenneth Graunke
On Saturday, February 16, 2019 4:46:27 AM PST Chris Wilson wrote:
> If we hang the GPU and end up banning our context, we will no longer be
> able to submit and abort with an error (exit(1) no less). As we submit
> minimal incremental batches that rely on the logical context state of
> previous batches, we can not rely on the kernel's recovery mechanism
> which tries to restore the context back to a "golden" renderstate (the
> default HW setup) and replay the batches in flight. Instead, we must
> create a new context and set it up, including all the lost register
> settings that we only apply once during setup, before allowing the user to
> continue rendering. The batches already submitted are lost
> (unrecoverable) so there will be a momentary glitch and lost rendering
> across frames, but the application should be able to recover and
> continue on fairly oblivious.
> 
> To make wedging even more likely, we use a new "no recovery" context
> parameter that tells the kernel to not even attempt to replay any
> batches in flight against the default context image, as experience shows
> the HW is not always robust enough to cope with the conflicting state.
> 
> v2: Export brw_reset_state() to improve the amount of state we clobber
> on return to a starting context. (Kenneth)
> 
> Cc: Kenneth Graunke 
> ---
> The intent was to refactor the existing brw_reset_state() out of
> brw_init_state() so that we could reuse, so reuse it!
> ---
>  src/mesa/drivers/dri/i965/brw_bufmgr.c| 25 +++
>  src/mesa/drivers/dri/i965/brw_bufmgr.h|  2 ++
>  src/mesa/drivers/dri/i965/brw_context.h   |  3 +++
>  src/mesa/drivers/dri/i965/brw_state_upload.c  | 22 
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 +++
>  5 files changed, 67 insertions(+), 5 deletions(-)

Even better, thanks!

Reviewed-by: Kenneth Graunke 


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] i965: Be resilient in the face of GPU hangs

2019-02-16 Thread Chris Wilson
If we hang the GPU and end up banning our context, we will no longer be
able to submit and abort with an error (exit(1) no less). As we submit
minimal incremental batches that rely on the logical context state of
previous batches, we can not rely on the kernel's recovery mechanism
which tries to restore the context back to a "golden" renderstate (the
default HW setup) and replay the batches in flight. Instead, we must
create a new context and set it up, including all the lost register
settings that we only apply once during setup, before allowing the user to
continue rendering. The batches already submitted are lost
(unrecoverable) so there will be a momentary glitch and lost rendering
across frames, but the application should be able to recover and
continue on fairly oblivious.

To make wedging even more likely, we use a new "no recovery" context
parameter that tells the kernel to not even attempt to replay any
batches in flight against the default context image, as experience shows
the HW is not always robust enough to cope with the conflicting state.

v2: Export brw_reset_state() to improve the amount of state we clobber
on return to a starting context. (Kenneth)

Cc: Kenneth Graunke 
---
The intent was to refactor the existing brw_reset_state() out of
brw_init_state() so that we could reuse, so reuse it!
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c| 25 +++
 src/mesa/drivers/dri/i965/brw_bufmgr.h|  2 ++
 src/mesa/drivers/dri/i965/brw_context.h   |  3 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c  | 22 
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 +++
 5 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index b33a30930db..d8a9f0c450d 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1589,6 +1589,16 @@ init_cache_buckets(struct brw_bufmgr *bufmgr)
}
 }
 
+static void init_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+  .ctx_id = ctx_id,
+  .param = 0x8, // I915_CONTEXT_PARAM_RECOVERABLE,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+}
+
 uint32_t
 brw_create_hw_context(struct brw_bufmgr *bufmgr)
 {
@@ -1599,6 +1609,8 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
   return 0;
}
 
+   init_context(bufmgr, create.ctx_id);
+
return create.ctx_id;
 }
 
@@ -1621,6 +1633,19 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
return err;
 }
 
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+  .ctx_id = ctx_id,
+  .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+
+   return p.value; /* on error, return 0 i.e. default priority */
+}
+
 void
 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 32fc7a553c9..886b2e607ce 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -356,6 +356,8 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
 int priority);
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
 void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 66fe5b3a8a0..4a306c4217a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1647,6 +1647,9 @@ brw_get_graphics_reset_status(struct gl_context *ctx);
 void
 brw_check_for_reset(struct brw_context *brw);
 
+void
+brw_reset_state(struct brw_context *brw);
+
 /* brw_compute.c */
 extern void
 brw_init_compute_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 50049d325b3..a320c24edc5 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -228,12 +228,8 @@ brw_copy_pipeline_atoms(struct brw_context *brw,
 
 void brw_init_state( struct brw_context *brw )
 {
-   struct gl_context *ctx = &brw->ctx;
const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   /* Force the first brw_select_pipeline to emit pipeline select */
-   brw->last_pipeline = BRW_NUM_PIPELINES;
-
brw_init_caches(brw);
 
if (devinfo->gen >= 11)
@@ -257,6 +253,16 @@ void brw_init_state( struct brw_context *brw )
else
   gen4_init_atoms(brw);
 
+   brw_reset_state(brw);
+}
+
+void brw_reset_state( struct brw_context *brw )
+{
+   

Re: [Mesa-dev] [PATCH] i965: Be resilient in the face of GPU hangs

2018-12-04 Thread Chris Wilson
Quoting Chris Wilson (2018-10-24 09:40:08)
> If we hang the GPU and end up banning our context, we will no longer be
> able to submit and abort with an error (exit(1) no less). As we submit
> minimal incremental batches that rely on the logical context state of
> previous batches, we can not rely on the kernel's recovery mechanism
> which tries to restore the context back to a "golden" renderstate (the
> default HW setup) and replay the batches in flight. Instead, we must
> create a new context and set it up, including all the lost register
> settings that we only apply once during setup, before allowing the user to
> continue rendering. The batches already submitted are lost
> (unrecoverable) so there will be a momentary glitch and lost rendering
> across frames, but the application should be able to recover and
> continue on fairly oblivious.
> 
> To make wedging even more likely, we use a new "no recovery" context
> parameter that tells the kernel to not even attempt to replay any
> batches in flight against the default context image, as experience shows
> the HW is not always robust enough to cope with the conflicting state.
> 
> Cc: Kenneth Graunke 

So, give or take some forgotten state that is not reset on context reload,
what do you think about this as a stepping stone to handling GPU resets
robustly?

> ---
>  src/mesa/drivers/dri/i965/brw_bufmgr.c| 25 +++
>  src/mesa/drivers/dri/i965/brw_bufmgr.h|  2 ++
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 19 ++
>  3 files changed, 46 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
> b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> index f1675b191c1..328393e2ade 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> @@ -1589,6 +1589,16 @@ init_cache_buckets(struct brw_bufmgr *bufmgr)
> }
>  }
>  
> +static void init_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
> +{
> +   struct drm_i915_gem_context_param p = {
> +  .ctx_id = ctx_id,
> +  .param = 0x7, // I915_CONTEXT_PARAM_RECOVERABLE,
> +   };
> +
> +   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
> +}
> +
>  uint32_t
>  brw_create_hw_context(struct brw_bufmgr *bufmgr)
>  {
> @@ -1599,6 +1609,8 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
>return 0;
> }
>  
> +   init_context(bufmgr, create.ctx_id);
> +
> return create.ctx_id;
>  }
>  
> @@ -1621,6 +1633,19 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
> return err;
>  }
>  
> +int
> +brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
> +{
> +   struct drm_i915_gem_context_param p = {
> +  .ctx_id = ctx_id,
> +  .param = I915_CONTEXT_PARAM_PRIORITY,
> +   };
> +
> +   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
> +
> +   return p.value; /* on error, return 0 i.e. default priority */
> +}
> +
>  void
>  brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
>  {
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
> b/src/mesa/drivers/dri/i965/brw_bufmgr.h
> index 32fc7a553c9..886b2e607ce 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
> @@ -356,6 +356,8 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
>  int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
>  uint32_t ctx_id,
>  int priority);
> +int
> +brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
>  
>  void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
>  
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c 
> b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 4363b146150..73c2bbab18e 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -735,6 +735,18 @@ execbuffer(int fd,
> return ret;
>  }
>  
> +static void recreate_context(struct brw_context *brw)
> +{
> +   struct brw_bufmgr *bufmgr = brw->bufmgr;
> +   int prio;
> +
> +   prio = brw_hw_context_get_priority(bufmgr, brw->hw_ctx);
> +   brw_destroy_hw_context(bufmgr, brw->hw_ctx);
> +
> +   brw->hw_ctx = brw_create_hw_context(bufmgr);
> +   brw_hw_context_set_priority(bufmgr, brw->hw_ctx, prio);
> +}
> +
>  static int
>  submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
>  {
> @@ -821,6 +833,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, 
> int *out_fence_fd)
> if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
>brw_check_for_reset(brw);
>  
> +   if (ret == -EIO) {
> +  recreate_context(brw);
> +  brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
> +  brw_upload_invariant_state(brw);
> +  ret = 0;
> +   }
> +
> if (ret != 0) {
>fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
>strerror(-ret));
> -- 
> 2.19.1
> 

[Mesa-dev] [PATCH] i965: Be resilient in the face of GPU hangs

2018-10-24 Thread Chris Wilson
If we hang the GPU and end up banning our context, we will no longer be
able to submit and abort with an error (exit(1) no less). As we submit
minimal incremental batches that rely on the logical context state of
previous batches, we can not rely on the kernel's recovery mechanism
which tries to restore the context back to a "golden" renderstate (the
default HW setup) and replay the batches in flight. Instead, we must
create a new context and set it up, including all the lost register
settings that we only apply once during setup, before allowing the user to
continue rendering. The batches already submitted are lost
(unrecoverable) so there will be a momentary glitch and lost rendering
across frames, but the application should be able to recover and
continue on fairly oblivious.

To make wedging even more likely, we use a new "no recovery" context
parameter that tells the kernel to not even attempt to replay any
batches in flight against the default context image, as experience shows
the HW is not always robust enough to cope with the conflicting state.

Cc: Kenneth Graunke 
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c| 25 +++
 src/mesa/drivers/dri/i965/brw_bufmgr.h|  2 ++
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 19 ++
 3 files changed, 46 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index f1675b191c1..328393e2ade 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1589,6 +1589,16 @@ init_cache_buckets(struct brw_bufmgr *bufmgr)
}
 }
 
+static void init_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+  .ctx_id = ctx_id,
+  .param = 0x7, // I915_CONTEXT_PARAM_RECOVERABLE,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+}
+
 uint32_t
 brw_create_hw_context(struct brw_bufmgr *bufmgr)
 {
@@ -1599,6 +1609,8 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
   return 0;
}
 
+   init_context(bufmgr, create.ctx_id);
+
return create.ctx_id;
 }
 
@@ -1621,6 +1633,19 @@ brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
return err;
 }
 
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param p = {
+  .ctx_id = ctx_id,
+  .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+
+   return p.value; /* on error, return 0 i.e. default priority */
+}
+
 void
 brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 32fc7a553c9..886b2e607ce 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -356,6 +356,8 @@ uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
 int priority);
+int
+brw_hw_context_get_priority(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
 void brw_destroy_hw_context(struct brw_bufmgr *bufmgr, uint32_t ctx_id);
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c 
b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 4363b146150..73c2bbab18e 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -735,6 +735,18 @@ execbuffer(int fd,
return ret;
 }
 
+static void recreate_context(struct brw_context *brw)
+{
+   struct brw_bufmgr *bufmgr = brw->bufmgr;
+   int prio;
+
+   prio = brw_hw_context_get_priority(bufmgr, brw->hw_ctx);
+   brw_destroy_hw_context(bufmgr, brw->hw_ctx);
+
+   brw->hw_ctx = brw_create_hw_context(bufmgr);
+   brw_hw_context_set_priority(bufmgr, brw->hw_ctx, prio);
+}
+
 static int
 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 {
@@ -821,6 +833,13 @@ submit_batch(struct brw_context *brw, int in_fence_fd, int 
*out_fence_fd)
if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
   brw_check_for_reset(brw);
 
+   if (ret == -EIO) {
+  recreate_context(brw);
+  brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
+  brw_upload_invariant_state(brw);
+  ret = 0;
+   }
+
if (ret != 0) {
   fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
   strerror(-ret));
-- 
2.19.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev