Re: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Fix GuC HW/SW state cleanup in unload path

2017-09-07 Thread Michal Wajdeczko
On Fri, Sep 01, 2017 at 11:02:11AM +0530, Sagar Arun Kamble wrote:
> Teardown of GuC HW/SW state was not properly done in unload path.
> During unload, we can rely on intel_guc_reset_prepare being done
> as part of i915_gem_suspend for disabling GuC interfaces.
> We will have to disable GuC submission prior to suspend as that involves
> communication with GuC to destroy doorbell. So intel_uc_fini_hw has to
> be called as part of i915_gem_suspend during unload as that really
> takes care of finishing the GuC operations. Created new parameter for
> i915_gem_suspend to handle unload/suspend path w.r.t gem and GuC suspend.
> GuC related allocations are cleaned up as part of intel_uc_cleanup_hw.
> 
> v2: Prepared i915_gem_unload. (Michal)
> 
> Cc: Chris Wilson 
> Cc: Michal Wajdeczko 
> Cc: Daniele Ceraolo Spurio 
> Signed-off-by: Sagar Arun Kamble 
> ---
>  drivers/gpu/drm/i915/i915_drv.c|  6 +--
>  drivers/gpu/drm/i915/i915_drv.h|  1 +
>  drivers/gpu/drm/i915/i915_gem.c| 79 
> ++
>  drivers/gpu/drm/i915/intel_guc.c   | 13 ++
>  drivers/gpu/drm/i915/intel_guc.h   |  1 +
>  drivers/gpu/drm/i915/intel_uc.c| 14 +++---
>  drivers/gpu/drm/i915/intel_uc_common.h |  1 +
>  7 files changed, 103 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index b2e8f95..b6cc2fe 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -601,7 +601,7 @@ static void i915_gem_fini(struct drm_i915_private 
> *dev_priv)
>   i915_gem_drain_workqueue(dev_priv);
>  
> - mutex_lock(&dev_priv->drm.struct_mutex);
> - intel_uc_fini_hw(dev_priv);
> + intel_uc_cleanup_hw(dev_priv);
>   i915_gem_cleanup_engines(dev_priv);
>   i915_gem_contexts_fini(dev_priv);
>   i915_gem_cleanup_userptr(dev_priv);
> @@ -682,7 +682,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
>   return 0;
>  
>  cleanup_gem:
> - if (i915_gem_suspend(dev_priv))
> + if (i915_gem_unload(dev_priv))
>   DRM_ERROR("failed to idle hardware; continuing to unload!\n");
>   i915_gem_fini(dev_priv);
>  cleanup_uc:
> @@ -1375,7 +1375,7 @@ void i915_driver_unload(struct drm_device *dev)
>  
>   i915_driver_unregister(dev_priv);
>  
> - if (i915_gem_suspend(dev_priv))
> + if (i915_gem_unload(dev_priv))
>   DRM_ERROR("failed to idle hardware; continuing to unload!\n");
>  
>   intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 064bf0f..570e71e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3628,6 +3628,7 @@ void i915_gem_reset_engine(struct intel_engine_cs 
> *engine,
>  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
>  unsigned int flags);
>  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
> +int __must_check i915_gem_unload(struct drm_i915_private *dev_priv);
>  void i915_gem_resume(struct drm_i915_private *dev_priv);
>  int i915_gem_fault(struct vm_fault *vmf);
>  int i915_gem_object_wait(struct drm_i915_gem_object *obj,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 977500f..24cefe9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4624,6 +4624,85 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>   return ret;
>  }
>  
> +int i915_gem_unload(struct drm_i915_private *dev_priv)
> +{
> + struct drm_device *dev = &dev_priv->drm;
> + int ret;
> +
> + intel_runtime_pm_get(dev_priv);
> + intel_suspend_gt_powersave(dev_priv);
> +
> + mutex_lock(&dev->struct_mutex);
> +
> + /* We have to flush all the executing contexts to main memory so
> +  * that they can be saved in the hibernation image. To ensure the last
> +  * context image is coherent, we have to switch away from it. That
> +  * leaves the dev_priv->kernel_context still active when
> +  * we actually suspend, and its image in memory may not match the GPU
> +  * state. Fortunately, the kernel_context is disposable and we do
> +  * not rely on its state.
> +  */
> + ret = i915_gem_switch_to_kernel_context(dev_priv);
> + if (ret)
> + goto err_unlock;
> +
> + ret = i915_gem_wait_for_idle(dev_priv,
> +  I915_WAIT_INTERRUPTIBLE |
> +  I915_WAIT_LOCKED);
> + if (ret)
> + goto err_unlock;
> +
> + assert_kernel_context_is_current(dev_priv);
> + i915_gem_contexts_lost(dev_priv);
> + mutex_unlock(&dev->struct_mutex);
> +
> + cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
> + 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Fix GuC HW/SW state cleanup in unload path

2017-09-07 Thread Kamble, Sagar A



On 9/5/2017 8:24 PM, Michał Winiarski wrote:

On Fri, Sep 01, 2017 at 11:02:11AM +0530, Sagar Arun Kamble wrote:

Teardown of GuC HW/SW state was not properly done in unload path.
During unload, we can rely on intel_guc_reset_prepare being done
as part of i915_gem_suspend for disabling GuC interfaces.
We will have to disable GuC submission prior to suspend as that involves
communication with GuC to destroy doorbell. So intel_uc_fini_hw has to
be called as part of i915_gem_suspend during unload as that really
takes care of finishing the GuC operations. Created new parameter for
i915_gem_suspend to handle unload/suspend path w.r.t gem and GuC suspend.
GuC related allocations are cleaned up as part of intel_uc_cleanup_hw.

Is this still accurate description? After changes from v2?

No. Sorry. Will update this in the new version.



v2: Prepared i915_gem_unload. (Michal)

Cc: Chris Wilson 
Cc: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Sagar Arun Kamble 
---
  drivers/gpu/drm/i915/i915_drv.c|  6 +--
  drivers/gpu/drm/i915/i915_drv.h|  1 +
  drivers/gpu/drm/i915/i915_gem.c| 79 ++
  drivers/gpu/drm/i915/intel_guc.c   | 13 ++
  drivers/gpu/drm/i915/intel_guc.h   |  1 +
  drivers/gpu/drm/i915/intel_uc.c| 14 +++---
  drivers/gpu/drm/i915/intel_uc_common.h |  1 +
  7 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b2e8f95..b6cc2fe 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -601,7 +601,7 @@ static void i915_gem_fini(struct drm_i915_private *dev_priv)
i915_gem_drain_workqueue(dev_priv);
  
  	mutex_lock(&dev_priv->drm.struct_mutex);

-   intel_uc_fini_hw(dev_priv);
+   intel_uc_cleanup_hw(dev_priv);
i915_gem_cleanup_engines(dev_priv);
i915_gem_contexts_fini(dev_priv);
i915_gem_cleanup_userptr(dev_priv);
@@ -682,7 +682,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
return 0;
  
  cleanup_gem:

-   if (i915_gem_suspend(dev_priv))
+   if (i915_gem_unload(dev_priv))
DRM_ERROR("failed to idle hardware; continuing to unload!\n");
i915_gem_fini(dev_priv);
  cleanup_uc:
@@ -1375,7 +1375,7 @@ void i915_driver_unload(struct drm_device *dev)
  
  	i915_driver_unregister(dev_priv);
  
-	if (i915_gem_suspend(dev_priv))

+   if (i915_gem_unload(dev_priv))
DRM_ERROR("failed to idle hardware; continuing to unload!\n");
  
  	intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 064bf0f..570e71e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3628,6 +3628,7 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine,
  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
   unsigned int flags);
  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
+int __must_check i915_gem_unload(struct drm_i915_private *dev_priv);
  void i915_gem_resume(struct drm_i915_private *dev_priv);
  int i915_gem_fault(struct vm_fault *vmf);
  int i915_gem_object_wait(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 977500f..24cefe9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4624,6 +4624,85 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
return ret;
  }
  
+int i915_gem_unload(struct drm_i915_private *dev_priv)

+{
+   struct drm_device *dev = &dev_priv->drm;
+   int ret;
+
+   intel_runtime_pm_get(dev_priv);
+   intel_suspend_gt_powersave(dev_priv);
+
+   mutex_lock(&dev->struct_mutex);
+
+   /* We have to flush all the executing contexts to main memory so
+* that they can be saved in the hibernation image. To ensure the last
+* context image is coherent, we have to switch away from it. That
+* leaves the dev_priv->kernel_context still active when
+* we actually suspend, and its image in memory may not match the GPU
+* state. Fortunately, the kernel_context is disposable and we do
+* not rely on its state.
+*/
+   ret = i915_gem_switch_to_kernel_context(dev_priv);
+   if (ret)
+   goto err_unlock;
+
+   ret = i915_gem_wait_for_idle(dev_priv,
+I915_WAIT_INTERRUPTIBLE |
+I915_WAIT_LOCKED);
+   if (ret)
+   goto err_unlock;
+
+   assert_kernel_context_is_current(dev_priv);
+   i915_gem_contexts_lost(dev_priv);
+   mutex_unlock(&dev->struct_mutex);
+
+   cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
+   

Re: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Fix GuC HW/SW state cleanup in unload path

2017-09-05 Thread Michał Winiarski
On Fri, Sep 01, 2017 at 11:02:11AM +0530, Sagar Arun Kamble wrote:
> Teardown of GuC HW/SW state was not properly done in unload path.
> During unload, we can rely on intel_guc_reset_prepare being done
> as part of i915_gem_suspend for disabling GuC interfaces.
> We will have to disable GuC submission prior to suspend as that involves
> communication with GuC to destroy doorbell. So intel_uc_fini_hw has to
> be called as part of i915_gem_suspend during unload as that really
> takes care of finishing the GuC operations. Created new parameter for
> i915_gem_suspend to handle unload/suspend path w.r.t gem and GuC suspend.
> GuC related allocations are cleaned up as part of intel_uc_cleanup_hw.

Is this still accurate description? After changes from v2?

> 
> v2: Prepared i915_gem_unload. (Michal)
> 
> Cc: Chris Wilson 
> Cc: Michal Wajdeczko 
> Cc: Daniele Ceraolo Spurio 
> Signed-off-by: Sagar Arun Kamble 
> ---
>  drivers/gpu/drm/i915/i915_drv.c|  6 +--
>  drivers/gpu/drm/i915/i915_drv.h|  1 +
>  drivers/gpu/drm/i915/i915_gem.c| 79 
> ++
>  drivers/gpu/drm/i915/intel_guc.c   | 13 ++
>  drivers/gpu/drm/i915/intel_guc.h   |  1 +
>  drivers/gpu/drm/i915/intel_uc.c| 14 +++---
>  drivers/gpu/drm/i915/intel_uc_common.h |  1 +
>  7 files changed, 103 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index b2e8f95..b6cc2fe 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -601,7 +601,7 @@ static void i915_gem_fini(struct drm_i915_private 
> *dev_priv)
>   i915_gem_drain_workqueue(dev_priv);
>  
> - mutex_lock(&dev_priv->drm.struct_mutex);
> - intel_uc_fini_hw(dev_priv);
> + intel_uc_cleanup_hw(dev_priv);
>   i915_gem_cleanup_engines(dev_priv);
>   i915_gem_contexts_fini(dev_priv);
>   i915_gem_cleanup_userptr(dev_priv);
> @@ -682,7 +682,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
>   return 0;
>  
>  cleanup_gem:
> - if (i915_gem_suspend(dev_priv))
> + if (i915_gem_unload(dev_priv))
>   DRM_ERROR("failed to idle hardware; continuing to unload!\n");
>   i915_gem_fini(dev_priv);
>  cleanup_uc:
> @@ -1375,7 +1375,7 @@ void i915_driver_unload(struct drm_device *dev)
>  
>   i915_driver_unregister(dev_priv);
>  
> - if (i915_gem_suspend(dev_priv))
> + if (i915_gem_unload(dev_priv))
>   DRM_ERROR("failed to idle hardware; continuing to unload!\n");
>  
>   intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 064bf0f..570e71e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3628,6 +3628,7 @@ void i915_gem_reset_engine(struct intel_engine_cs 
> *engine,
>  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
>  unsigned int flags);
>  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
> +int __must_check i915_gem_unload(struct drm_i915_private *dev_priv);
>  void i915_gem_resume(struct drm_i915_private *dev_priv);
>  int i915_gem_fault(struct vm_fault *vmf);
>  int i915_gem_object_wait(struct drm_i915_gem_object *obj,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 977500f..24cefe9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4624,6 +4624,85 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>   return ret;
>  }
>  
> +int i915_gem_unload(struct drm_i915_private *dev_priv)
> +{
> + struct drm_device *dev = &dev_priv->drm;
> + int ret;
> +
> + intel_runtime_pm_get(dev_priv);
> + intel_suspend_gt_powersave(dev_priv);
> +
> + mutex_lock(&dev->struct_mutex);
> +
> + /* We have to flush all the executing contexts to main memory so
> +  * that they can be saved in the hibernation image. To ensure the last
> +  * context image is coherent, we have to switch away from it. That
> +  * leaves the dev_priv->kernel_context still active when
> +  * we actually suspend, and its image in memory may not match the GPU
> +  * state. Fortunately, the kernel_context is disposable and we do
> +  * not rely on its state.
> +  */
> + ret = i915_gem_switch_to_kernel_context(dev_priv);
> + if (ret)
> + goto err_unlock;
> +
> + ret = i915_gem_wait_for_idle(dev_priv,
> +  I915_WAIT_INTERRUPTIBLE |
> +  I915_WAIT_LOCKED);
> + if (ret)
> + goto err_unlock;
> +
> + assert_kernel_context_is_current(dev_priv);
> + i915_gem_contexts_lost(dev_priv);
> + mutex_unlock(&dev->struct_mutex);
> +
> + 

[Intel-gfx] [PATCH 3/4] drm/i915/guc: Fix GuC HW/SW state cleanup in unload path

2017-08-31 Thread Sagar Arun Kamble
Teardown of GuC HW/SW state was not properly done in unload path.
During unload, we can rely on intel_guc_reset_prepare being done
as part of i915_gem_suspend for disabling GuC interfaces.
We will have to disable GuC submission prior to suspend as that involves
communication with GuC to destroy doorbell. So intel_uc_fini_hw has to
be called as part of i915_gem_suspend during unload as that really
takes care of finishing the GuC operations. Created new parameter for
i915_gem_suspend to handle unload/suspend path w.r.t gem and GuC suspend.
GuC related allocations are cleaned up as part of intel_uc_cleanup_hw.

v2: Prepared i915_gem_unload. (Michal)

Cc: Chris Wilson 
Cc: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Sagar Arun Kamble 
---
 drivers/gpu/drm/i915/i915_drv.c|  6 +--
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_gem.c| 79 ++
 drivers/gpu/drm/i915/intel_guc.c   | 13 ++
 drivers/gpu/drm/i915/intel_guc.h   |  1 +
 drivers/gpu/drm/i915/intel_uc.c| 14 +++---
 drivers/gpu/drm/i915/intel_uc_common.h |  1 +
 7 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b2e8f95..b6cc2fe 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -601,7 +601,7 @@ static void i915_gem_fini(struct drm_i915_private *dev_priv)
i915_gem_drain_workqueue(dev_priv);
 
 mutex_lock(&dev_priv->drm.struct_mutex);
-   intel_uc_fini_hw(dev_priv);
+   intel_uc_cleanup_hw(dev_priv);
i915_gem_cleanup_engines(dev_priv);
i915_gem_contexts_fini(dev_priv);
i915_gem_cleanup_userptr(dev_priv);
@@ -682,7 +682,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
return 0;
 
 cleanup_gem:
-   if (i915_gem_suspend(dev_priv))
+   if (i915_gem_unload(dev_priv))
DRM_ERROR("failed to idle hardware; continuing to unload!\n");
i915_gem_fini(dev_priv);
 cleanup_uc:
@@ -1375,7 +1375,7 @@ void i915_driver_unload(struct drm_device *dev)
 
i915_driver_unregister(dev_priv);
 
-   if (i915_gem_suspend(dev_priv))
+   if (i915_gem_unload(dev_priv))
DRM_ERROR("failed to idle hardware; continuing to unload!\n");
 
intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 064bf0f..570e71e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3628,6 +3628,7 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine,
 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
   unsigned int flags);
 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
+int __must_check i915_gem_unload(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
 int i915_gem_fault(struct vm_fault *vmf);
 int i915_gem_object_wait(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 977500f..24cefe9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4624,6 +4624,85 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
return ret;
 }
 
+int i915_gem_unload(struct drm_i915_private *dev_priv)
+{
+   struct drm_device *dev = &dev_priv->drm;
+   int ret;
+
+   intel_runtime_pm_get(dev_priv);
+   intel_suspend_gt_powersave(dev_priv);
+
+   mutex_lock(&dev->struct_mutex);
+
+   /* We have to flush all the executing contexts to main memory so
+* that they can be saved in the hibernation image. To ensure the last
+* context image is coherent, we have to switch away from it. That
+* leaves the dev_priv->kernel_context still active when
+* we actually suspend, and its image in memory may not match the GPU
+* state. Fortunately, the kernel_context is disposable and we do
+* not rely on its state.
+*/
+   ret = i915_gem_switch_to_kernel_context(dev_priv);
+   if (ret)
+   goto err_unlock;
+
+   ret = i915_gem_wait_for_idle(dev_priv,
+I915_WAIT_INTERRUPTIBLE |
+I915_WAIT_LOCKED);
+   if (ret)
+   goto err_unlock;
+
+   assert_kernel_context_is_current(dev_priv);
+   i915_gem_contexts_lost(dev_priv);
+   mutex_unlock(&dev->struct_mutex);
+
+   cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
+   cancel_delayed_work_sync(&dev_priv->gt.retire_work);
+
+   /* As the idle_work is rearming if it detects a race, play safe and
+* repeat the flush until it is definitely idle.
+*/
+   while