RE: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)

2017-02-08 Thread Yu, Xiangliang
Reviewed-by: Xiangliang Yu 



Thanks!
Xiangliang Yu


> -Original Message-
> From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Wednesday, February 08, 2017 5:27 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk 
> Subject: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)
> 
> v2:
> use in_rest to fix compute ring test failure issue which occured after
> FLR/gpu_reset.
> 
> we need backup a clean status of MQD which was created in drv load stage,
> and use it in resume stage, otherwise KCQ and KIQ all may faild in ring/ib 
> test.
> 
> Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab
> Signed-off-by: Monk Liu 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 44
> ++
>  2 files changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5215fc5..afcae15 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2410,6 +2410,7 @@ int amdgpu_sriov_gpu_reset(struct
> amdgpu_device *adev, bool voluntary)
> 
>   mutex_lock(>virt.lock_reset);
>   atomic_inc(>gpu_reset_counter);
> + adev->gfx.in_reset = true;
> 
>   /* block TTM */
>   resched = ttm_bo_lock_delayed_workqueue(>mman.bdev);
> @@ -2494,6 +2495,7 @@ int amdgpu_sriov_gpu_reset(struct
> amdgpu_device *adev, bool voluntary)
>   dev_info(adev->dev, "GPU reset failed\n");
>   }
> 
> + adev->gfx.in_reset = false;
>   mutex_unlock(>virt.lock_reset);
>   return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 6584173..1822420 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct
> amdgpu_ring *ring,
>   struct amdgpu_kiq *kiq = >gfx.kiq;
>   uint64_t eop_gpu_addr;
>   bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
> + int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
> 
>   if (is_kiq) {
>   eop_gpu_addr = kiq->eop_gpu_addr;
>   gfx_v8_0_kiq_setting(>ring);
> - } else
> + } else {
>   eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
>   ring->queue * MEC_HPD_SIZE;
> + mqd_idx = ring - >gfx.compute_ring[0];
> + }
> 
> - mutex_lock(>srbm_mutex);
> - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + if (!adev->gfx.in_reset) {
> + memset((void *)mqd, 0, sizeof(*mqd));
> + mutex_lock(>srbm_mutex);
> + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr,
> eop_gpu_addr, ring);
> + if (is_kiq)
> + gfx_v8_0_kiq_init_register(adev, mqd, ring);
> + vi_srbm_select(adev, 0, 0, 0, 0);
> + mutex_unlock(>srbm_mutex);
> 
> - gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr,
> ring);
> + if (adev->gfx.mec.mqd_backup[mqd_idx])
> + memcpy(adev->gfx.mec.mqd_backup[mqd_idx],
> mqd, sizeof(*mqd));
> + } else { /* for GPU_RESET case */
> + /* reset MQD to a clean status */
> + if (adev->gfx.mec.mqd_backup[mqd_idx])
> + memcpy(mqd, adev-
> >gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
> 
> - if (is_kiq)
> - gfx_v8_0_kiq_init_register(adev, mqd, ring);
> -
> - vi_srbm_select(adev, 0, 0, 0, 0);
> - mutex_unlock(>srbm_mutex);
> + /* reset ring buffer */
> + ring->wptr = 0;
> + amdgpu_ring_clear_ring(ring);
> +
> + if (is_kiq) {
> + mutex_lock(>srbm_mutex);
> + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue,
> 0);
> + gfx_v8_0_kiq_init_register(adev, mqd, ring);
> + vi_srbm_select(adev, 0, 0, 0, 0);
> + mutex_unlock(>srbm_mutex);
> + }
> + }
> 
>   if (is_kiq)
>   gfx_v8_0_kiq_enable(ring);
> @@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
> 
>   ring = >gfx.kiq.ring;
>   if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr)) {
> - memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
>   r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring-
> >mqd_gpu_addr);
>   amdgpu_bo_kunmap(ring->mqd_obj);
> + ring->mqd_ptr = NULL;
>   if (r)
>   return r;
>   } else {
> @@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
>   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> 

RE: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)

2017-02-08 Thread Deucher, Alexander
> -Original Message-
> From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf
> Of Monk Liu
> Sent: Wednesday, February 08, 2017 4:27 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk
> Subject: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)
> 
> v2:
> use in_rest to fix compute ring test failure issue
> which occured after FLR/gpu_reset.
> 
> we need backup a clean status of MQD which was created in drv load
> stage, and use it in resume stage, otherwise KCQ and KIQ all may
> faild in ring/ib test.
> 
> Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab
> Signed-off-by: Monk Liu 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 44
> ++
>  2 files changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5215fc5..afcae15 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2410,6 +2410,7 @@ int amdgpu_sriov_gpu_reset(struct
> amdgpu_device *adev, bool voluntary)
> 
>   mutex_lock(>virt.lock_reset);
>   atomic_inc(>gpu_reset_counter);
> + adev->gfx.in_reset = true;
> 
>   /* block TTM */
>   resched = ttm_bo_lock_delayed_workqueue(>mman.bdev);
> @@ -2494,6 +2495,7 @@ int amdgpu_sriov_gpu_reset(struct
> amdgpu_device *adev, bool voluntary)
>   dev_info(adev->dev, "GPU reset failed\n");
>   }
> 
> + adev->gfx.in_reset = false;
>   mutex_unlock(>virt.lock_reset);
>   return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 6584173..1822420 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct
> amdgpu_ring *ring,
>   struct amdgpu_kiq *kiq = >gfx.kiq;
>   uint64_t eop_gpu_addr;
>   bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
> + int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
> 
>   if (is_kiq) {
>   eop_gpu_addr = kiq->eop_gpu_addr;
>   gfx_v8_0_kiq_setting(>ring);
> - } else
> + } else {
>   eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr +
>   ring->queue * MEC_HPD_SIZE;
> + mqd_idx = ring - >gfx.compute_ring[0];
> + }
> 
> - mutex_lock(>srbm_mutex);
> - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + if (!adev->gfx.in_reset) {
> + memset((void *)mqd, 0, sizeof(*mqd));
> + mutex_lock(>srbm_mutex);
> + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr,
> eop_gpu_addr, ring);
> + if (is_kiq)
> + gfx_v8_0_kiq_init_register(adev, mqd, ring);
> + vi_srbm_select(adev, 0, 0, 0, 0);
> + mutex_unlock(>srbm_mutex);
> 
> - gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr,
> ring);
> + if (adev->gfx.mec.mqd_backup[mqd_idx])
> + memcpy(adev->gfx.mec.mqd_backup[mqd_idx],
> mqd, sizeof(*mqd));
> + } else { /* for GPU_RESET case */
> + /* reset MQD to a clean status */
> + if (adev->gfx.mec.mqd_backup[mqd_idx])
> + memcpy(mqd, adev-
> >gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
> 
> - if (is_kiq)
> - gfx_v8_0_kiq_init_register(adev, mqd, ring);
> -
> - vi_srbm_select(adev, 0, 0, 0, 0);
> - mutex_unlock(>srbm_mutex);
> + /* reset ring buffer */
> + ring->wptr = 0;
> + amdgpu_ring_clear_ring(ring);
> +
> + if (is_kiq) {
> + mutex_lock(>srbm_mutex);
> + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue,
> 0);
> + gfx_v8_0_kiq_init_register(adev, mqd, ring);
> + vi_srbm_select(adev, 0, 0, 0, 0);
> + mutex_unlock(>srbm_mutex);
> + }
> + }
> 
>   if (is_kiq)
>   gfx_v8_0_kiq_enable(ring);
> @@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
> 
>   ring = >gfx.kiq.ring;
>   if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr)) {
> - memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd));
>   r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring-
> >mqd_gpu_addr);
>   amdgpu_bo_kunmap(ring->mqd_obj);
> + ring->mqd_ptr = NULL;
>   if (r)
>   return r;
>   } else {
> @@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct
> amdgpu_device *adev)
>   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>   ring = >gfx.compute_ring[i];
>