RE: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)
Reviewed-by: Xiangliang Yu Thanks! Xiangliang Yu > -----Original Message----- > From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf > Of Monk Liu > Sent: Wednesday, February 08, 2017 5:27 PM > To: amd-gfx@lists.freedesktop.org > Cc: Liu, Monk > Subject: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2) > > v2: > use in_reset to fix the compute ring test failure which occurred after > FLR/gpu_reset. > > we need to back up a clean state of the MQD which was created in the driver load stage, > and use it in the resume stage, otherwise KCQ and KIQ may both fail in the ring/ib > test. > > Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab > Signed-off-by: Monk Liu > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 44 > ++ > 2 files changed, 35 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 5215fc5..afcae15 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2410,6 +2410,7 @@ int amdgpu_sriov_gpu_reset(struct > amdgpu_device *adev, bool voluntary) > > mutex_lock(>virt.lock_reset); > atomic_inc(>gpu_reset_counter); > + adev->gfx.in_reset = true; > > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(>mman.bdev); > @@ -2494,6 +2495,7 @@ int amdgpu_sriov_gpu_reset(struct > amdgpu_device *adev, bool voluntary) > dev_info(adev->dev, "GPU reset failed\n"); > } > > + adev->gfx.in_reset = false; > mutex_unlock(>virt.lock_reset); > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 6584173..1822420 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct > amdgpu_ring *ring, > struct amdgpu_kiq *kiq = >gfx.kiq; > uint64_t eop_gpu_addr; > bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ); > + 
int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS; > > if (is_kiq) { > eop_gpu_addr = kiq->eop_gpu_addr; > gfx_v8_0_kiq_setting(>ring); > - } else > + } else { > eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + > ring->queue * MEC_HPD_SIZE; > + mqd_idx = ring - >gfx.compute_ring[0]; > + } > > - mutex_lock(>srbm_mutex); > - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); > + if (!adev->gfx.in_reset) { > + memset((void *)mqd, 0, sizeof(*mqd)); > + mutex_lock(>srbm_mutex); > + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); > + gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, > eop_gpu_addr, ring); > + if (is_kiq) > + gfx_v8_0_kiq_init_register(adev, mqd, ring); > + vi_srbm_select(adev, 0, 0, 0, 0); > + mutex_unlock(>srbm_mutex); > > - gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr, > ring); > + if (adev->gfx.mec.mqd_backup[mqd_idx]) > + memcpy(adev->gfx.mec.mqd_backup[mqd_idx], > mqd, sizeof(*mqd)); > + } else { /* for GPU_RESET case */ > + /* reset MQD to a clean status */ > + if (adev->gfx.mec.mqd_backup[mqd_idx]) > + memcpy(mqd, adev- > >gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); > > - if (is_kiq) > - gfx_v8_0_kiq_init_register(adev, mqd, ring); > - > - vi_srbm_select(adev, 0, 0, 0, 0); > - mutex_unlock(>srbm_mutex); > + /* reset ring buffer */ > + ring->wptr = 0; > + amdgpu_ring_clear_ring(ring); > + > + if (is_kiq) { > + mutex_lock(>srbm_mutex); > + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, > 0); > + gfx_v8_0_kiq_init_register(adev, mqd, ring); > + vi_srbm_select(adev, 0, 0, 0, 0); > + mutex_unlock(>srbm_mutex); > + } > + } > > if (is_kiq) > gfx_v8_0_kiq_enable(ring); > @@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct > amdgpu_device *adev) > > ring = >gfx.kiq.ring; > if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr)) { > - memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd)); > r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring- > >mqd_gpu_addr); > amdgpu_bo_kunmap(ring->mqd_obj); > + ring->mqd_ptr = NULL; > if 
(r) > return r; > } else { > @@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct > amdgpu_device *adev) > for (i = 0; i < adev->gfx.num_compute_rings; i++) { >
RE: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2)
> -----Original Message----- > From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf > Of Monk Liu > Sent: Wednesday, February 08, 2017 4:27 AM > To: amd-gfx@lists.freedesktop.org > Cc: Liu, Monk > Subject: [PATCH 11/11] drm/amdgpu:fix kiq_resume routine (V2) > > v2: > use in_reset to fix the compute ring test failure > which occurred after FLR/gpu_reset. > > we need to back up a clean state of the MQD which was created in the driver load > stage, and use it in the resume stage, otherwise KCQ and KIQ may both > fail in the ring/ib test. > > Change-Id: I41be940454a6638e9a8a05f096601eaa1fbebaab > Signed-off-by: Monk Liu Reviewed-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 44 > ++ > 2 files changed, 35 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 5215fc5..afcae15 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2410,6 +2410,7 @@ int amdgpu_sriov_gpu_reset(struct > amdgpu_device *adev, bool voluntary) > > mutex_lock(>virt.lock_reset); > atomic_inc(>gpu_reset_counter); > + adev->gfx.in_reset = true; > > /* block TTM */ > resched = ttm_bo_lock_delayed_workqueue(>mman.bdev); > @@ -2494,6 +2495,7 @@ int amdgpu_sriov_gpu_reset(struct > amdgpu_device *adev, bool voluntary) > dev_info(adev->dev, "GPU reset failed\n"); > } > > + adev->gfx.in_reset = false; > mutex_unlock(>virt.lock_reset); > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 6584173..1822420 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -4877,24 +4877,46 @@ static int gfx_v8_0_kiq_init_queue(struct > amdgpu_ring *ring, > struct amdgpu_kiq *kiq = >gfx.kiq; > uint64_t eop_gpu_addr; > bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ); > + int mqd_idx = 
AMDGPU_MAX_COMPUTE_RINGS; > > if (is_kiq) { > eop_gpu_addr = kiq->eop_gpu_addr; > gfx_v8_0_kiq_setting(>ring); > - } else > + } else { > eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + > ring->queue * MEC_HPD_SIZE; > + mqd_idx = ring - >gfx.compute_ring[0]; > + } > > - mutex_lock(>srbm_mutex); > - vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); > + if (!adev->gfx.in_reset) { > + memset((void *)mqd, 0, sizeof(*mqd)); > + mutex_lock(>srbm_mutex); > + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); > + gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, > eop_gpu_addr, ring); > + if (is_kiq) > + gfx_v8_0_kiq_init_register(adev, mqd, ring); > + vi_srbm_select(adev, 0, 0, 0, 0); > + mutex_unlock(>srbm_mutex); > > - gfx_v8_0_mqd_init(adev, mqd, mqd_gpu_addr, eop_gpu_addr, > ring); > + if (adev->gfx.mec.mqd_backup[mqd_idx]) > + memcpy(adev->gfx.mec.mqd_backup[mqd_idx], > mqd, sizeof(*mqd)); > + } else { /* for GPU_RESET case */ > + /* reset MQD to a clean status */ > + if (adev->gfx.mec.mqd_backup[mqd_idx]) > + memcpy(mqd, adev- > >gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); > > - if (is_kiq) > - gfx_v8_0_kiq_init_register(adev, mqd, ring); > - > - vi_srbm_select(adev, 0, 0, 0, 0); > - mutex_unlock(>srbm_mutex); > + /* reset ring buffer */ > + ring->wptr = 0; > + amdgpu_ring_clear_ring(ring); > + > + if (is_kiq) { > + mutex_lock(>srbm_mutex); > + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, > 0); > + gfx_v8_0_kiq_init_register(adev, mqd, ring); > + vi_srbm_select(adev, 0, 0, 0, 0); > + mutex_unlock(>srbm_mutex); > + } > + } > > if (is_kiq) > gfx_v8_0_kiq_enable(ring); > @@ -4913,9 +4935,9 @@ static int gfx_v8_0_kiq_resume(struct > amdgpu_device *adev) > > ring = >gfx.kiq.ring; > if (!amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr)) { > - memset((void *)ring->mqd_ptr, 0, sizeof(struct vi_mqd)); > r = gfx_v8_0_kiq_init_queue(ring, ring->mqd_ptr, ring- > >mqd_gpu_addr); > amdgpu_bo_kunmap(ring->mqd_obj); > + ring->mqd_ptr = NULL; > if (r) > return 
r; > } else { > @@ -4925,9 +4947,9 @@ static int gfx_v8_0_kiq_resume(struct > amdgpu_device *adev) > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = >gfx.compute_ring[i]; >