[AMD Official Use Only - Internal Distribution Only]

Hi, Andrey,
    
RE- Isn't adev->reset_sem non-recursive? How does this work when you try to 
access registers from within the GPU reset thread while adev->reset_sem is 
already write-locked by amdgpu_device_lock_adev earlier in the same thread?

Dennis: down_read_trylock will fail in this case, returning false immediately 
without taking adev->reset_sem. In the GPU reset thread, we should use MMIO 
instead of KIQ to access registers. 

Best Regards
Dennis Li
-----Original Message-----
From: Grodzovsky, Andrey <andrey.grodzov...@amd.com> 
Sent: Tuesday, September 1, 2020 9:40 AM
To: Li, Dennis <dennis...@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander <alexander.deuc...@amd.com>; Kuehling, Felix 
<felix.kuehl...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; Koenig, 
Christian <christian.koe...@amd.com>
Subject: Re: [PATCH] drm/amdgpu: block ring buffer access during GPU recovery


On 8/31/20 9:17 PM, Dennis Li wrote:
> When the GPU is in reset, its status isn't stable and the ring buffer also 
> needs to be reset when resuming. Therefore the driver should protect the GPU 
> recovery thread from ring buffer accesses by other threads. Otherwise the 
> GPU will randomly hang during recovery.
>
> Signed-off-by: Dennis Li <dennis...@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 172dc47b7f39..8db56a22cd1b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -319,8 +319,13 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, 
> uint32_t reg,
>   {
>       uint32_t ret;
>   
> -     if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -             return amdgpu_kiq_rreg(adev, reg);
> +     if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +             amdgpu_sriov_runtime(adev) &&
> +             down_read_trylock(&adev->reset_sem)) {
> +             ret = amdgpu_kiq_rreg(adev, reg);
> +             up_read(&adev->reset_sem);
> +             return ret;
> +     }


Isn't adev->reset_sem non-recursive? How does this work when you try to access 
registers from within the GPU reset thread while adev->reset_sem is already 
write-locked by amdgpu_device_lock_adev earlier in the same thread?

Andrey


>   
>       if ((reg * 4) < adev->rmmio_size)
>               ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ 
> -332,6 
> +337,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>               ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
>               spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
>       }
> +
>       trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
>       return ret;
>   }
> @@ -407,8 +413,13 @@ void static inline amdgpu_mm_wreg_mmio(struct 
> amdgpu_device *adev, uint32_t reg,
>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>                   uint32_t acc_flags)
>   {
> -     if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> -             return amdgpu_kiq_wreg(adev, reg, v);
> +     if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> +             amdgpu_sriov_runtime(adev) &&
> +             down_read_trylock(&adev->reset_sem)) {
> +             amdgpu_kiq_wreg(adev, reg, v);
> +             up_read(&adev->reset_sem);
> +             return;
> +     }
>   
>       amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index ad9ad622ccce..4ea2a065daa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -287,7 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device 
> *adev, uint32_t vmid,
>        */
>       if (adev->gfx.kiq.ring.sched.ready &&
>           (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
> -         !amdgpu_in_reset(adev)) {
> +         down_read_trylock(&adev->reset_sem)) {
>   
>               struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
>               const unsigned eng = 17;
> @@ -297,6 +297,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct 
> amdgpu_device *adev, uint32_t vmid,
>   
>               amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>                               1 << vmid);
> +
> +             up_read(&adev->reset_sem);
>               return;
>       }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e1a0ae327cf5..33b7cf1c79ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -501,12 +501,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device 
> *adev, uint32_t vmid,
>        */
>       if (adev->gfx.kiq.ring.sched.ready &&
>                       (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) 
> &&
> -                     !amdgpu_in_reset(adev)) {
> +                     down_read_trylock(&adev->reset_sem)) {
>               uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
>               uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
>   
>               amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
>                                                  1 << vmid);
> +             up_read(&adev->reset_sem);
>               return;
>       }
>   
> @@ -599,7 +600,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
>       if (amdgpu_in_reset(adev))
>               return -EIO;
>   
> -     if (ring->sched.ready) {
> +     if (ring->sched.ready &&
> +              down_read_trylock(&adev->reset_sem)) {
>               /* Vega20+XGMI caches PTEs in TC and TLB. Add a
>                * heavy-weight TLB flush (type 2), which flushes
>                * both. Due to a race condition with concurrent @@ -626,6 
> +628,7 
> @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>               if (r) {
>                       amdgpu_ring_undo(ring);
>                       spin_unlock(&adev->gfx.kiq.ring_lock);
> +                     up_read(&adev->reset_sem);
>                       return -ETIME;
>               }
>   
> @@ -634,9 +637,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
> amdgpu_device *adev,
>               r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
>               if (r < 1) {
>                       dev_err(adev->dev, "wait for kiq fence error: %ld.\n", 
> r);
> +                     up_read(&adev->reset_sem);
>                       return -ETIME;
>               }
> -
> +             up_read(&adev->reset_sem);
>               return 0;
>       }
>   
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to