Am 2021-10-22 um 1:06 p.m. schrieb Philip Yang:
> A userptr can be unmapped by the application while it is still registered
> with the driver. In that case the restore-userptr work fails to get the
> user pages with an -EFAULT (bad address) error. Treat this error as
> success: a later GPU access to this userptr will cause a VM fault, which
> is better than the application soft-hanging with stalled user mode queues.
>
> Signed-off-by: Philip Yang <philip.y...@amd.com>

Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>


> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 27 ++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       |  3 +++
>  2 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index cdf46bd0d8d5..6f01c6145a87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -2041,19 +2041,26 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
>               /* Get updated user pages */
>               ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>               if (ret) {
> -                     pr_debug("%s: Failed to get user pages: %d\n",
> -                             __func__, ret);
> +                     pr_debug("Failed %d to get user pages\n", ret);
> +
> +                     /* Return -EFAULT bad address error as success. It will
> +                      * fail later with a VM fault if the GPU tries to access
> +                      * it. Better than hanging indefinitely with stalled
> +                      * user mode queues.
> +                      *
> +                      * Return other error -EBUSY or -ENOMEM to retry restore
> +                      */
> +                     if (ret != -EFAULT)
> +                             return ret;
> +             } else {
>  
> -                     /* Return error -EBUSY or -ENOMEM, retry restore */
> -                     return ret;
> +                     /*
> +                      * FIXME: Cannot ignore the return code, must hold
> +                      * notifier_lock
> +                      */
> +                     amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>               }
>  
> -             /*
> -              * FIXME: Cannot ignore the return code, must hold
> -              * notifier_lock
> -              */
> -             amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> -
>               /* Mark the BO as valid unless it was invalidated
>                * again concurrently.
>                */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index d784f8d3a834..ae6694f2c73d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -693,6 +693,9 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
>       r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
>                                      ttm->num_pages, &gtt->range, readonly,
>                                      false, NULL);
> +     if (r)
> +             pr_debug("failed %d to get user pages 0x%llx\n", r, start);
> +
>  out_putmm:
>       mmput(mm);
>  

Reply via email to