Re: [PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling

2019-06-17 Thread Kuehling, Felix
On 2019-06-14 9:52 p.m., Yang, Philip wrote:
> Under memory pressure, hmm_range_fault may return the error code -ENOMEM
> or -EBUSY. Change pr_info to pr_debug to avoid unnecessary kernel log
> messages, because the restore will be retried.
>
> Calling get_user_pages_done after TTM get user pages has failed produces
> a WARN_ONCE kernel call stack dump in the log, so return the error first.
>
> Change-Id: I086f92944630f9d1a70365c00417cb9440662464
> Signed-off-by: Philip Yang 

Reviewed-by: Felix Kuehling 


> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 +++
>   1 file changed, 6 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 74e86952553f..10abae398e51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>   ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
>  bo->tbo.ttm->pages);
>   if (ret) {
> - bo->tbo.ttm->pages[0] = NULL;
> - pr_info("%s: Failed to get user pages: %d\n",
> + pr_debug("%s: Failed to get user pages: %d\n",
>   __func__, ret);
> - /* Pretend it succeeded. It will fail later
> -  * with a VM fault if the GPU tries to access
> -  * it. Better than hanging indefinitely with
> -  * stalled user mode queues.
> -  */
> - }
> - }
> -
> - return 0;
> -}
>   
> -/* Remove invalid userptr BOs from hmm track list
> - *
> - * Stop HMM track the userptr update
> - */
> -static void untrack_invalid_user_pages(struct amdkfd_process_info 
> *process_info)
> -{
> - struct kgd_mem *mem, *tmp_mem;
> - struct amdgpu_bo *bo;
> + /* Return error -EBUSY or -ENOMEM, retry restore */
> + return ret;
> + }
>   
> - list_for_each_entry_safe(mem, tmp_mem,
> -  &process_info->userptr_inval_list,
> -  validate_list.head) {
> - bo = mem->bo;
>   amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   }
> +
> + return 0;
>   }
>   
>   /* Validate invalid userptr BOs
> @@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct 
> amdkfd_process_info *process_info)
>   list_move_tail(&mem->validate_list.head,
>  &process_info->userptr_valid_list);
>   
> - /* Stop HMM track the userptr update. We dont check the return
> -  * value for concurrent CPU page table update because we will
> -  * reschedule the restore worker if process_info->evicted_bos
> -  * is updated.
> -  */
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> -
>   /* Update mapping. If the BO was not validated
>* (because we couldn't get user pages), this will
>* clear the page table entries, which will result in
> @@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
> work_struct *work)
>   }
>   
>   unlock_out:
> - untrack_invalid_user_pages(process_info);
>   mutex_unlock(&process_info->lock);
>   mmput(mm);
>   put_task_struct(usertask);
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling

2019-06-14 Thread Yang, Philip
Under memory pressure, hmm_range_fault may return the error code -ENOMEM
or -EBUSY. Change pr_info to pr_debug to avoid unnecessary kernel log
messages, because the restore will be retried.

Calling get_user_pages_done after TTM get user pages has failed produces
a WARN_ONCE kernel call stack dump in the log, so return the error first.

Change-Id: I086f92944630f9d1a70365c00417cb9440662464
Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 +++
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 74e86952553f..10abae398e51 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct 
amdkfd_process_info *process_info,
ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
   bo->tbo.ttm->pages);
if (ret) {
-   bo->tbo.ttm->pages[0] = NULL;
-   pr_info("%s: Failed to get user pages: %d\n",
+   pr_debug("%s: Failed to get user pages: %d\n",
__func__, ret);
-   /* Pretend it succeeded. It will fail later
-* with a VM fault if the GPU tries to access
-* it. Better than hanging indefinitely with
-* stalled user mode queues.
-*/
-   }
-   }
-
-   return 0;
-}
 
-/* Remove invalid userptr BOs from hmm track list
- *
- * Stop HMM track the userptr update
- */
-static void untrack_invalid_user_pages(struct amdkfd_process_info 
*process_info)
-{
-   struct kgd_mem *mem, *tmp_mem;
-   struct amdgpu_bo *bo;
+   /* Return error -EBUSY or -ENOMEM, retry restore */
+   return ret;
+   }
 
-   list_for_each_entry_safe(mem, tmp_mem,
-&process_info->userptr_inval_list,
-validate_list.head) {
-   bo = mem->bo;
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
}
+
+   return 0;
 }
 
 /* Validate invalid userptr BOs
@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct 
amdkfd_process_info *process_info)
list_move_tail(&mem->validate_list.head,
   &process_info->userptr_valid_list);
 
-   /* Stop HMM track the userptr update. We dont check the return
-* value for concurrent CPU page table update because we will
-* reschedule the restore worker if process_info->evicted_bos
-* is updated.
-*/
-   amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-
/* Update mapping. If the BO was not validated
 * (because we couldn't get user pages), this will
 * clear the page table entries, which will result in
@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
}
 
 unlock_out:
-   untrack_invalid_user_pages(process_info);
mutex_unlock(&process_info->lock);
mmput(mm);
put_task_struct(usertask);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx