Hi Christian,

Would you review this patch? Just looking at the code, calling
amdgpu_sync_fence with adev=NULL should be OK for us. It's just a bit
unusual compared to amdgpu's usage of this function. We've had this
patch in kfd-staging for a while without problems. If you're OK with
this I'll go ahead and push this upstream as well.

Thanks,
  Felix

On 2018-11-05 8:40 p.m., Kuehling, Felix wrote:
> The adev parameter in amdgpu_sync_fence and amdgpu_sync_resv is only
> needed for updating sync->last_vm_update. This breaks if different
> adevs are passed to calls for the same sync object.
>
> Always pass NULL for calls from KFD because sync objects used for
> KFD don't belong to any particular device, and KFD doesn't need the
> sync->last_vm_update fence.
>
> This fixes kernel log warnings on multi-GPU systems after recent
> changes in amdgpu_amdkfd_gpuvm_restore_process_bos.
>
> Signed-off-by: Felix Kuehling <felix.kuehl...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 28 
> +++++-------------------
>  1 file changed, 5 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d005371..572ac5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -395,23 +395,6 @@ static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
>       return 0;
>  }
>  
> -static int sync_vm_fence(struct amdgpu_device *adev, struct amdgpu_sync 
> *sync,
> -                      struct dma_fence *f)
> -{
> -     int ret = amdgpu_sync_fence(adev, sync, f, false);
> -
> -     /* Sync objects can't handle multiple GPUs (contexts) updating
> -      * sync->last_vm_update. Fortunately we don't need it for
> -      * KFD's purposes, so we can just drop that fence.
> -      */
> -     if (sync->last_vm_update) {
> -             dma_fence_put(sync->last_vm_update);
> -             sync->last_vm_update = NULL;
> -     }
> -
> -     return ret;
> -}
> -
>  static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
>  {
>       struct amdgpu_bo *pd = vm->root.base.bo;
> @@ -422,7 +405,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct 
> amdgpu_sync *sync)
>       if (ret)
>               return ret;
>  
> -     return sync_vm_fence(adev, sync, vm->last_update);
> +     return amdgpu_sync_fence(NULL, sync, vm->last_update, false);
>  }
>  
>  /* add_bo_to_vm - Add a BO to a VM
> @@ -826,7 +809,7 @@ static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
>       /* Add the eviction fence back */
>       amdgpu_bo_fence(pd, &vm->process_info->eviction_fence->base, true);
>  
> -     sync_vm_fence(adev, sync, bo_va->last_pt_update);
> +     amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
>  
>       return 0;
>  }
> @@ -851,7 +834,7 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
>               return ret;
>       }
>  
> -     return sync_vm_fence(adev, sync, bo_va->last_pt_update);
> +     return amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
>  }
>  
>  static int map_bo_to_gpuvm(struct amdgpu_device *adev,
> @@ -911,7 +894,7 @@ static int process_sync_pds_resv(struct 
> amdkfd_process_info *process_info,
>                           vm_list_node) {
>               struct amdgpu_bo *pd = peer_vm->root.base.bo;
>  
> -             ret = amdgpu_sync_resv(amdgpu_ttm_adev(pd->tbo.bdev),
> +             ret = amdgpu_sync_resv(NULL,
>                                       sync, pd->tbo.resv,
>                                       AMDGPU_FENCE_OWNER_UNDEFINED, false);
>               if (ret)
> @@ -2084,8 +2067,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
> struct dma_fence **ef)
>                       pr_debug("Memory eviction: Validate BOs failed. Try 
> again\n");
>                       goto validate_map_fail;
>               }
> -             ret = amdgpu_sync_fence(amdgpu_ttm_adev(bo->tbo.bdev),
> -                                     &sync_obj, bo->tbo.moving, false);
> +             ret = amdgpu_sync_fence(NULL, &sync_obj, bo->tbo.moving, false);
>               if (ret) {
>                       pr_debug("Memory eviction: Sync BO fence failed. Try 
> again\n");
>                       goto validate_map_fail;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to