On 2018-01-27 04:16 AM, Christian König wrote: > Am 27.01.2018 um 02:09 schrieb Felix Kuehling: [snip] >> +struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, >> + void *mm) >> +{ >> + struct amdgpu_amdkfd_fence *fence = NULL; >> + >> + fence = kzalloc(sizeof(*fence), GFP_KERNEL); >> + if (fence == NULL) >> + return NULL; >> + >> + /* mm_struct mm is used as void pointer to identify the parent >> + * KFD process. Don't dereference it. Fence and any threads using >> + * mm is guranteed to be released before process termination. >> + */ >> + fence->mm = mm; > > That won't work. Fences can live much longer than any process who > created them. > > I've already found a fence in a BO still living hours after the > process was killed and the pid long recycled. > > I suggest to make fence->mm a real mm_struct pointer with reference > counting and then set it to NULL and drop the reference in > enable_signaling.
I agree. But enable_signaling may be too early to drop the reference. amd_kfd_fence_check_mm could still be called later from amdgpu_ttm_bo_eviction_valuable, as long as the fence hasn't signaled yet. The safe place is problably in amd_kfd_fence_release. > >> + get_task_comm(fence->timeline_name, current); >> + spin_lock_init(&fence->lock); >> + >> + dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock, >> + context, atomic_inc_return(&fence_seq)); >> + >> + return fence; >> +} >> + >> +struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f) >> +{ >> + struct amdgpu_amdkfd_fence *fence; >> + >> + if (!f) >> + return NULL; >> + >> + fence = container_of(f, struct amdgpu_amdkfd_fence, base); >> + if (fence && f->ops == &amd_kfd_fence_ops) >> + return fence; >> + >> + return NULL; >> +} >> + >> +static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f) >> +{ >> + return "amdgpu_amdkfd_fence"; >> +} >> + >> +static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f) >> +{ >> + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >> + >> + return fence->timeline_name; >> +} >> + >> +/** >> + * amd_kfd_fence_enable_signaling - This gets called when TTM wants >> to evict >> + * a KFD BO and schedules a job to move the BO. >> + * If fence is already signaled return true. >> + * If fence is not signaled schedule a evict KFD process work item. >> + */ >> +static bool amd_kfd_fence_enable_signaling(struct dma_fence *f) >> +{ >> + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >> + >> + if (!fence) >> + return false; >> + >> + if (dma_fence_is_signaled(f)) >> + return true; >> + >> + if (!kgd2kfd->schedule_evict_and_restore_process( >> + (struct mm_struct *)fence->mm, f)) >> + return true; >> + >> + return false; >> +} >> + >> +static int amd_kfd_fence_signal(struct dma_fence *f) >> +{ >> + unsigned long flags; >> + int ret; >> + >> + spin_lock_irqsave(f->lock, flags); >> + /* Set enabled bit so cb will called */ >> + set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags); > > Mhm, why is that necessary? This only gets called from fence_release below. I think this is to avoid needlessly scheduling an eviction/restore cycle when an eviction fence gets destroyed that hasn't been triggered before, probably during process termination. Harish, do you remember any other reason for this? > >> + ret = dma_fence_signal_locked(f); >> + spin_unlock_irqrestore(f->lock, flags); >> + >> + return ret; >> +} >> + >> +/** >> + * amd_kfd_fence_release - callback that fence can be freed >> + * >> + * @fence: fence >> + * >> + * This function is called when the reference count becomes zero. >> + * It just RCU schedules freeing up the fence. >> + */ >> +static void amd_kfd_fence_release(struct dma_fence *f) >> +{ >> + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >> + /* Unconditionally signal the fence. The process is getting >> + * terminated. >> + */ >> + if (WARN_ON(!fence)) >> + return; /* Not an amdgpu_amdkfd_fence */ >> + >> + amd_kfd_fence_signal(f); >> + kfree_rcu(f, rcu); >> +} >> + >> +/** >> + * amd_kfd_fence_check_mm - Check if @mm is same as that of the >> fence @f >> + * if same return TRUE else return FALSE. >> + * >> + * @f: [IN] fence >> + * @mm: [IN] mm that needs to be verified >> + */ >> +bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm) >> +{ >> + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >> + >> + if (!fence) >> + return false; >> + else if (fence->mm == mm) >> + return true; >> + >> + return false; >> +} >> + >> +const struct dma_fence_ops amd_kfd_fence_ops = { >> + .get_driver_name = amd_kfd_fence_get_driver_name, >> + .get_timeline_name = amd_kfd_fence_get_timeline_name, >> + .enable_signaling = amd_kfd_fence_enable_signaling, >> + .signaled = NULL, >> + .wait = dma_fence_default_wait, >> + .release = amd_kfd_fence_release, >> +}; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> index 65d5a4e..ca00dd2 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >> @@ -36,8 +36,9 @@ >> #define AMDGPU_MAX_UVD_ENC_RINGS 2 >> /* some special values for the owner field */ >> -#define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul) >> -#define AMDGPU_FENCE_OWNER_VM ((void*)1ul) >> +#define AMDGPU_FENCE_OWNER_UNDEFINED ((void *)0ul) >> +#define AMDGPU_FENCE_OWNER_VM ((void *)1ul) >> +#define AMDGPU_FENCE_OWNER_KFD ((void *)2ul) >> #define AMDGPU_FENCE_FLAG_64BIT (1 << 0) >> #define AMDGPU_FENCE_FLAG_INT (1 << 1) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >> index df65c66..0cb31d9 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >> @@ -31,6 +31,7 @@ >> #include <drm/drmP.h> >> #include "amdgpu.h" >> #include "amdgpu_trace.h" >> +#include "amdgpu_amdkfd.h" >> struct amdgpu_sync_entry { >> struct hlist_node node; >> @@ -86,10 +87,18 @@ static bool amdgpu_sync_same_dev(struct >> amdgpu_device *adev, >> static void *amdgpu_sync_get_owner(struct dma_fence *f) >> { >> struct drm_sched_fence *s_fence = to_drm_sched_fence(f); >> + struct amdgpu_amdkfd_fence *kfd_fence; >> + >> + if (!f) >> + return AMDGPU_FENCE_OWNER_UNDEFINED; > > When you add the extra NULL check here then please move the > to_drm_sched_fence() after it as well. Yeah, makes sense. Regards, Felix > > Christian. > >> if (s_fence) >> return s_fence->owner; >> + kfd_fence = to_amdgpu_amdkfd_fence(f); >> + if (kfd_fence) >> + return AMDGPU_FENCE_OWNER_KFD; >> + >> return AMDGPU_FENCE_OWNER_UNDEFINED; >> } >> @@ -204,11 +213,18 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, >> for (i = 0; i < flist->shared_count; ++i) { >> f = rcu_dereference_protected(flist->shared[i], >> reservation_object_held(resv)); >> + /* We only want to trigger KFD eviction fences on >> + * evict or move jobs. Skip KFD fences otherwise. >> + */ >> + fence_owner = amdgpu_sync_get_owner(f); >> + if (fence_owner == AMDGPU_FENCE_OWNER_KFD && >> + owner != AMDGPU_FENCE_OWNER_UNDEFINED) >> + continue; >> + >> if (amdgpu_sync_same_dev(adev, f)) { >> /* VM updates are only interesting >> * for other VM updates and moves. >> */ >> - fence_owner = amdgpu_sync_get_owner(f); >> if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && >> (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && >> ((owner == AMDGPU_FENCE_OWNER_VM) != >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >> index e4bb435..c3f33d3 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >> @@ -46,6 +46,7 @@ >> #include "amdgpu.h" >> #include "amdgpu_object.h" >> #include "amdgpu_trace.h" >> +#include "amdgpu_amdkfd.h" >> #include "bif/bif_4_1_d.h" >> #define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT) >> @@ -1170,6 +1171,23 @@ static bool >> amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, >> { >> unsigned long num_pages = bo->mem.num_pages; >> struct drm_mm_node *node = bo->mem.mm_node; >> + struct reservation_object_list *flist; >> + struct dma_fence *f; >> + int i; >> + >> + /* If bo is a KFD BO, check if the bo belongs to the current >> process. >> + * If true, then return false as any KFD process needs all its >> BOs to >> + * be resident to run successfully >> + */ >> + flist = reservation_object_get_list(bo->resv); >> + if (flist) { >> + for (i = 0; i < flist->shared_count; ++i) { >> + f = rcu_dereference_protected(flist->shared[i], >> + reservation_object_held(bo->resv)); >> + if (amd_kfd_fence_check_mm(f, current->mm)) >> + return false; >> + } >> + } >> switch (bo->mem.mem_type) { >> case TTM_PL_TT: >> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >> index 94eab54..9e35249 100644 >> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >> @@ -30,6 +30,7 @@ >> #include <linux/types.h> >> #include <linux/bitmap.h> >> +#include <linux/dma-fence.h> >> struct pci_dev; >> @@ -286,6 +287,9 @@ struct kfd2kgd_calls { >> * >> * @resume: Notifies amdkfd about a resume action done to a kgd device >> * >> + * @schedule_evict_and_restore_process: Schedules work queue that >> will prepare >> + * for safe eviction of KFD BOs that belong to the specified process. >> + * >> * This structure contains function callback pointers so the kgd >> driver >> * will notify to the amdkfd about certain status changes. >> * >> @@ -300,6 +304,8 @@ struct kgd2kfd_calls { >> void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); >> void (*suspend)(struct kfd_dev *kfd); >> int (*resume)(struct kfd_dev *kfd); >> + int (*schedule_evict_and_restore_process)(struct mm_struct *mm, >> + struct dma_fence *fence); >> }; >> int kgd2kfd_init(unsigned interface_version, > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx