Hi Felix & Harish,

maybe explain why I found that odd: dma_fence_add_callback() sets the DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT flag before adding the callback.

So the flag should always be set when there are callbacks.

Did I miss anything?

Regards,
Christian.

Am 29.01.2018 um 00:55 schrieb Felix Kuehling:
[+Harish, forgot to acknowledge him in the commit description, will fix
that in v2]

Harish, please see Christian's question below in amd_kfd_fence_signal.
Did I understand this correctly?

Regards,
   Felix

On 2018-01-28 06:42 PM, Felix Kuehling wrote:
On 2018-01-27 04:16 AM, Christian König wrote:
Am 27.01.2018 um 02:09 schrieb Felix Kuehling:
[snip]
+struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
+                               void *mm)
+{
+    struct amdgpu_amdkfd_fence *fence = NULL;
+
+    fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+    if (fence == NULL)
+        return NULL;
+
+    /* mm_struct mm is used as void pointer to identify the parent
+     * KFD process. Don't dereference it. Fence and any threads using
+     * mm is guaranteed to be released before process termination.
+     */
+    fence->mm = mm;
That won't work. Fences can live much longer than any process who
created them.

I've already found a fence in a BO still living hours after the
process was killed and the pid long recycled.

I suggest to make fence->mm a real mm_struct pointer with reference
counting and then set it to NULL and drop the reference in
enable_signaling.
I agree. But enable_signaling may be too early to drop the reference.
amd_kfd_fence_check_mm could still be called later from
amdgpu_ttm_bo_eviction_valuable, as long as the fence hasn't signaled yet.

The safe place is probably in amd_kfd_fence_release.

+    get_task_comm(fence->timeline_name, current);
+    spin_lock_init(&fence->lock);
+
+    dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
+           context, atomic_inc_return(&fence_seq));
+
+    return fence;
+}
+
+struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
+{
+    struct amdgpu_amdkfd_fence *fence;
+
+    if (!f)
+        return NULL;
+
+    fence = container_of(f, struct amdgpu_amdkfd_fence, base);
+    if (fence && f->ops == &amd_kfd_fence_ops)
+        return fence;
+
+    return NULL;
+}
+
+static const char *amd_kfd_fence_get_driver_name(struct dma_fence *f)
+{
+    return "amdgpu_amdkfd_fence";
+}
+
+static const char *amd_kfd_fence_get_timeline_name(struct dma_fence *f)
+{
+    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+    return fence->timeline_name;
+}
+
+/**
+ * amd_kfd_fence_enable_signaling - This gets called when TTM wants
to evict
+ *  a KFD BO and schedules a job to move the BO.
+ *  If fence is already signaled return true.
+ *  If fence is not signaled schedule an evict KFD process work item.
+ */
+static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
+{
+    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+    if (!fence)
+        return false;
+
+    if (dma_fence_is_signaled(f))
+        return true;
+
+    if (!kgd2kfd->schedule_evict_and_restore_process(
+                (struct mm_struct *)fence->mm, f))
+        return true;
+
+    return false;
+}
+
+static int amd_kfd_fence_signal(struct dma_fence *f)
+{
+    unsigned long flags;
+    int ret;
+
+    spin_lock_irqsave(f->lock, flags);
+    /* Set enabled bit so the cb will be called */
+    set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
Mhm, why is that necessary?
This only gets called from fence_release below. I think this is to avoid
needlessly scheduling an eviction/restore cycle when an eviction fence
gets destroyed that hasn't been triggered before, probably during
process termination.

Harish, do you remember any other reason for this?

+    ret = dma_fence_signal_locked(f);
+    spin_unlock_irqrestore(f->lock, flags);
+
+    return ret;
+}
+
+/**
+ * amd_kfd_fence_release - callback that fence can be freed
+ *
+ * @fence: fence
+ *
+ * This function is called when the reference count becomes zero.
+ * It just RCU schedules freeing up the fence.
+ */
+static void amd_kfd_fence_release(struct dma_fence *f)
+{
+    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+    /* Unconditionally signal the fence. The process is getting
+     * terminated.
+     */
+    if (WARN_ON(!fence))
+        return; /* Not an amdgpu_amdkfd_fence */
+
+    amd_kfd_fence_signal(f);
+    kfree_rcu(f, rcu);
+}
+
+/**
+ * amd_kfd_fence_check_mm - Check if @mm is same as that of the
fence @f
+ *  if same return TRUE else return FALSE.
+ *
+ * @f: [IN] fence
+ * @mm: [IN] mm that needs to be verified
+ */
+bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
+{
+    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
+
+    if (!fence)
+        return false;
+    else if (fence->mm == mm)
+        return true;
+
+    return false;
+}
+
+const struct dma_fence_ops amd_kfd_fence_ops = {
+    .get_driver_name = amd_kfd_fence_get_driver_name,
+    .get_timeline_name = amd_kfd_fence_get_timeline_name,
+    .enable_signaling = amd_kfd_fence_enable_signaling,
+    .signaled = NULL,
+    .wait = dma_fence_default_wait,
+    .release = amd_kfd_fence_release,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 65d5a4e..ca00dd2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,9 @@
   #define AMDGPU_MAX_UVD_ENC_RINGS    2
     /* some special values for the owner field */
-#define AMDGPU_FENCE_OWNER_UNDEFINED    ((void*)0ul)
-#define AMDGPU_FENCE_OWNER_VM        ((void*)1ul)
+#define AMDGPU_FENCE_OWNER_UNDEFINED    ((void *)0ul)
+#define AMDGPU_FENCE_OWNER_VM        ((void *)1ul)
+#define AMDGPU_FENCE_OWNER_KFD        ((void *)2ul)
     #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index df65c66..0cb31d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -31,6 +31,7 @@
   #include <drm/drmP.h>
   #include "amdgpu.h"
   #include "amdgpu_trace.h"
+#include "amdgpu_amdkfd.h"
     struct amdgpu_sync_entry {
       struct hlist_node    node;
@@ -86,10 +87,18 @@ static bool amdgpu_sync_same_dev(struct
amdgpu_device *adev,
   static void *amdgpu_sync_get_owner(struct dma_fence *f)
   {
       struct drm_sched_fence *s_fence = to_drm_sched_fence(f);
+    struct amdgpu_amdkfd_fence *kfd_fence;
+
+    if (!f)
+        return AMDGPU_FENCE_OWNER_UNDEFINED;
When you add the extra NULL check here then please move the
to_drm_sched_fence() after it as well.
Yeah, makes sense.

Regards,
   Felix

Christian.

         if (s_fence)
           return s_fence->owner;
   +    kfd_fence = to_amdgpu_amdkfd_fence(f);
+    if (kfd_fence)
+        return AMDGPU_FENCE_OWNER_KFD;
+
       return AMDGPU_FENCE_OWNER_UNDEFINED;
   }
   @@ -204,11 +213,18 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
       for (i = 0; i < flist->shared_count; ++i) {
           f = rcu_dereference_protected(flist->shared[i],
                             reservation_object_held(resv));
+        /* We only want to trigger KFD eviction fences on
+         * evict or move jobs. Skip KFD fences otherwise.
+         */
+        fence_owner = amdgpu_sync_get_owner(f);
+        if (fence_owner == AMDGPU_FENCE_OWNER_KFD &&
+            owner != AMDGPU_FENCE_OWNER_UNDEFINED)
+            continue;
+
           if (amdgpu_sync_same_dev(adev, f)) {
               /* VM updates are only interesting
                * for other VM updates and moves.
                */
-            fence_owner = amdgpu_sync_get_owner(f);
               if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
                   (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
                   ((owner == AMDGPU_FENCE_OWNER_VM) !=
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index e4bb435..c3f33d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -46,6 +46,7 @@
   #include "amdgpu.h"
   #include "amdgpu_object.h"
   #include "amdgpu_trace.h"
+#include "amdgpu_amdkfd.h"
   #include "bif/bif_4_1_d.h"
     #define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT)
@@ -1170,6 +1171,23 @@ static bool
amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
   {
       unsigned long num_pages = bo->mem.num_pages;
       struct drm_mm_node *node = bo->mem.mm_node;
+    struct reservation_object_list *flist;
+    struct dma_fence *f;
+    int i;
+
+    /* If bo is a KFD BO, check if the bo belongs to the current
process.
+     * If true, then return false as any KFD process needs all its
BOs to
+     * be resident to run successfully
+     */
+    flist = reservation_object_get_list(bo->resv);
+    if (flist) {
+        for (i = 0; i < flist->shared_count; ++i) {
+            f = rcu_dereference_protected(flist->shared[i],
+                reservation_object_held(bo->resv));
+            if (amd_kfd_fence_check_mm(f, current->mm))
+                return false;
+        }
+    }
         switch (bo->mem.mem_type) {
       case TTM_PL_TT:
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 94eab54..9e35249 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -30,6 +30,7 @@
     #include <linux/types.h>
   #include <linux/bitmap.h>
+#include <linux/dma-fence.h>
     struct pci_dev;
   @@ -286,6 +287,9 @@ struct kfd2kgd_calls {
    *
    * @resume: Notifies amdkfd about a resume action done to a kgd device
    *
+ * @schedule_evict_and_restore_process: Schedules work queue that
will prepare
+ * for safe eviction of KFD BOs that belong to the specified process.
+ *
    * This structure contains function callback pointers so the kgd
driver
    * will notify to the amdkfd about certain status changes.
    *
@@ -300,6 +304,8 @@ struct kgd2kfd_calls {
       void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry);
       void (*suspend)(struct kfd_dev *kfd);
       int (*resume)(struct kfd_dev *kfd);
+    int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
+            struct dma_fence *fence);
   };
     int kgd2kfd_init(unsigned interface_version,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to