On Thu, 2025-12-11 at 13:16 +0100, Christian König wrote:
> This allows amdkfd_fences to outlive the amdgpu module.
>
> v2: implement Felix suggestion to lock the fence while signaling it.
> v3: fix typos
> v4: fix return code in signal_eviction_fence
>
> Signed-off-by: Christian König <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c | 44 +++++++++----------
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
> 4 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 8bdfcde2029b..2f2b277cfaed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -196,6 +196,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void
> *data);
> #endif
> #if IS_ENABLED(CONFIG_HSA_AMD)
> bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
> +bool amdkfd_fence_signal(struct dma_fence *f);
> struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
> void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo);
> int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
> @@ -210,6 +211,12 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct
> mm_struct *mm)
> return false;
> }
>
> +static inline
> +bool amdkfd_fence_signal(struct dma_fence *f)
> +{
> + return false;
> +}
Huh? What's that?
That function seems to be just a NOP. Its return code is used nowhere,
is it?
> +
> static inline
> struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> index 09c919f72b6c..9cd413e325f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> @@ -127,29 +127,9 @@ static bool amdkfd_fence_enable_signaling(struct
> dma_fence *f)
> if (!svm_range_schedule_evict_svm_bo(fence))
> return true;
> }
> - return false;
> -}
> -
> -/**
> - * amdkfd_fence_release - callback that fence can be freed
> - *
> - * @f: dma_fence
> - *
> - * This function is called when the reference count becomes zero.
> - * Drops the mm_struct reference and RCU schedules freeing up the fence.
> - */
> -static void amdkfd_fence_release(struct dma_fence *f)
> -{
> - struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
> -
> - /* Unconditionally signal the fence. The process is getting
> - * terminated.
> - */
> - if (WARN_ON(!fence))
> - return; /* Not an amdgpu_amdkfd_fence */
> -
> mmdrop(fence->mm);
> - kfree_rcu(f, rcu);
> + fence->mm = NULL;
> + return false;
> }
>
> /**
> @@ -174,9 +154,27 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct
> mm_struct *mm)
> return false;
> }
>
> +bool amdkfd_fence_signal(struct dma_fence *f)
> +{
> + struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
> + unsigned long flags;
> + bool was_signaled;
> +
> + dma_fence_lock_irqsave(f, flags);
> + if (fence->mm) {
> + mmdrop(fence->mm);
> + fence->mm = NULL;
> + }
> + was_signaled = dma_fence_is_signaled_locked(f);
> + if (!was_signaled)
> + dma_fence_signal_locked(f);
> + dma_fence_unlock_irqrestore(f, flags);
> +
> + return was_signaled;
> +}
> +
> static const struct dma_fence_ops amdkfd_fence_ops = {
> .get_driver_name = amdkfd_fence_get_driver_name,
> .get_timeline_name = amdkfd_fence_get_timeline_name,
> .enable_signaling = amdkfd_fence_enable_signaling,
> - .release = amdkfd_fence_release,
> };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index bb252ec43733..2cf39e3d3fae 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1173,7 +1173,7 @@ static void kfd_process_wq_release(struct work_struct
> *work)
> synchronize_rcu();
> ef = rcu_access_pointer(p->ef);
> if (ef)
> - dma_fence_signal(ef);
> + amdkfd_fence_signal(ef);
>
> kfd_process_remove_sysfs(p);
> kfd_debugfs_remove_process(p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 97c2270f278f..0e94f3a976b1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -428,7 +428,7 @@ static void svm_range_bo_release(struct kref *kref)
>
> if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
> /* We're not in the eviction worker. Signal the fence. */
> - dma_fence_signal(&svm_bo->eviction_fence->base);
> + amdkfd_fence_signal(&svm_bo->eviction_fence->base);
> dma_fence_put(&svm_bo->eviction_fence->base);
> amdgpu_bo_unref(&svm_bo->bo);
> kfree(svm_bo);
> @@ -3628,7 +3628,7 @@ static void svm_range_evict_svm_bo_worker(struct
> work_struct *work)
> mmap_read_unlock(mm);
> mmput(mm);
>
> - dma_fence_signal(&svm_bo->eviction_fence->base);
> + amdkfd_fence_signal(&svm_bo->eviction_fence->base);
And why do you make those changes, and why doesn't the commit message
explain them?
You stop signalling those fences, after all.
P.
>
> /* This is the last reference to svm_bo, after svm_range_vram_node_free
> * has been called in svm_migrate_vram_to_ram