On Tue, 2026-02-10 at 11:01 +0100, Christian König wrote: > Implement per-fence spinlocks, allowing implementations to not give an > external spinlock to protect the fence internal statei. Instead a spinlock
s/statei/state > embedded into the fence structure itself is used in this case. > > Shared spinlocks have the problem that implementations need to guarantee > that the lock live at least as long all fences referencing them. s/live/lives > > Using a per-fence spinlock allows completely decoupling spinlock producer > and consumer life times, simplifying the handling in most use cases. That's a good commit message btw, detailing what the motivation is. Would be great to see messages like that more frequently :] > > v2: improve naming, coverage and function documentation > v3: fix one additional locking in the selftests > v4: separate out some changes to make the patch smaller, > fix one amdgpu crash found by CI systems > > Signed-off-by: Christian König <[email protected]> > --- > drivers/dma-buf/dma-fence.c | 21 ++++++++++++++++----- > drivers/dma-buf/sync_debug.h | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- > drivers/gpu/drm/drm_crtc.c | 2 +- > drivers/gpu/drm/drm_writeback.c | 2 +- > drivers/gpu/drm/nouveau/nouveau_fence.c | 3 ++- > drivers/gpu/drm/qxl/qxl_release.c | 3 ++- > drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 3 ++- > drivers/gpu/drm/xe/xe_hw_fence.c | 3 ++- > include/linux/dma-fence.h | 19 +++++++++++++------ > 10 files changed, 41 insertions(+), 19 deletions(-) > > diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c > index 56aa59867eaa..1833889e7466 100644 > --- a/drivers/dma-buf/dma-fence.c > +++ b/drivers/dma-buf/dma-fence.c > @@ -343,7 +343,6 @@ void __dma_fence_might_wait(void) > } > #endif > > - > /** > * dma_fence_signal_timestamp_locked - signal completion of a fence > * @fence: the fence to signal > @@ -1067,7 +1066,6 @@ static void > __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, > spinlock_t *lock, u64 context, u64 seqno, unsigned long flags) > { > - BUG_ON(!lock); > BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name); > > kref_init(&fence->refcount); > @@ -1078,10 +1076,15 @@ __dma_fence_init(struct dma_fence *fence, const > struct dma_fence_ops *ops, > */ > RCU_INIT_POINTER(fence->ops, ops); > INIT_LIST_HEAD(&fence->cb_list); > - fence->lock = lock; > fence->context = context; > fence->seqno = seqno; > fence->flags = flags | BIT(DMA_FENCE_FLAG_INITIALIZED_BIT); > + if (lock) { > + fence->extern_lock = lock; > + } else { > + spin_lock_init(&fence->inline_lock); > + fence->flags |= BIT(DMA_FENCE_FLAG_INLINE_LOCK_BIT); > + } > fence->error = 0; > > trace_dma_fence_init(fence); > @@ -1091,7 +1094,7 @@ __dma_fence_init(struct dma_fence *fence, const struct > dma_fence_ops *ops, > * dma_fence_init - Initialize a custom fence. > * @fence: the fence to initialize > * @ops: the dma_fence_ops for operations on this fence > - * @lock: the irqsafe spinlock to use for locking this fence > + * @lock: optional irqsafe spinlock to use for locking this fence > * @context: the execution context this fence is run on > * @seqno: a linear increasing sequence number for this context > * > @@ -1101,6 +1104,10 @@ __dma_fence_init(struct dma_fence *fence, const struct > dma_fence_ops *ops, > * > * context and seqno are used for easy comparison between fences, allowing > * to check which fence is later by simply using dma_fence_later(). > + * > + * It is strongly discouraged to provide an external lock. This is only > allowed "strongly discouraged […] because this does not decouple lock and fence life times." ? > + * for legacy use cases when multiple fences need to be prevented from > + * signaling out of order. I think our previous discussions revealed that the external lock does not even help with that, does it? > */ > void > dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, > @@ -1114,7 +1121,7 @@ EXPORT_SYMBOL(dma_fence_init); > * dma_fence_init64 - Initialize a custom fence with 64-bit seqno support. > * @fence: the fence to initialize > * @ops: the dma_fence_ops for operations on this fence > - * @lock: the irqsafe spinlock to use for locking this fence > + * @lock: optional irqsafe spinlock to use for locking this fence > * @context: the execution context this fence is run on > * @seqno: a linear increasing sequence number for this context > * > @@ -1124,6 +1131,10 @@ EXPORT_SYMBOL(dma_fence_init); > * > * Context and seqno are used for easy comparison between fences, allowing > * to check which fence is later by simply using dma_fence_later(). > + * > + * It is strongly discouraged to provide an external lock. This is only > allowed same > + * for legacy use cases when multiple fences need to be prevented from > + * signaling out of order. > */ > void > dma_fence_init64(struct dma_fence *fence, const struct dma_fence_ops *ops, > diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h > index 02af347293d0..c49324505b20 100644 > --- a/drivers/dma-buf/sync_debug.h > +++ b/drivers/dma-buf/sync_debug.h > @@ -47,7 +47,7 @@ struct sync_timeline { > > static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence) > { > - return container_of(fence->lock, struct sync_timeline, lock); > + return container_of(fence->extern_lock, struct sync_timeline, lock); You're sure that this will never have to check for the flag? > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 139642eacdd0..d5c41e24fb51 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -638,7 +638,7 @@ static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm > *vm) > * sure that the dma_fence structure isn't freed up. > */ > rcu_read_lock(); > - lock = vm->last_tlb_flush->lock; > + lock = dma_fence_spinlock(vm->last_tlb_flush); > rcu_read_unlock(); > > spin_lock_irqsave(lock, flags); > diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c > index a7797d260f1e..17472915842f 100644 > --- a/drivers/gpu/drm/drm_crtc.c > +++ b/drivers/gpu/drm/drm_crtc.c > @@ -159,7 +159,7 @@ static const struct dma_fence_ops drm_crtc_fence_ops; > static struct drm_crtc *fence_to_crtc(struct dma_fence *fence) > { > BUG_ON(fence->ops != &drm_crtc_fence_ops); > - return container_of(fence->lock, struct drm_crtc, fence_lock); > + return container_of(fence->extern_lock, struct drm_crtc, fence_lock); > } > > static const char *drm_crtc_fence_get_driver_name(struct dma_fence *fence) > diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c > index 95b8a2e4bda6..624a4e8b6c99 100644 > --- a/drivers/gpu/drm/drm_writeback.c > +++ b/drivers/gpu/drm/drm_writeback.c > @@ -81,7 +81,7 @@ > * From userspace, this property will always read as zero. > */ > > -#define fence_to_wb_connector(x) container_of(x->lock, \ > +#define fence_to_wb_connector(x) container_of(x->extern_lock, \ > struct drm_writeback_connector, \ > fence_lock) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c > b/drivers/gpu/drm/nouveau/nouveau_fence.c > index 4a193b7d6d9e..c282c94138b2 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_fence.c > +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c > @@ -41,7 +41,8 @@ static const struct dma_fence_ops nouveau_fence_ops_legacy; > static inline struct nouveau_fence_chan * > nouveau_fctx(struct nouveau_fence *fence) > { > - return container_of(fence->base.lock, struct nouveau_fence_chan, lock); > + return container_of(fence->base.extern_lock, struct nouveau_fence_chan, > + lock); > } > > static bool > diff --git a/drivers/gpu/drm/qxl/qxl_release.c > b/drivers/gpu/drm/qxl/qxl_release.c > index 06b0b2aa7953..37d4ae0faf0d 100644 > --- a/drivers/gpu/drm/qxl/qxl_release.c > +++ b/drivers/gpu/drm/qxl/qxl_release.c > @@ -62,7 +62,8 @@ static long qxl_fence_wait(struct dma_fence *fence, bool > intr, > struct qxl_device *qdev; > unsigned long cur, end = jiffies + timeout; > > - qdev = container_of(fence->lock, struct qxl_device, release_lock); > + qdev = container_of(fence->extern_lock, struct qxl_device, > + release_lock); > > if (!wait_event_timeout(qdev->release_event, > (dma_fence_is_signaled(fence) || > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c > b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c > index 85795082fef9..d251eec57df9 100644 > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c > @@ -47,7 +47,8 @@ struct vmw_event_fence_action { > static struct vmw_fence_manager * > fman_from_fence(struct vmw_fence_obj *fence) > { > - return container_of(fence->base.lock, struct vmw_fence_manager, lock); > + return container_of(fence->base.extern_lock, struct vmw_fence_manager, > + lock); > } > > static void vmw_fence_obj_destroy(struct dma_fence *f) > diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c > b/drivers/gpu/drm/xe/xe_hw_fence.c > index ae8ed15b64c5..14720623ad00 100644 > --- a/drivers/gpu/drm/xe/xe_hw_fence.c > +++ b/drivers/gpu/drm/xe/xe_hw_fence.c > @@ -124,7 +124,8 @@ static struct xe_hw_fence *to_xe_hw_fence(struct > dma_fence *fence); > > static struct xe_hw_fence_irq *xe_hw_fence_irq(struct xe_hw_fence *fence) > { > - return container_of(fence->dma.lock, struct xe_hw_fence_irq, lock); > + return container_of(fence->dma.extern_lock, struct xe_hw_fence_irq, > + lock); > } > > static const char *xe_hw_fence_get_driver_name(struct dma_fence *dma_fence) > diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h > index 88c842fc35d5..6eabbb1c471c 100644 > --- a/include/linux/dma-fence.h > +++ b/include/linux/dma-fence.h > @@ -34,7 +34,8 @@ struct seq_file; > * @ops: dma_fence_ops associated with this fence > * @rcu: used for releasing fence with kfree_rcu > * @cb_list: list of all callbacks to call > - * @lock: spin_lock_irqsave used for locking > + * @extern_lock: external spin_lock_irqsave used for locking Add a "(deprecated)" ? > + * @inline_lock: alternative internal spin_lock_irqsave used for locking > * @context: execution context this fence belongs to, returned by > * dma_fence_context_alloc() > * @seqno: the sequence number of this fence inside the execution context, > @@ -49,6 +50,7 @@ struct seq_file; > * of the time. > * > * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized > + * DMA_FENCE_FLAG_INLINE_LOCK_BIT - use inline spinlock instead of external > one > * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled > * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling > * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called > @@ -66,7 +68,10 @@ struct seq_file; > * been completed, or never called at all. > */ > struct dma_fence { > - spinlock_t *lock; > + union { > + spinlock_t *extern_lock; > + spinlock_t inline_lock; > + }; > const struct dma_fence_ops __rcu *ops; > /* > * We clear the callback list on kref_put so that by the time we > @@ -100,6 +105,7 @@ struct dma_fence { > > enum dma_fence_flag_bits { > DMA_FENCE_FLAG_INITIALIZED_BIT, > + DMA_FENCE_FLAG_INLINE_LOCK_BIT, Just asking about a nit: what's the order here, always alphabetically? > DMA_FENCE_FLAG_SEQNO64_BIT, > DMA_FENCE_FLAG_SIGNALED_BIT, > DMA_FENCE_FLAG_TIMESTAMP_BIT, > @@ -381,11 +387,12 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) > * dma_fence_spinlock - return pointer to the spinlock protecting the fence > * @fence: the fence to get the lock from > * > - * Return the pointer to the extern lock. > + * Return either the pointer to the embedded or the external spin lock. > */ > static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) > { > - return fence->lock; > + return test_bit(DMA_FENCE_FLAG_INLINE_LOCK_BIT, &fence->flags) ? > + &fence->inline_lock : fence->extern_lock; I personally am not a fan of using '?' for anything longer than 1 line and think that if (condition) return a; return b; is much better readable. P. > } > > /** > @@ -396,7 +403,7 @@ static inline spinlock_t *dma_fence_spinlock(struct > dma_fence *fence) > * Lock the fence, preventing it from changing to the signaled state. > */ > #define dma_fence_lock_irqsave(fence, flags) \ > - spin_lock_irqsave(fence->lock, flags) > + spin_lock_irqsave(dma_fence_spinlock(fence), flags) > > /** > * dma_fence_unlock_irqrestore - unlock the fence and irqrestore > @@ -406,7 +413,7 @@ static inline spinlock_t *dma_fence_spinlock(struct > dma_fence *fence) > * Unlock the fence, allowing it to change it's state to signaled again. > */ > #define dma_fence_unlock_irqrestore(fence, flags) \ > - spin_unlock_irqrestore(fence->lock, flags) > + spin_unlock_irqrestore(dma_fence_spinlock(fence), flags) > > /** > * dma_fence_assert_held - lockdep assertion that fence is locked
