On Wed, Dec 17, 2025 at 08:45:30PM -0500, Mathieu Desnoyers wrote:
[...]
> +static inline
> +struct hazptr_slot *hazptr_get_free_percpu_slot(void)
> +{
> +     struct hazptr_percpu_slots *percpu_slots = this_cpu_ptr(&hazptr_percpu_slots);
> +     unsigned int idx;
> +
> +     for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +             struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +             if (!READ_ONCE(slot->addr))
> +                     return slot;
> +     }
> +     /* All slots are in use. */
> +     return NULL;
> +}
> +
> +static inline
> +bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
> +{
> +     return slot == &ctx->backup_slot.slot;
> +}
> +
> +/*
> + * hazptr_acquire: Load pointer at address and protect with hazard pointer.
> + *
> + * Load @addr_p, and protect the loaded pointer with hazard pointer.
> + *
> + * Returns a non-NULL protected address if the loaded pointer is non-NULL.
> + * Returns NULL if the loaded pointer is NULL.
> + *
> + * On success the protected hazptr slot is stored in @ctx->slot.
> + */
> +static inline
> +void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
> +{
> +     struct hazptr_slot *slot = NULL;
> +     void *addr, *addr2;
> +
> +     /*
> +      * Load @addr_p to know which address should be protected.
> +      */
> +     addr = READ_ONCE(*addr_p);
> +     for (;;) {
> +             if (!addr)
> +                     return NULL;
> +             guard(preempt)();
> +             if (likely(!hazptr_slot_is_backup(ctx, slot))) {
> +                     slot = hazptr_get_free_percpu_slot();

I need to reiterate my concerns about this "allocating a slot while
protecting" pattern: realistically, we will walk over a few of the
per-CPU hazard pointer slots *every time* instead of directly using a
pre-allocated hazard pointer slot. Could you use this [1] to compare
the reader-side performance against RCU/SRCU?
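
To make the comparison concrete, the fast path I have in mind looks
roughly like the sketch below (purely illustrative: the helper name and
the way the slot gets reserved up front are made up, and it reuses the
ptr_eq()/ordering scheme from your patch). The point is that the reader
only publishes and re-checks, with no per-CPU slot walk on each
acquire:

/*
 * Sketch only: @slot has been reserved for this reader context ahead
 * of time, so the acquire path is publish + fence + re-check.
 */
static inline
void *hazptr_acquire_prereserved(struct hazptr_slot *slot,
                                 void * const *addr_p)
{
        void *addr, *addr2;

        addr = READ_ONCE(*addr_p);
        for (;;) {
                if (!addr)
                        return NULL;
                WRITE_ONCE(slot->addr, addr);   /* Store B */
                /* Order Store B before Load A. */
                smp_mb();
                addr2 = READ_ONCE(*addr_p);     /* Load A */
                if (likely(ptr_eq(addr2, addr)))
                        return addr2;
                /* @addr_p changed: retract the hazard pointer and retry. */
                WRITE_ONCE(slot->addr, NULL);
                if (!addr2)
                        return NULL;
                addr = addr2;
        }
}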

> +                     /*
> +                      * If all the per-CPU slots are already in use, fallback
> +                      * to the backup slot.
> +                      */
> +                     if (unlikely(!slot))
> +                             slot = hazptr_chain_backup_slot(ctx);
> +             }
> +             WRITE_ONCE(slot->addr, addr);   /* Store B */
> +
> +             /* Memory ordering: Store B before Load A. */
> +             smp_mb();
> +
> +             /*
> +              * Re-load @addr_p after storing it to the hazard pointer slot.
> +              */
> +             addr2 = READ_ONCE(*addr_p);     /* Load A */
> +             if (likely(ptr_eq(addr2, addr)))
> +                     break;
> +             /*
> +              * If @addr_p content has changed since the first load,
> +              * release the hazard pointer and try again.
> +              */
> +             WRITE_ONCE(slot->addr, NULL);
> +             if (!addr2) {
> +                     if (hazptr_slot_is_backup(ctx, slot))
> +                             hazptr_unchain_backup_slot(ctx);
> +                     return NULL;
> +             }
> +             addr = addr2;
> +     }
> +     ctx->slot = slot;
> +     /*
> +      * Use addr2 loaded from the second READ_ONCE() to preserve
> +      * address dependency ordering.
> +      */
> +     return addr2;
> +}
> +
> +/* Release the protected hazard pointer from @slot. */
> +static inline
> +void hazptr_release(struct hazptr_ctx *ctx, void *addr)
> +{
> +     struct hazptr_slot *slot;
> +
> +     if (!addr)
> +             return;
> +     slot = ctx->slot;
> +     WARN_ON_ONCE(slot->addr != addr);
> +     smp_store_release(&slot->addr, NULL);
> +     if (unlikely(hazptr_slot_is_backup(ctx, slot)))
> +             hazptr_unchain_backup_slot(ctx);
> +}
> +
> +void hazptr_init(void);
> +
> +#endif /* _LINUX_HAZPTR_H */
> diff --git a/init/main.c b/init/main.c
> index 07a3116811c5..858eaa87bde7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -104,6 +104,7 @@
>  #include <linux/pidfs.h>
>  #include <linux/ptdump.h>
>  #include <linux/time_namespace.h>
> +#include <linux/hazptr.h>
>  #include <net/net_namespace.h>
>  
>  #include <asm/io.h>
> @@ -1002,6 +1003,7 @@ void start_kernel(void)
>       workqueue_init_early();
>  
>       rcu_init();
> +     hazptr_init();
>       kvfree_rcu_init();
>  
>       /* Trace events are available after this */
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 9fe722305c9b..1178907fe0ea 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
>           cpu.o exit.o softirq.o resource.o \
>           sysctl.o capability.o ptrace.o user.o \
>           signal.o sys.o umh.o workqueue.o pid.o task_work.o \
> -         extable.o params.o \
> +         extable.o params.o hazptr.o \
>           kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
>           notifier.o ksysfs.o cred.o reboot.o \
>           async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
> diff --git a/kernel/hazptr.c b/kernel/hazptr.c
> new file mode 100644
> index 000000000000..2ec288bc1132
> --- /dev/null
> +++ b/kernel/hazptr.c
> @@ -0,0 +1,150 @@
> +// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <[email protected]>
> +//
> +// SPDX-License-Identifier: LGPL-2.1-or-later
> +
> +/*
> + * hazptr: Hazard Pointers
> + */
> +
> +#include <linux/hazptr.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/export.h>
> +
> +struct overflow_list {
> +     raw_spinlock_t lock;            /* Lock protecting overflow list and list generation. */
> +     struct list_head head;          /* Overflow list head. */
> +     uint64_t gen;                   /* Overflow list generation. */
> +};
> +
> +static DEFINE_PER_CPU(struct overflow_list, percpu_overflow_list);
> +
> +DEFINE_PER_CPU(struct hazptr_percpu_slots, hazptr_percpu_slots);
> +EXPORT_PER_CPU_SYMBOL_GPL(hazptr_percpu_slots);
> +
> +/*
> + * Perform piecewise iteration on overflow list waiting until "addr" is
> + * not present. Raw spinlock is released and taken between each list
> + * item and busy loop iteration. The overflow list generation is checked
> + * each time the lock is taken to validate that the list has not changed
> + * before resuming iteration or busy wait. If the generation has
> + * changed, retry the entire list traversal.
> + */
> +static
> +void hazptr_synchronize_overflow_list(struct overflow_list *overflow_list, void *addr)
> +{
> +     struct hazptr_backup_slot *backup_slot;
> +     uint64_t snapshot_gen;
> +
> +     raw_spin_lock(&overflow_list->lock);
> +retry:
> +     snapshot_gen = overflow_list->gen;
> +     list_for_each_entry(backup_slot, &overflow_list->head, node) {
> +             /* Busy-wait if node is found. */
> +             while (smp_load_acquire(&backup_slot->slot.addr) == addr) { /* Load B */
> +                     raw_spin_unlock(&overflow_list->lock);
> +                     cpu_relax();

I think we should prioritize the scan thread solution [2] instead of
busy-waiting in hazard pointer updaters, because once we have multiple
hazard pointer users we will want to consolidate the scans on the
updater side. If we do that, the whole ->gen machinery can be avoided.

However, this ->gen idea does seem to resolve another issue for me:
I'm trying to make shazptr critical sections preemptible by using a
per-task backup slot (if you recall, this is your idea from the hallway
discussions we had during LPC 2024), and so far I could not make it
work because of the following sequence:

1. CPU 0 already has one pointer protected.

2. CPU 1 begins the updater scan; it scans the list of preempted
   hazard pointer readers and finds no reader.

3. CPU 0 does a context switch: it stores the current hazard pointer
   value to the current task's ->hazard_slot (let's say the task is
   task A), and adds it to the list of preempted hazard pointer readers.

4. CPU 0 clears its percpu hazptr_slots for the next task (B).

5. CPU 1 continues the updater scan; it scans the per-CPU slot of
   CPU 0 and finds no reader.

In this situation, the updater will miss a reader. But if we add a
generation snapshot at step 2 and a generation increment at step 3, I
think it'll work.
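
Roughly, what I have in mind is the sketch below (the names are
hypothetical and the locking around the preempted-readers list is
omitted). The key point is that the updater re-checks the generation
after scanning the per-CPU slots, so a reader that moved to the
preempted list between the two scans forces a rescan:

        /* Context-switch-out path (step 3), sketch only: */
        current->hazard_slot.addr = percpu_slot->addr;
        list_add(&current->hazard_node, &preempted_readers);
        preempted_readers_gen++;
        /* Publish list + gen before clearing the per-CPU slot (step 4). */
        smp_mb();
        WRITE_ONCE(percpu_slot->addr, NULL);

/* Updater scan, sketch only: */
static void shazptr_wait_readers(void *addr)
{
        u64 gen;

retry:
        gen = READ_ONCE(preempted_readers_gen); /* snapshot (step 2) */
        scan_preempted_readers(addr);           /* step 2 */
        scan_all_percpu_slots(addr);            /* step 5 */
        /* Pair with the barrier in the context-switch path. */
        smp_mb();
        if (READ_ONCE(preempted_readers_gen) != gen)
                goto retry;     /* a step-3 switch raced with the scan */
}

If the updater observed the cleared per-CPU slot from step 4, the
barrier pairing guarantees it also observes the generation increment
from step 3, so it cannot return without rescanning the list.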

IMO, if we can make this work, it's better than the current backup
slot mechanism, because we only need to acquire the lock when a context
switch happens. I will look into the implementation, and if I can get
it done, I will send it in my next version of shazptr. I mention it
here just to add this option to the discussion.

[1]: https://lore.kernel.org/lkml/[email protected]/
[2]: https://lore.kernel.org/lkml/[email protected]/

Regards,
Boqun

> +                     raw_spin_lock(&overflow_list->lock);
> +                     if (overflow_list->gen != snapshot_gen)
> +                             goto retry;
> +             }
> +             raw_spin_unlock(&overflow_list->lock);
> +             /*
> +              * Release raw spinlock, validate generation after
> +              * re-acquiring the lock.
> +              */
> +             raw_spin_lock(&overflow_list->lock);
> +             if (overflow_list->gen != snapshot_gen)
> +                     goto retry;
> +     }
> +     raw_spin_unlock(&overflow_list->lock);
> +}
> +
> +static
> +void hazptr_synchronize_cpu_slots(int cpu, void *addr)
> +{
> +     struct hazptr_percpu_slots *percpu_slots = per_cpu_ptr(&hazptr_percpu_slots, cpu);
> +     unsigned int idx;
> +
> +     for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +             struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +             /* Busy-wait if node is found. */
> +             smp_cond_load_acquire(&slot->addr, VAL != addr); /* Load B */
> +     }
> +}
> +
> +/*
> + * hazptr_synchronize: Wait until @addr is released from all slots.
> + *
> + * Wait to observe that each slot contains a value that differs from
> + * @addr before returning.
> + * Should be called from preemptible context.
> + */
> +void hazptr_synchronize(void *addr)
> +{
> +     int cpu;
> +
> +     /*
> +      * Busy-wait should only be done from preemptible context.
> +      */
> +     lockdep_assert_preemption_enabled();
> +
> +     /*
> +      * Store A precedes hazptr_scan(): it unpublishes addr (sets it to
> +      * NULL or to a different value), and thus hides it from hazard
> +      * pointer readers.
> +      */
> +     if (!addr)
> +             return;
> +     /* Memory ordering: Store A before Load B. */
> +     smp_mb();
> +     /* Scan all CPUs slots. */
> +     for_each_possible_cpu(cpu) {
> +             /* Scan CPU slots. */
> +             hazptr_synchronize_cpu_slots(cpu, addr);
> +             /* Scan backup slots in percpu overflow list. */
> +             hazptr_synchronize_overflow_list(per_cpu_ptr(&percpu_overflow_list, cpu), addr);
> +     }
> +}
> +EXPORT_SYMBOL_GPL(hazptr_synchronize);
> +
> +struct hazptr_slot *hazptr_chain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +     struct overflow_list *overflow_list = this_cpu_ptr(&percpu_overflow_list);
> +     struct hazptr_slot *slot = &ctx->backup_slot.slot;
> +
> +     slot->addr = NULL;
> +
> +     raw_spin_lock(&overflow_list->lock);
> +     overflow_list->gen++;
> +     list_add(&ctx->backup_slot.node, &overflow_list->head);
> +     ctx->backup_slot.cpu = smp_processor_id();
> +     raw_spin_unlock(&overflow_list->lock);
> +     return slot;
> +}
> +EXPORT_SYMBOL_GPL(hazptr_chain_backup_slot);
> +
> +void hazptr_unchain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +     struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, ctx->backup_slot.cpu);
> +
> +     raw_spin_lock(&overflow_list->lock);
> +     overflow_list->gen++;
> +     list_del(&ctx->backup_slot.node);
> +     raw_spin_unlock(&overflow_list->lock);
> +}
> +EXPORT_SYMBOL_GPL(hazptr_unchain_backup_slot);
> +
> +void __init hazptr_init(void)
> +{
> +     int cpu;
> +
> +     for_each_possible_cpu(cpu) {
> +             struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, cpu);
> +
> +             raw_spin_lock_init(&overflow_list->lock);
> +             INIT_LIST_HEAD(&overflow_list->head);
> +     }
> +}
> -- 
> 2.39.5
> 
