> This patch replaces use of the deprecated rte_atomic32 code with > GCC builtin atomic operations. > > Although it would be preferable to use C11 version on all architectures, > there is a performance loss if we do it that way: > > Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs: > with C11 builtin: 5.86 cycles/elem > with __sync builtin: 5.36 cycles/elem (-9.4%) > > The C11 __atomic_compare_exchange_n builtin writes the actual value back > to its expected pointer on failure. On x86 this forces GCC > to emit extra instructions on the critical path between the CAS > and the success-test. > > __sync_bool_compare_and_swap returns a plain bool with no pointer > writeback, allowing GCC to emit tighter code. > > Signed-off-by: Stephen Hemminger <[email protected]> > --- > lib/ring/meson.build | 2 +- > lib/ring/rte_ring_elem_pvt.h | 2 +- > ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 33 +++++++++++-------- > 3 files changed, 21 insertions(+), 16 deletions(-) > rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (88%) > > diff --git a/lib/ring/meson.build b/lib/ring/meson.build > index 21f2c12989..2ba160b178 100644 > --- a/lib/ring/meson.build > +++ b/lib/ring/meson.build > @@ -9,7 +9,7 @@ indirect_headers += files ( > 'rte_ring_elem.h', > 'rte_ring_elem_pvt.h', > 'rte_ring_c11_pvt.h', > - 'rte_ring_generic_pvt.h', > + 'rte_ring_gcc_pvt.h', > 'rte_ring_hts.h', > 'rte_ring_hts_elem_pvt.h', > 'rte_ring_peek.h', > diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h > index a0fdec9812..9a0170c4f0 100644 > --- a/lib/ring/rte_ring_elem_pvt.h > +++ b/lib/ring/rte_ring_elem_pvt.h > @@ -309,7 +309,7 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t > cons_head, > #ifdef RTE_USE_C11_MEM_MODEL > #include "rte_ring_c11_pvt.h" > #else > -#include "rte_ring_generic_pvt.h" > +#include "rte_ring_gcc_pvt.h" > #endif > > /** > diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h > similarity index 88% > rename from lib/ring/rte_ring_generic_pvt.h > rename to lib/ring/rte_ring_gcc_pvt.h > index c044b0824f..68ab1355e8 100644 > --- a/lib/ring/rte_ring_generic_pvt.h > +++ b/lib/ring/rte_ring_gcc_pvt.h > @@ -7,11 +7,11 @@ > * Used as BSD-3 Licensed with permission from Kip Macy. > */ > > -#ifndef _RTE_RING_GENERIC_PVT_H_ > -#define _RTE_RING_GENERIC_PVT_H_ > +#ifndef _RTE_RING_GCC_PVT_H_ > +#define _RTE_RING_GCC_PVT_H_ > > /** > - * @file rte_ring_generic_pvt.h > + * @file rte_ring_gcc_pvt.h > * It is not recommended to include this file directly, > * include <rte_ring.h> instead. > * Contains internal helper functions for MP/SP and MC/SC ring modes. > @@ -25,10 +25,8 @@ static __rte_always_inline void > __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val, > uint32_t new_val, uint32_t single, uint32_t enqueue) > { > - if (enqueue) > - rte_smp_wmb(); > - else > - rte_smp_rmb(); > + RTE_SET_USED(enqueue); > + > /* > * If there are other enqueues/dequeues in progress that preceded us, > * we need to wait for them to complete > @@ -37,7 +35,12 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, > uint32_t old_val, > rte_wait_until_equal_32((volatile uint32_t > *)(uintptr_t)&ht->tail, > old_val, > rte_memory_order_relaxed); > > - ht->tail = new_val; > + /* > + * R0: Establishes a synchronizing edge with load-acquire of tail at A1. > + * Ensures that memory effects by this thread on ring elements array > + * is observed by a different thread of the other type. > + */ > + __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE); > } > > /** > @@ -73,7 +76,7 @@ __rte_ring_headtail_move_head_mt(struct > rte_ring_headtail *d, > uint32_t *old_head, uint32_t *new_head, uint32_t *entries) > { > unsigned int max = n; > - int success; > + bool success; > > do { > /* Reset n to the initial burst count */ > @@ -81,10 +84,10 @@ __rte_ring_headtail_move_head_mt(struct > rte_ring_headtail *d, > > *old_head = d->head; > > - /* add rmb barrier to avoid load/load reorder in weak > + /* add fence to avoid load/load reorder in weak > * memory model. It is noop on x86 > */ > - rte_smp_rmb(); > + __atomic_thread_fence(__ATOMIC_ACQUIRE); > > /* > * The subtraction is done between two unsigned 32bits value > @@ -103,10 +106,12 @@ __rte_ring_headtail_move_head_mt(struct > rte_ring_headtail *d, > return 0; > > *new_head = *old_head + n; > - success = rte_atomic32_cmpset( > + > + success = __sync_bool_compare_and_swap( > (uint32_t *)(uintptr_t)&d->head, > *old_head, *new_head); > - } while (unlikely(success == 0)); > + } while (unlikely(!success)); > + > return n; > } > > @@ -169,4 +174,4 @@ __rte_ring_headtail_move_head_st(struct > rte_ring_headtail *d, > return n; > } > > -#endif /* _RTE_RING_GENERIC_PVT_H_ */ > +#endif /* _RTE_RING_GCC_PVT_H_ */ > --
Acked-by: Konstantin Ananyev <[email protected]> Tested-by: Konstantin Ananyev <[email protected]> > 2.53.0

