> From: Bruce Richardson [mailto:[email protected]] > Sent: Wednesday, 22 October 2025 17.03 > > On Wed, Oct 22, 2025 at 02:47:08PM +0000, Morten Brørup wrote: > > Refactored rte_pktmbuf_prefree_seg() for both performance and > readability. > > > > With the optimized RTE_MBUF_DIRECT() macro, the common likely code > path > > now fits within one instruction cache line on x86-64 when built with > GCC. > > > > Signed-off-by: Morten Brørup <[email protected]> > > Acked-by: Konstantin Ananyev <[email protected]> > > Acked-by: Chengwen Feng <[email protected]> > > Reviewed-by: Bruce Richardson <[email protected]> > > --- > > v4: > > * Enabled the optimized RTE_MBUF_DIRECT() macro for GCC on all > > architectures. > > v3: > > * Rewrote the optimized RTE_MBUF_DIRECT() macro for readability; use > > numerical value instead of long names. (Bruce Richardson) > > * Enabled the optimized RTE_MBUF_DIRECT() macro for Loongarch too. > > v2: > > * Fixed typo in commit description. > > * Fixed indentation. > > * Added detailed description to the optimized RTE_MBUF_DIRECT() > macro. > > (Stephen Hemminger) > > * Added static_assert() to verify that the optimized > RTE_MBUF_DIRECT() > > macro is valid, specifically that the tested bits are in the MSB of > the > > 64-bit field. > > --- > > lib/mbuf/rte_mbuf.h | 51 +++++++++++++++----------------------- > -- > > lib/mbuf/rte_mbuf_core.h | 47 ++++++++++++++++++++++++++++++++++++ > > 2 files changed, 66 insertions(+), 32 deletions(-) > > > > diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h > > index 3df22125de..2004391f57 100644 > > --- a/lib/mbuf/rte_mbuf.h > > +++ b/lib/mbuf/rte_mbuf.h > > @@ -31,6 +31,7 @@ > > * http://www.kohala.com/start/tcpipiv2.html > > */ > > > > +#include <stdbool.h> > > #include <stdint.h> > > > > #include <rte_common.h> > > @@ -1458,44 +1459,30 @@ static inline int > __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m) > > static __rte_always_inline struct rte_mbuf * > > rte_pktmbuf_prefree_seg(struct rte_mbuf *m) > > { > > - __rte_mbuf_sanity_check(m, 0); > > - > > - if (likely(rte_mbuf_refcnt_read(m) == 1)) { > > - > > - if (!RTE_MBUF_DIRECT(m)) { > > - rte_pktmbuf_detach(m); > > - if (RTE_MBUF_HAS_EXTBUF(m) && > > - RTE_MBUF_HAS_PINNED_EXTBUF(m) && > > - __rte_pktmbuf_pinned_extbuf_decref(m)) > > - return NULL; > > - } > > - > > - if (m->next != NULL) > > - m->next = NULL; > > - if (m->nb_segs != 1) > > - m->nb_segs = 1; > > + bool refcnt_not_one; > > > > - return m; > > + __rte_mbuf_sanity_check(m, 0); > > > > - } else if (__rte_mbuf_refcnt_update(m, -1) == 0) { > > + refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1); > > + if (refcnt_not_one && __rte_mbuf_refcnt_update(m, -1) != 0) > > + return NULL; > > > > - if (!RTE_MBUF_DIRECT(m)) { > > - rte_pktmbuf_detach(m); > > - if (RTE_MBUF_HAS_EXTBUF(m) && > > - RTE_MBUF_HAS_PINNED_EXTBUF(m) && > > - __rte_pktmbuf_pinned_extbuf_decref(m)) > > - return NULL; > > - } > > + if (unlikely(!RTE_MBUF_DIRECT(m))) { > > + rte_pktmbuf_detach(m); > > + if (RTE_MBUF_HAS_EXTBUF(m) && > > + RTE_MBUF_HAS_PINNED_EXTBUF(m) && > > + __rte_pktmbuf_pinned_extbuf_decref(m)) > > + return NULL; > > + } > > > > - if (m->next != NULL) > > - m->next = NULL; > > - if (m->nb_segs != 1) > > - m->nb_segs = 1; > > + if (refcnt_not_one) > > rte_mbuf_refcnt_set(m, 1); > > + if (m->nb_segs != 1) > > + m->nb_segs = 1; > > + if (m->next != NULL) > > + m->next = NULL; > > > > - return m; > > - } > > - return NULL; > > + return m; > > } > > > > /** > > diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h > > index a0df265b5d..37f2975158 100644 > > --- a/lib/mbuf/rte_mbuf_core.h > > +++ b/lib/mbuf/rte_mbuf_core.h > > @@ -706,6 +706,7 @@ struct rte_mbuf_ext_shared_info { > > */ > > #define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & > RTE_MBUF_F_EXTERNAL) > > > > +#if !defined(RTE_TOOLCHAIN_GCC) || defined __DOXYGEN__ > > /** > > * Returns TRUE if given mbuf is direct, or FALSE otherwise. > > * > > @@ -714,6 +715,52 @@ struct rte_mbuf_ext_shared_info { > > */ > > #define RTE_MBUF_DIRECT(mb) \ > > (!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL))) > > +#else /* RTE_TOOLCHAIN_GCC */ > > Do we need the non-gcc block? > > > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN > > +/* Macro optimized for code size. > > + * GCC only optimizes single-bit MSB tests this way, so we do it by > hand with multi-bit. > > + * > > + * The flags RTE_MBUF_F_INDIRECT and RTE_MBUF_F_EXTERNAL are both in > the MSB of the > > + * 64-bit ol_flags field, so we only compare this one byte instead > of all 64 bits. > > + * On little endian architecture, the MSB of a 64-bit integer is at > byte offset 7. > > + * > > + * Note: Tested on x86-64 using GCC version 16.0.0 20251019 > (experimental). > > + * > > + * Without this optimization, GCC generates 17 bytes of > instructions: > > + * movabs rax,0x6000000000000000 // 10 bytes > > + * and rax,QWORD PTR [rdi+0x18] // 4 bytes > > + * sete al // 3 bytes > > + * With this optimization, GCC generates only 7 bytes of > instructions: > > + * test BYTE PTR [rdi+0x1f],0x60 // 4 bytes > > + * sete al // 3 bytes > > + * > > + * Note: Tested on loongarch using GCC version 15.2.0. > > + * > > + * Without this optimization, GCC generates 5 instructions: > > + * ld.d $a0, $a0, 24 > > + * move $t0, $zero > > + * lu52i.d $t0, $t0, 1536 > > + * and $a0, $a0, $t0 > > + * sltui $a0, $a0, 1 > > + * With this optimization, GCC generates only 3 instructions: > > + * ld.bu $a0, $a0, 31 > > + * andi $a0, $a0, 0x60 > > + * sltui $a0, $a0, 1 > > + * > > + * Note: GCC also generates smaller code size with the optimized > macro on many other architectures. > > + * > > + * Note: GCC generates the same code size as with the plain macro on > ARM (64 and 32 bit). > > + */ > > +static_assert((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) == > UINT64_C(0x60) << (7 * CHAR_BIT), > > + "(RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) is not 0x60 at byte > offset 7"); > > +#define RTE_MBUF_DIRECT(mb) !(((const char *)(&(mb)- > >ol_flags))[7] & 0x60) > > +#elif RTE_BYTE_ORDER == RTE_BIG_ENDIAN > > +/* As described above; but on big endian architecture, the MSB is at > byte offset 0. */ > > +static_assert((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) == > UINT64_C(0x60) << (7 * CHAR_BIT), > > + "(RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) is not 0x60 at byte > offset 0"); > > +#define RTE_MBUF_DIRECT(mb) !(((const char *)(&(mb)- > >ol_flags))[0] & 0x60) > > +#endif /* RTE_BYTE_ORDER */ > > Minor nit, the static assert is common, so can be put out of the > endianness > check rather than duplicated.
I considered a generic common static_assert, but prefer individual ones with specific error texts (byte offset 7 or 0). > I'd also suggest putting the the comment > outside it too, so that the endianness check only covers one line each. Good idea. While doing this, I'll see how it looks with a common static_assert and individual ones. > Also, in the comment, showing an example on one arch is probably > enough, > showing two sort of implies that you should show all, while if you show > just one, you can clearly state its just one example. Agree. Less is more. > > BTW: if you want to remove the endian ifdefs, you can compute the > offset > as e.g. > > #define OLF_MSB (7 * (RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN)) // 7 for > LE, 0 for BE > #define RTE_MBUF_DIRECT(mb) !(((const char *)(&(mb)- > >ol_flags))[OLF_MSB] & 0x60) I don't think it improves source code readability, so not going to do it. Also, it's a header file, so the OLF_MSB would need to have some obscure name. Alternatively reside in rte_common.h with a generic name for similar purposes. > > > > +#endif /* RTE_TOOLCHAIN_GCC */ > > > > /** Uninitialized or unspecified port. */ > > #define RTE_MBUF_PORT_INVALID UINT16_MAX > > -- > > 2.43.0 > > Will post a v5.

