>
> Refactored rte_pktmbuf_prefree_seg() for both performance and readability.
>
> With the optimized RTE_MBUF_DIRECT() macro, the common likely code path
> now fits within one instruction cache line on x86-64 when built with GCC.
>
> Signed-off-by: Morten Brørup <[email protected]>
> Acked-by: Konstantin Ananyev <[email protected]>
> Acked-by: Chengwen Feng <[email protected]>
> Reviewed-by: Bruce Richardson <[email protected]>
> ---
> v5:
> * Removed the plain RTE_MBUF_DIRECT() macro, and only use the optimized
>   variant. (Bruce Richardson)
>   Further testing on Godbolt confirmed that other compilers benefit from
>   the optimized macro too.
> * Shortened the description of the RTE_MBUF_DIRECT() macro, and only
>   provide one example of code emitted by a compiler. (Bruce Richardson)
> * Consolidated the static_assert() into one, covering both little and big
>   endian.
>   This also reduces the amount of endian-conditional source code and
>   improves readability.
>   (Bruce Richardson)
> * Added comment about MSB meaning "most significant byte".
> v4:
> * Enabled the optimized RTE_MBUF_DIRECT() macro for GCC on all
>   architectures.
> v3:
> * Rewrote the optimized RTE_MBUF_DIRECT() macro for readability; use
>   numerical value instead of long names. (Bruce Richardson)
> * Enabled the optimized RTE_MBUF_DIRECT() macro for Loongarch too.
> v2:
> * Fixed typo in commit description.
> * Fixed indentation.
> * Added detailed description to the optimized RTE_MBUF_DIRECT() macro.
>   (Stephen Hemminger)
> * Added static_assert() to verify that the optimized RTE_MBUF_DIRECT()
>   macro is valid, specifically that the tested bits are in the MSB of the
>   64-bit field.
> ---
>  lib/mbuf/rte_mbuf.h      | 51 +++++++++++++++-------------------
>  lib/mbuf/rte_mbuf_core.h | 33 +++++++++++++++++++++++---
>  2 files changed, 49 insertions(+), 35 deletions(-)
>
> diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
> index 3df22125de..2004391f57 100644
> --- a/lib/mbuf/rte_mbuf.h
> +++ b/lib/mbuf/rte_mbuf.h
> @@ -31,6 +31,7 @@
>   * http://www.kohala.com/start/tcpipiv2.html
>   */
>
> +#include <stdbool.h>
>  #include <stdint.h>
>
>  #include <rte_common.h>
> @@ -1458,44 +1459,30 @@ static inline int __rte_pktmbuf_pinned_extbuf_decref(struct rte_mbuf *m)
>  static __rte_always_inline struct rte_mbuf *
>  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>  {
> -        __rte_mbuf_sanity_check(m, 0);
> -
> -        if (likely(rte_mbuf_refcnt_read(m) == 1)) {
> -
> -                if (!RTE_MBUF_DIRECT(m)) {
> -                        rte_pktmbuf_detach(m);
> -                        if (RTE_MBUF_HAS_EXTBUF(m) &&
> -                            RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -                            __rte_pktmbuf_pinned_extbuf_decref(m))
> -                                return NULL;
> -                }
> -
> -                if (m->next != NULL)
> -                        m->next = NULL;
> -                if (m->nb_segs != 1)
> -                        m->nb_segs = 1;
> +        bool refcnt_not_one;
>
> -                return m;
> +        __rte_mbuf_sanity_check(m, 0);
>
> -        } else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
> +        refcnt_not_one = unlikely(rte_mbuf_refcnt_read(m) != 1);
> +        if (refcnt_not_one && __rte_mbuf_refcnt_update(m, -1) != 0)
> +                return NULL;
>
> -                if (!RTE_MBUF_DIRECT(m)) {
> -                        rte_pktmbuf_detach(m);
> -                        if (RTE_MBUF_HAS_EXTBUF(m) &&
> -                            RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> -                            __rte_pktmbuf_pinned_extbuf_decref(m))
> -                                return NULL;
> -                }
> +        if (unlikely(!RTE_MBUF_DIRECT(m))) {
> +                rte_pktmbuf_detach(m);
> +                if (RTE_MBUF_HAS_EXTBUF(m) &&
> +                    RTE_MBUF_HAS_PINNED_EXTBUF(m) &&
> +                    __rte_pktmbuf_pinned_extbuf_decref(m))
> +                        return NULL;
> +        }
>
> -                if (m->next != NULL)
> -                        m->next = NULL;
> -                if (m->nb_segs != 1)
> -                        m->nb_segs = 1;
> +        if (refcnt_not_one)
>                  rte_mbuf_refcnt_set(m, 1);
> +        if (m->nb_segs != 1)
> +                m->nb_segs = 1;
> +        if (m->next != NULL)
> +                m->next = NULL;
>
> -                return m;
> -        }
> -        return NULL;
> +        return m;
>  }
>
>  /**
> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> index a0df265b5d..1dfeab0511 100644
> --- a/lib/mbuf/rte_mbuf_core.h
> +++ b/lib/mbuf/rte_mbuf_core.h
> @@ -711,9 +711,36 @@ struct rte_mbuf_ext_shared_info {
>   *
>   * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf
>   * can be defined as a direct mbuf.
> - */
> -#define RTE_MBUF_DIRECT(mb) \
> -        (!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
> + *
> + * Note: Macro optimized for code size.
> + *
> + * The plain macro would be:
> + *   #define RTE_MBUF_DIRECT(mb) \
> + *     (!((mb)->ol_flags & (RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL)))
> + *
> + * The flags RTE_MBUF_F_INDIRECT and RTE_MBUF_F_EXTERNAL are both in the MSB (most significant
> + * byte) of the 64-bit ol_flags field, so we only compare this one byte instead of all 64 bits.
> + *
> + * E.g., GCC version 16.0.0 20251019 (experimental) generates the following code for x86-64.
> + *
> + * With the plain macro, 17 bytes of instructions:
> + *   movabs rax,0x6000000000000000    // 10 bytes
> + *   and    rax,QWORD PTR [rdi+0x18]  // 4 bytes
> + *   sete   al                        // 3 bytes
> + * With this optimized macro, only 7 bytes of instructions:
> + *   test   BYTE PTR [rdi+0x1f],0x60  // 4 bytes
> + *   sete   al                        // 3 bytes
> + */
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +/* On little endian architecture, the MSB of a 64-bit integer is at byte offset 7. */
> +#define RTE_MBUF_DIRECT(mb) !(((const char *)(&(mb)->ol_flags))[7] & 0x60)
> +#elif RTE_BYTE_ORDER == RTE_BIG_ENDIAN
> +/* On big endian architecture, the MSB of a 64-bit integer is at byte offset 0. */
> +#define RTE_MBUF_DIRECT(mb) !(((const char *)(&(mb)->ol_flags))[0] & 0x60)

A stupid question: why not simply do (mb->ol_flags >> 56) & 0x60 then? That should help avoid all these LE/BE casts, etc. (A sketch comparing the two forms follows the quoted patch below.)

> +#endif
> +/* Verify the optimization above. */
> +static_assert((RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) == UINT64_C(0x60) << (7 * CHAR_BIT),
> +        "(RTE_MBUF_F_INDIRECT | RTE_MBUF_F_EXTERNAL) is not 0x60 at MSB");
>
>  /** Uninitialized or unspecified port. */
>  #define RTE_MBUF_PORT_INVALID UINT16_MAX
> --
> 2.43.0
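
To make the question concrete, here is a minimal standalone sketch that can be pasted into a compiler explorer. It is not DPDK code: mb_sketch, SKETCH_FLAGS and the direct_*() functions are illustrative stand-ins, with the flag constant and the 0x18 field offset chosen only to match the values cited in the patch comment above. It contrasts the plain 64-bit mask, the byte test used in the patch, and the shift form suggested in the question.

/*
 * Standalone sketch, not DPDK code: a stand-in struct with a 64-bit
 * ol_flags field at offset 0x18 (matching the offsets in the asm quoted
 * above) and the two flag bits combined as 0x6000000000000000.
 */
#include <assert.h>
#include <limits.h>
#include <stdint.h>

struct mb_sketch {
        char pad[0x18];         /* padding so ol_flags lands at offset 0x18 */
        uint64_t ol_flags;
};

#define SKETCH_FLAGS UINT64_C(0x6000000000000000)  /* INDIRECT | EXTERNAL stand-in */

/* Both flag bits must sit in the most significant byte for the byte/shift tests. */
static_assert(SKETCH_FLAGS == UINT64_C(0x60) << (7 * CHAR_BIT),
        "flags are not 0x60 in the MSB");

/* Plain variant: full 64-bit mask, needs a movabs on x86-64. */
int direct_plain(const struct mb_sketch *mb)
{
        return !(mb->ol_flags & SKETCH_FLAGS);
}

/* Patch variant: test a single byte of the field (offset 7 on little endian). */
int direct_byte(const struct mb_sketch *mb)
{
        return !(((const char *)&mb->ol_flags)[7] & 0x60);
}

/* Suggested variant: shift the MSB down instead of casting, endian-neutral. */
int direct_shift(const struct mb_sketch *mb)
{
        return !((mb->ol_flags >> 56) & 0x60);
}

Under these assumptions all three functions are equivalent; whether the shift form collapses to the same single-byte test as the cast form is a codegen question best checked per compiler and architecture.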

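For readers following the control-flow change in rte_pktmbuf_prefree_seg(), here is a minimal model of the refactoring pattern. It is not the DPDK function: the types and names are simplified stand-ins, and a plain decrement stands in for the atomic __rte_mbuf_refcnt_update(). It shows the single up-front refcount test kept in a bool, one shared copy of the cleanup code, and conditional stores, so the likely path (refcount 1, fields already at their defaults) falls straight through without writing anything, presumably helping the patch's "one instruction cache line" goal.

/* Minimal model of the control-flow change, not DPDK code. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct seg_model {
        uint16_t refcnt;
        uint16_t nb_segs;
        struct seg_model *next;
};

/* Returns the segment if the caller was the last owner, NULL otherwise. */
static struct seg_model *
prefree_model(struct seg_model *s)
{
        bool refcnt_not_one = (s->refcnt != 1);

        /* One refcount test up front; the real code updates it atomically. */
        if (refcnt_not_one && --s->refcnt != 0)
                return NULL;            /* other owners remain */

        /* Single copy of the reset code, shared by both paths; the
         * conditional stores skip writes when the fields already hold
         * their default values. */
        if (refcnt_not_one)
                s->refcnt = 1;
        if (s->nb_segs != 1)
                s->nb_segs = 1;
        if (s->next != NULL)
                s->next = NULL;

        return s;                       /* caller may recycle the segment */
}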
