>diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h >index 529debb..e3ee0b3 100644 >--- a/lib/librte_mbuf/rte_mbuf.h >+++ b/lib/librte_mbuf/rte_mbuf.h >@@ -842,6 +842,44 @@ struct rte_mbuf { > uint16_t timesync; > } __rte_cache_aligned; > >+/** >+ * Prefetch the first part of the mbuf >+ * >+ * The first 64 bytes of the mbuf corresponds to fields that are used early >+ * in the receive path. If the cache line of the architecture is higher than >+ * 64B, the second part will also be prefetched. >+ * >+ * @param m >+ * The pointer to the mbuf. >+ */ >+static inline void >+rte_mbuf_prefetch_part0(struct rte_mbuf *m) >+{ >+ rte_prefetch0(&m->cacheline0); >+} >+ >+/** >+ * Prefetch the second part of the mbuf >+ * >+ * The next 64 bytes of the mbuf corresponds to fields that are used in the >+ * transmit path. If the cache line of the architecture is higher than 64B, >+ * this function does nothing as it is expected that the full mbuf is >+ * already in cache. >+ * >+ * @param m >+ * The pointer to the mbuf. >+ */ >+static inline void >+rte_mbuf_prefetch_part1(struct rte_mbuf *m) >+{ >+#if RTE_CACHE_LINE_SIZE == 64 >+ rte_prefetch0(&m->cacheline1); >+#else >+ RTE_SET_USED(m); >+#endif >+}
I am not super happy with the names here, but I understand that rte_mbuf_prefetch_cacheline0() is a bit long. I could live with them being longer if that makes more sense and adds to readability. Another idea is to have only one function for both: enum { MBUF_CACHELINE0 = 0, MBUF_CACHELINE1, MBUF_CACHELINES }; // Optional enum if you want static inline void rte_mbuf_prefetch(struct rte_mbuf *m, unsigned cacheline) // Make sure we add a comment about the constant value { if (cacheline == MBUF_CACHELINE0) rte_prefetch0(&m->cacheline0); else if (cacheline == MBUF_CACHELINE1) rte_prefetch0(&m->cacheline1); else { rte_prefetch0(&m->cacheline0); rte_prefetch0(&m->cacheline1); } } I believe if you use a constant value in the call for the cacheline variable then the extra code should be optimized out. If not, then what about a macro instead? #define rte_mbuf_prefetch(m, c) \ do { \ if ((c) == MBUF_CACHELINE0) \ rte_prefetch0(&(m)->cacheline0); \ else if ((c) == MBUF_CACHELINE1) \ rte_prefetch0(&(m)->cacheline1); \ else { \ rte_prefetch0(&(m)->cacheline0); \ rte_prefetch0(&(m)->cacheline1); \ } \ } while (0) Call like this: rte_mbuf_prefetch(m, 0); // For cacheline 0 rte_mbuf_prefetch(m, 1); // For cacheline 1 rte_mbuf_prefetch(m, 2); // For cacheline 0 and 1 We could have another routine: rte_mbuf_prefetch_data(m, 0); // Prefetch the first cacheline of the packet data. Just a thought, and I did not test the above code, so I hope it works that way. I noticed something like this in the Linux spinlock code a few years ago. >+ >+ > static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp); > > /** >-- >2.8.0.rc3 > > Regards, Keith