>diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
>index 529debb..e3ee0b3 100644
>--- a/lib/librte_mbuf/rte_mbuf.h
>+++ b/lib/librte_mbuf/rte_mbuf.h
>@@ -842,6 +842,44 @@ struct rte_mbuf {
>       uint16_t timesync;
> } __rte_cache_aligned;
> 
>+/**
>+ * Prefetch the first part of the mbuf
>+ *
>+ * The first 64 bytes of the mbuf correspond to fields that are used early
>+ * in the receive path. If the architecture's cache line size is larger
>+ * than 64 bytes, the second part will also be prefetched.
>+ *
>+ * @param m
>+ *   The pointer to the mbuf.
>+ */
>+static inline void
>+rte_mbuf_prefetch_part0(struct rte_mbuf *m)
>+{
>+      rte_prefetch0(&m->cacheline0);
>+}
>+
>+/**
>+ * Prefetch the second part of the mbuf
>+ *
>+ * The next 64 bytes of the mbuf correspond to fields that are used in the
>+ * transmit path. If the architecture's cache line size is larger than
>+ * 64 bytes, this function does nothing, as the full mbuf is expected to
>+ * already be in cache.
>+ *
>+ * @param m
>+ *   The pointer to the mbuf.
>+ */
>+static inline void
>+rte_mbuf_prefetch_part1(struct rte_mbuf *m)
>+{
>+#if RTE_CACHE_LINE_SIZE == 64
>+      rte_prefetch0(&m->cacheline1);
>+#else
>+      RTE_SET_USED(m);
>+#endif
>+}
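
For context, here is an untested sketch of how I would expect these
helpers to be used in a driver receive loop: prefetch the next mbuf
while the current one is still being processed. process_packet() is
just a placeholder, not a DPDK API.

        extern void process_packet(struct rte_mbuf *m); /* placeholder */

        static void
        handle_burst(struct rte_mbuf **pkts, uint16_t nb_rx)
        {
                uint16_t i;

                for (i = 0; i < nb_rx; i++) {
                        /* Warm the first cacheline of the next mbuf
                         * before we start parsing the current one. */
                        if (i + 1 < nb_rx)
                                rte_mbuf_prefetch_part0(pkts[i + 1]);
                        process_packet(pkts[i]);
                }
        }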

I am not super happy with the names here, but I understand that
rte_mbuf_prefetch_cacheline0() is a bit long. I could live with longer
names if they add to readability.

Another idea is to have only one function for both:

// Optional enum if you want
enum { MBUF_CACHELINE0 = 0, MBUF_CACHELINE1, MBUF_CACHELINES };

// Make sure we add a comment about the constant value
static inline void
rte_mbuf_prefetch(struct rte_mbuf *m, unsigned cacheline)
{
        if (cacheline == MBUF_CACHELINE0) {
                rte_prefetch0(&m->cacheline0);
        } else if (cacheline == MBUF_CACHELINE1) {
                rte_prefetch0(&m->cacheline1);
        } else {
                rte_prefetch0(&m->cacheline0);
                rte_prefetch0(&m->cacheline1);
        }
}

I believe that if you pass a constant value for the cacheline argument,
the compiler should optimize out the dead branches. If not, then what
about a macro instead?

#define rte_mbuf_prefetch(m, c) \
        do { \
                if ((c) == MBUF_CACHELINE0) \
                        rte_prefetch0(&(m)->cacheline0); \
                else if ((c) == MBUF_CACHELINE1) \
                        rte_prefetch0(&(m)->cacheline1); \
                else { \
                        rte_prefetch0(&(m)->cacheline0); \
                        rte_prefetch0(&(m)->cacheline1); \
                } \
        } while (0)

Call like this:
        rte_mbuf_prefetch(m, 0);        // For cacheline 0
        rte_mbuf_prefetch(m, 1);        // For cacheline 1
        rte_mbuf_prefetch(m, 2);        // For cacheline 0 and 1
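
If we keep the enum, using the names would make the call sites
self-documenting:

        rte_mbuf_prefetch(m, MBUF_CACHELINE0);
        rte_mbuf_prefetch(m, MBUF_CACHELINE1);
        rte_mbuf_prefetch(m, MBUF_CACHELINES);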

We could have another routine:
        rte_mbuf_prefetch_data(m, 0);   // Prefetch the first cacheline
                                        // of the packet data
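
An untested sketch of what that could look like, using the existing
rte_pktmbuf_mtod_offset() macro to index cachelines within the packet
data:

        /* Prefetch the given cacheline of the packet data. An index
         * of 0 means the first RTE_CACHE_LINE_SIZE bytes of payload. */
        static inline void
        rte_mbuf_prefetch_data(struct rte_mbuf *m, unsigned cacheline)
        {
                rte_prefetch0(rte_pktmbuf_mtod_offset(m, void *,
                        cacheline * RTE_CACHE_LINE_SIZE));
        }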

Just a thought; I did not test the above code, so I hope it works as
described. I noticed something similar in the Linux spinlock code a few
years ago.



>+
>+
> static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
> 
> /**
>-- 
>2.8.0.rc3
>
>


Regards,
Keith



