Implement the single/dual/quad loop design pattern from FD.io VPP to improve cache efficiency in the af_packet PMD receive path.
The original implementation processes packets one at a time in a simple
loop, which can result in cache misses when accessing frame headers and
packet data.

The new implementation:
  - Processes packets in batches of 4 (quad), 2 (dual), and 1 (single)
  - Prefetches the next batch of frame headers while processing the
    current batch
  - Prefetches packet data before memcpy to hide memory latency
  - Reduces loop overhead through partial unrolling

Two helper functions are introduced:
  - af_packet_get_frame(): Returns the frame pointer at an index, with
    wraparound
  - af_packet_rx_one(): Common per-packet processing (mbuf alloc,
    memcpy, VLAN handling, timestamp offload)

The quad loop checks the availability of all 4 frames before processing,
falling through to the dual/single loops when fewer frames are ready.
Early exit paths (out_advance1/2/3) ensure correct frame index tracking
when mbuf allocation fails mid-batch.

Prefetch strategy:
  - Frame headers: prefetch N+4..N+7 while processing N..N+3
  - Packet data: prefetch at tp_mac offset before memcpy

This pattern is well established in high-performance packet processing
and should improve throughput by making better use of the CPU cache
hierarchy, which is particularly beneficial when processing bursts of
packets.
Signed-off-by: Stephen Hemminger <[email protected]> --- drivers/net/af_packet/rte_eth_af_packet.c | 208 +++++++++++++++++----- 1 file changed, 164 insertions(+), 44 deletions(-) diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c index 5847e14d80..946c21d878 100644 --- a/drivers/net/af_packet/rte_eth_af_packet.c +++ b/drivers/net/af_packet/rte_eth_af_packet.c @@ -14,6 +14,7 @@ #include <rte_malloc.h> #include <rte_kvargs.h> #include <bus_vdev_driver.h> +#include <rte_prefetch.h> #include <errno.h> #include <linux/if_ether.h> @@ -120,75 +121,194 @@ RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE); RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \ ## __VA_ARGS__, strerror(errno)) -static uint16_t -eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +/* + * Helper to get the frame pointer at a given index with wraparound + */ +static inline struct tpacket2_hdr * +af_packet_get_frame(struct pkt_rx_queue *pkt_q, unsigned int idx) +{ + if (idx >= pkt_q->framecount) + idx -= pkt_q->framecount; + return (struct tpacket2_hdr *)pkt_q->rd[idx].iov_base; +} + +/* + * Process a single received packet - common code for all loop variants + */ +static inline int +af_packet_rx_one(struct pkt_rx_queue *pkt_q, + struct tpacket2_hdr *ppd, + struct rte_mbuf **mbuf_out, + unsigned long *rx_bytes) { - unsigned i; - struct tpacket2_hdr *ppd; struct rte_mbuf *mbuf; uint8_t *pbuf; + + mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool); + if (unlikely(mbuf == NULL)) { + pkt_q->rx_nombuf++; + return -1; + } + + rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen; + pbuf = (uint8_t *)ppd + ppd->tp_mac; + memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf)); + + if (ppd->tp_status & TP_STATUS_VLAN_VALID) { + mbuf->vlan_tci = ppd->tp_vlan_tci; + mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED); + if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf)) + PMD_LOG(ERR, 
"Failed to reinsert VLAN tag"); + } + + if (pkt_q->timestamp_offloading) { + *RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset, + rte_mbuf_timestamp_t *) = + (uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec; + mbuf->ol_flags |= timestamp_dynflag; + } + + mbuf->port = pkt_q->in_port; + *mbuf_out = mbuf; + *rx_bytes += mbuf->pkt_len; + ppd->tp_status = TP_STATUS_KERNEL; + + return 0; +} + +/* + * Receive packets using VPP-style single/dual/quad loop pattern with prefetching. + */ +static uint16_t +eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) +{ struct pkt_rx_queue *pkt_q = queue; + struct tpacket2_hdr *ppd0, *ppd1, *ppd2, *ppd3; uint16_t num_rx = 0; unsigned long num_rx_bytes = 0; unsigned int framecount, framenum; + uint16_t n_left; if (unlikely(nb_pkts == 0)) return 0; - /* - * Reads the given number of packets from the AF_PACKET socket one by - * one and copies the packet data into a newly allocated mbuf. - */ framecount = pkt_q->framecount; framenum = pkt_q->framenum; - for (i = 0; i < nb_pkts; i++) { - /* point at the next incoming frame */ - ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base; - if ((ppd->tp_status & TP_STATUS_USER) == 0) + n_left = nb_pkts; + + /* Quad loop: Process 4 packets at a time with prefetching */ + while (n_left >= 4) { + ppd0 = af_packet_get_frame(pkt_q, framenum); + ppd1 = af_packet_get_frame(pkt_q, framenum + 1); + ppd2 = af_packet_get_frame(pkt_q, framenum + 2); + ppd3 = af_packet_get_frame(pkt_q, framenum + 3); + + if ((ppd0->tp_status & TP_STATUS_USER) == 0) break; + if ((ppd1->tp_status & TP_STATUS_USER) == 0) + goto dual_loop; + if ((ppd2->tp_status & TP_STATUS_USER) == 0) + goto dual_loop; + if ((ppd3->tp_status & TP_STATUS_USER) == 0) + goto dual_loop; + + /* Prefetch next 4 frame headers */ + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 4)); + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 5)); + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 6)); + 
rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 7)); + + /* Prefetch packet data */ + rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac); + rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac); + rte_prefetch0((uint8_t *)ppd2 + ppd2->tp_mac); + rte_prefetch0((uint8_t *)ppd3 + ppd3->tp_mac); + + if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out; + num_rx++; + if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out_advance1; + num_rx++; + if (unlikely(af_packet_rx_one(pkt_q, ppd2, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out_advance2; + num_rx++; + if (unlikely(af_packet_rx_one(pkt_q, ppd3, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out_advance3; + num_rx++; - /* allocate the next mbuf */ - mbuf = rte_pktmbuf_alloc(pkt_q->mb_pool); - if (unlikely(mbuf == NULL)) { - pkt_q->rx_nombuf++; + framenum += 4; + if (framenum >= framecount) + framenum -= framecount; + n_left -= 4; + } + +dual_loop: + /* Dual loop: Process 2 packets at a time */ + while (n_left >= 2) { + ppd0 = af_packet_get_frame(pkt_q, framenum); + ppd1 = af_packet_get_frame(pkt_q, framenum + 1); + + if ((ppd0->tp_status & TP_STATUS_USER) == 0) break; - } + if ((ppd1->tp_status & TP_STATUS_USER) == 0) + goto single_loop; - /* packet will fit in the mbuf, go ahead and receive it */ - rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf) = ppd->tp_snaplen; - pbuf = (uint8_t *) ppd + ppd->tp_mac; - memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, rte_pktmbuf_data_len(mbuf)); + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 2)); + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 3)); + rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac); + rte_prefetch0((uint8_t *)ppd1 + ppd1->tp_mac); - /* check for vlan info */ - if (ppd->tp_status & TP_STATUS_VLAN_VALID) { - mbuf->vlan_tci = ppd->tp_vlan_tci; - mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED); + if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0)) + 
goto out; + num_rx++; + if (unlikely(af_packet_rx_one(pkt_q, ppd1, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out_advance1; + num_rx++; - if (!pkt_q->vlan_strip && rte_vlan_insert(&mbuf)) - PMD_LOG(ERR, "Failed to reinsert VLAN tag"); - } + framenum += 2; + if (framenum >= framecount) + framenum -= framecount; + n_left -= 2; + } - /* add kernel provided timestamp when offloading is enabled */ - if (pkt_q->timestamp_offloading) { - /* since TPACKET_V2 timestamps are provided in nanoseconds resolution */ - *RTE_MBUF_DYNFIELD(mbuf, timestamp_dynfield_offset, - rte_mbuf_timestamp_t *) = - (uint64_t)ppd->tp_sec * 1000000000 + ppd->tp_nsec; +single_loop: + /* Single loop: Process remaining packets */ + while (n_left >= 1) { + ppd0 = af_packet_get_frame(pkt_q, framenum); - mbuf->ol_flags |= timestamp_dynflag; - } + if ((ppd0->tp_status & TP_STATUS_USER) == 0) + break; - /* release incoming frame and advance ring buffer */ - ppd->tp_status = TP_STATUS_KERNEL; - if (++framenum >= framecount) - framenum = 0; - mbuf->port = pkt_q->in_port; + rte_prefetch0(af_packet_get_frame(pkt_q, framenum + 1)); + rte_prefetch0((uint8_t *)ppd0 + ppd0->tp_mac); - /* account for the receive frame */ - bufs[i] = mbuf; + if (unlikely(af_packet_rx_one(pkt_q, ppd0, &bufs[num_rx], &num_rx_bytes) < 0)) + goto out; num_rx++; - num_rx_bytes += mbuf->pkt_len; + + if (++framenum >= framecount) + framenum = 0; + n_left--; } + + goto out; + +out_advance3: + framenum += 3; + if (framenum >= framecount) + framenum -= framecount; + goto out; +out_advance2: + framenum += 2; + if (framenum >= framecount) + framenum -= framecount; + goto out; +out_advance1: + framenum += 1; + if (framenum >= framecount) + framenum -= framecount; +out: pkt_q->framenum = framenum; pkt_q->rx_pkts += num_rx; pkt_q->rx_bytes += num_rx_bytes; -- 2.51.0

