From: Scott Mitchell <[email protected]>

Thread Safety:
The tp_status field was accessed without proper memory barriers,
violating the kernel's PACKET_MMAP synchronization protocol. The
kernel implements this protocol in net/packet/af_packet.c:
- __packet_get_status: smp_rmb() then READ_ONCE() (volatile read)
- __packet_set_status: WRITE_ONCE() (volatile write) then smp_wmb()

READ_ONCE/WRITE_ONCE use __may_alias__ attribute via __uXX_alias_t
types to prevent compiler optimizations that assume type-based aliasing
rules, which is critical for tp_status access that may be misaligned
within the ring buffer.

Userspace must use equivalent semantics: volatile unaligned_uint32_t
(with __rte_may_alias) reads/writes with explicit memory barriers
(rte_smp_rmb/rte_smp_wmb).

On aarch64 and other weakly-ordered architectures, missing barriers
cause packet corruption because:
- RX: CPU may read stale packet data before seeing tp_status update
- TX: CPU may reorder stores, causing kernel to see tp_status before
  packet data is fully written

This becomes critical with io_uring SQPOLL mode where the kernel
polling thread on a different CPU core asynchronously updates
tp_status, making proper memory ordering essential.

Note: Uses rte_smp_[r/w]mb which triggers checkpatch warnings, but
C11 atomics cannot be used because tp_status is not declared _Atomic
in the kernel's tpacket2_hdr structure. We must match the kernel's
volatile + barrier memory model with __may_alias__ protection.

Frame Calculation Issues:
1. Frame overhead incorrectly calculated as TPACKET_ALIGN(TPACKET2_HDRLEN)
   instead of TPACKET2_HDRLEN - sizeof(struct sockaddr_ll), causing
   incorrect usable frame data size.

2. Frame address calculation assumed sequential layout
   (frame_base + i * frame_size), but the kernel's packet_lookup_frame()
   uses block-based addressing:
     block_idx = position / frames_per_block
     frame_offset = position % frames_per_block
     address = block_start[block_idx] + (frame_offset * frame_size)

   This caused memory corruption when frames don't evenly divide blocks.

Fixes: 364e08f2bbc0 ("af_packet: add PMD for AF_PACKET-based virtual devices")
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Scott Mitchell <[email protected]>
---
Depends-on: patch-160468 ("eal: add __rte_may_alias and __rte_aligned to 
unaligned typedefs")

 drivers/net/af_packet/rte_eth_af_packet.c | 149 +++++++++++++++++-----
 1 file changed, 114 insertions(+), 35 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c 
b/drivers/net/af_packet/rte_eth_af_packet.c
index ef11b8fb6b..6c276bb7fc 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -9,6 +9,8 @@
 #include <rte_common.h>
 #include <rte_string_fns.h>
 #include <rte_mbuf.h>
+#include <rte_atomic.h>
+#include <rte_bitops.h>
 #include <ethdev_driver.h>
 #include <ethdev_vdev.h>
 #include <rte_malloc.h>
@@ -41,6 +43,10 @@
 #define DFLT_FRAME_SIZE                (1 << 11)
 #define DFLT_FRAME_COUNT       (1 << 9)
 
+static const uint16_t ETH_AF_PACKET_FRAME_SIZE_MAX = RTE_IPV4_MAX_PKT_LEN;
+#define ETH_AF_PACKET_FRAME_OVERHEAD (TPACKET2_HDRLEN - sizeof(struct 
sockaddr_ll))
+#define ETH_AF_PACKET_ETH_OVERHEAD (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)
+
 static uint64_t timestamp_dynflag;
 static int timestamp_dynfield_offset = -1;
 
@@ -120,6 +126,28 @@ RTE_LOG_REGISTER_DEFAULT(af_packet_logtype, NOTICE);
        RTE_LOG_LINE(level, AFPACKET, "%s(): " fmt ":%s", __func__, \
                ## __VA_ARGS__, strerror(errno))
 
+/**
+ * Read tp_status from packet mmap ring. Matches kernel's READ_ONCE() with 
smp_rmb()
+ * ordering in af_packet.c __packet_get_status.
+ */
+static inline uint32_t
+tpacket_read_status(const volatile void *tp_status)
+{
+       rte_smp_rmb();
+       return *((const volatile unaligned_uint32_t *)tp_status);
+}
+
+/**
+ * Write tp_status to packet mmap ring. Matches kernel's WRITE_ONCE() with 
smp_wmb()
+ * ordering in af_packet.c __packet_set_status.
+ */
+static inline void
+tpacket_write_status(volatile void *tp_status, uint32_t status)
+{
+       *((volatile unaligned_uint32_t *)tp_status) = status;
+       rte_smp_wmb();
+}
+
 static uint16_t
 eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -129,7 +157,8 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
        uint8_t *pbuf;
        struct pkt_rx_queue *pkt_q = queue;
        uint16_t num_rx = 0;
-       unsigned long num_rx_bytes = 0;
+       uint32_t num_rx_bytes = 0;
+       uint32_t tp_status;
        unsigned int framecount, framenum;
 
        if (unlikely(nb_pkts == 0))
@@ -144,7 +173,8 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
        for (i = 0; i < nb_pkts; i++) {
                /* point at the next incoming frame */
                ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
-               if ((ppd->tp_status & TP_STATUS_USER) == 0)
+               tp_status = tpacket_read_status(&ppd->tp_status);
+               if ((tp_status & TP_STATUS_USER) == 0)
                        break;
 
                /* allocate the next mbuf */
@@ -160,7 +190,7 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                memcpy(rte_pktmbuf_mtod(mbuf, void *), pbuf, 
rte_pktmbuf_data_len(mbuf));
 
                /* check for vlan info */
-               if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+               if (tp_status & TP_STATUS_VLAN_VALID) {
                        mbuf->vlan_tci = ppd->tp_vlan_tci;
                        mbuf->ol_flags |= (RTE_MBUF_F_RX_VLAN | 
RTE_MBUF_F_RX_VLAN_STRIPPED);
 
@@ -179,7 +209,7 @@ eth_af_packet_rx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                }
 
                /* release incoming frame and advance ring buffer */
-               ppd->tp_status = TP_STATUS_KERNEL;
+               tpacket_write_status(&ppd->tp_status, TP_STATUS_KERNEL);
                if (++framenum >= framecount)
                        framenum = 0;
                mbuf->port = pkt_q->in_port;
@@ -228,8 +258,8 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
        struct pollfd pfd;
        struct pkt_tx_queue *pkt_q = queue;
        uint16_t num_tx = 0;
-       unsigned long num_tx_bytes = 0;
-       int i;
+       uint32_t num_tx_bytes = 0;
+       uint16_t i;
 
        if (unlikely(nb_pkts == 0))
                return 0;
@@ -259,16 +289,6 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                        }
                }
 
-               /* point at the next incoming frame */
-               if (!tx_ring_status_available(ppd->tp_status)) {
-                       if (poll(&pfd, 1, -1) < 0)
-                               break;
-
-                       /* poll() can return POLLERR if the interface is down */
-                       if (pfd.revents & POLLERR)
-                               break;
-               }
-
                /*
                 * poll() will almost always return POLLOUT, even if there
                 * are no extra buffers available
@@ -283,26 +303,28 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, 
uint16_t nb_pkts)
                 *
                 * This results in poll() returning POLLOUT.
                 */
-               if (!tx_ring_status_available(ppd->tp_status))
+               if 
(unlikely(!tx_ring_status_available(tpacket_read_status(&ppd->tp_status)) &&
+                       (poll(&pfd, 1, -1) < 0 || (pfd.revents & POLLERR) != 0 
||
+                        
!tx_ring_status_available(tpacket_read_status(&ppd->tp_status))))) {
+                       /* Ring is full, stop here. Don't process bufs[i]. */
                        break;
+               }
 
-               /* copy the tx frame data */
-               pbuf = (uint8_t *) ppd + TPACKET2_HDRLEN -
-                       sizeof(struct sockaddr_ll);
+               pbuf = (uint8_t *)ppd + ETH_AF_PACKET_FRAME_OVERHEAD;
 
                struct rte_mbuf *tmp_mbuf = mbuf;
-               while (tmp_mbuf) {
+               do {
                        uint16_t data_len = rte_pktmbuf_data_len(tmp_mbuf);
                        memcpy(pbuf, rte_pktmbuf_mtod(tmp_mbuf, void*), 
data_len);
                        pbuf += data_len;
                        tmp_mbuf = tmp_mbuf->next;
-               }
+               } while (tmp_mbuf);
 
                ppd->tp_len = mbuf->pkt_len;
                ppd->tp_snaplen = mbuf->pkt_len;
 
                /* release incoming frame and advance ring buffer */
-               ppd->tp_status = TP_STATUS_SEND_REQUEST;
+               tpacket_write_status(&ppd->tp_status, TP_STATUS_SEND_REQUEST);
                if (++framenum >= framecount)
                        framenum = 0;
                ppd = (struct tpacket2_hdr *) pkt_q->rd[framenum].iov_base;
@@ -392,10 +414,12 @@ eth_dev_info(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)
 
        dev_info->if_index = internals->if_index;
        dev_info->max_mac_addrs = 1;
-       dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
+       dev_info->max_rx_pktlen = (uint32_t)ETH_AF_PACKET_FRAME_SIZE_MAX +
+               ETH_AF_PACKET_ETH_OVERHEAD;
+       dev_info->max_mtu = ETH_AF_PACKET_FRAME_SIZE_MAX;
        dev_info->max_rx_queues = (uint16_t)internals->nb_queues;
        dev_info->max_tx_queues = (uint16_t)internals->nb_queues;
-       dev_info->min_rx_bufsize = 0;
+       dev_info->min_rx_bufsize = ETH_AF_PACKET_ETH_OVERHEAD;
        dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
                RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
        dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
@@ -572,8 +596,7 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
        /* Now get the space available for data in the mbuf */
        buf_size = rte_pktmbuf_data_room_size(pkt_q->mb_pool) -
                RTE_PKTMBUF_HEADROOM;
-       data_size = internals->req.tp_frame_size;
-       data_size -= TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
+       data_size = internals->req.tp_frame_size - ETH_AF_PACKET_FRAME_OVERHEAD;
 
        if (data_size > buf_size) {
                PMD_LOG(ERR,
@@ -612,7 +635,7 @@ eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
        int ret;
        int s;
        unsigned int data_size = internals->req.tp_frame_size -
-                                TPACKET2_HDRLEN;
+                                ETH_AF_PACKET_FRAME_OVERHEAD;
 
        if (mtu > data_size)
                return -EINVAL;
@@ -977,25 +1000,38 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                rx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
                if (rx_queue->rd == NULL)
                        goto error;
+               /* Frame addresses must match kernel's packet_lookup_frame():
+                * block_idx = position / frames_per_block
+                * frame_offset = position % frames_per_block
+                * address = block_start + (frame_offset * frame_size)
+                */
+               const uint32_t frames_per_block = req->tp_block_size / 
req->tp_frame_size;
                for (i = 0; i < req->tp_frame_nr; ++i) {
-                       rx_queue->rd[i].iov_base = rx_queue->map + (i * 
framesize);
+                       const uint32_t block_idx = i / frames_per_block;
+                       const uint32_t frame_in_block = i % frames_per_block;
+                       rx_queue->rd[i].iov_base = rx_queue->map +
+                               (block_idx * req->tp_block_size) +
+                               (frame_in_block * req->tp_frame_size);
                        rx_queue->rd[i].iov_len = req->tp_frame_size;
                }
                rx_queue->sockfd = qsockfd;
 
                tx_queue = &((*internals)->tx_queue[q]);
                tx_queue->framecount = req->tp_frame_nr;
-               tx_queue->frame_data_size = req->tp_frame_size;
-               tx_queue->frame_data_size -= TPACKET2_HDRLEN -
-                       sizeof(struct sockaddr_ll);
+               tx_queue->frame_data_size = req->tp_frame_size - 
ETH_AF_PACKET_FRAME_OVERHEAD;
 
                tx_queue->map = rx_queue->map + req->tp_block_size * 
req->tp_block_nr;
 
                tx_queue->rd = rte_zmalloc_socket(name, rdsize, 0, numa_node);
                if (tx_queue->rd == NULL)
                        goto error;
+               /* See comment above rx_queue->rd initialization. */
                for (i = 0; i < req->tp_frame_nr; ++i) {
-                       tx_queue->rd[i].iov_base = tx_queue->map + (i * 
framesize);
+                       const uint32_t block_idx = i / frames_per_block;
+                       const uint32_t frame_in_block = i % frames_per_block;
+                       tx_queue->rd[i].iov_base = tx_queue->map +
+                               (block_idx * req->tp_block_size) +
+                               (frame_in_block * req->tp_frame_size);
                        tx_queue->rd[i].iov_len = req->tp_frame_size;
                }
                tx_queue->sockfd = qsockfd;
@@ -1092,7 +1128,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
        if (*sockfd < 0)
                return -1;
 
-       blocksize = getpagesize();
+       const int pagesize = getpagesize();
+       blocksize = pagesize;
 
        /*
         * Walk arguments for configurable settings
@@ -1162,13 +1199,55 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
                return -1;
        }
 
-       blockcount = framecount / (blocksize / framesize);
+       const unsigned int frames_per_block = blocksize / framesize;
+       blockcount = framecount / frames_per_block;
        if (!blockcount) {
                PMD_LOG(ERR,
                        "%s: invalid AF_PACKET MMAP parameters", name);
                return -1;
        }
 
+       /*
+        * https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+        * Check constraints that may be enforced by the kernel and cause 
failure
+        * to initialize the rings but explicit error messages aren't provided.
+        * See packet_set_ring in linux kernel for enforcement:
+        * https://github.com/torvalds/linux/blob/master/net/packet/af_packet.c
+        */
+       if (blocksize % pagesize != 0) {
+               /* tp_block_size must be a multiple of PAGE_SIZE */
+               PMD_LOG(WARNING, "%s: %s=%u must be a multiple of PAGE_SIZE=%d",
+                       name, ETH_AF_PACKET_BLOCKSIZE_ARG, blocksize, pagesize);
+       }
+       if (framesize % TPACKET_ALIGNMENT != 0) {
+               /* tp_frame_size must be a multiple of TPACKET_ALIGNMENT */
+               PMD_LOG(WARNING, "%s: %s=%u must be a multiple of 
TPACKET_ALIGNMENT=%d",
+                       name, ETH_AF_PACKET_FRAMESIZE_ARG, framesize, 
TPACKET_ALIGNMENT);
+       }
+       if (frames_per_block == 0 || frames_per_block > UINT_MAX / blockcount ||
+           framecount != frames_per_block * blockcount) {
+               /* tp_frame_nr must be exactly frames_per_block*tp_block_nr */
+               PMD_LOG(WARNING, "%s: %s=%u must be exactly "
+                       "frames_per_block(%s/%s = %u/%u = %u) * blockcount(%u)",
+                       name, ETH_AF_PACKET_FRAMECOUNT_ARG, framecount,
+                       ETH_AF_PACKET_BLOCKSIZE_ARG, 
ETH_AF_PACKET_FRAMESIZE_ARG,
+                       blocksize, framesize, frames_per_block, blockcount);
+       }
+
+       /* Below conditions may not cause errors but provide hints to improve */
+       if (blocksize % framesize != 0) {
+               PMD_LOG(WARNING, "%s: %s=%u not evenly divisible by %s=%u, "
+                       "may waste memory", name,
+                       ETH_AF_PACKET_BLOCKSIZE_ARG, blocksize,
+                       ETH_AF_PACKET_FRAMESIZE_ARG, framesize);
+       }
+       if (!rte_is_power_of_2(blocksize)) {
+               /* tp_block_size should be a power of two or there will be 
waste */
+               PMD_LOG(WARNING, "%s: %s=%u should be a power of two "
+                       "or there will be a waste of memory",
+                       name, ETH_AF_PACKET_BLOCKSIZE_ARG, blocksize);
+       }
+
        PMD_LOG(DEBUG, "%s: AF_PACKET MMAP parameters:", name);
        PMD_LOG(DEBUG, "%s:\tblock size %d", name, blocksize);
        PMD_LOG(DEBUG, "%s:\tblock count %d", name, blockcount);
-- 
2.39.5 (Apple Git-154)

Reply via email to