Changing API xdp_return_frame() to take struct xdp_frame as argument,
seems like a natural choice. But there are some subtle performance
details here that needs extra care, which is a deliberate choice.

When de-referencing xdp_frame on a remote CPU during DMA-TX
completion, result in the cache-line is change to "Shared"
state. Later when the page is reused for RX, then this xdp_frame
cache-line is written, which change the state to "Modified".

This situation already happens (naturally) for, virtio_net, tun and
cpumap as the xdp_frame pointer is the queued object.  In tun and
cpumap, the ptr_ring is used for efficiently transferring cache-lines
(with pointers) between CPUs. Thus, the only option is to
de-referencing xdp_frame.

It is only the ixgbe driver that had an optimization, in which it can
avoid doing the de-reference of xdp_frame.  The driver already have
TX-ring queue, which (in case of remote DMA-TX completion) have to be
transferred between CPUs anyhow.  In this data area, we stored a
struct xdp_mem_info and a data pointer, which allowed us to avoid
de-referencing xdp_frame.

To compensate for this, a prefetchw is used for telling the cache
coherency protocol about our access pattern.  My benchmarks show that
this prefetchw is enough to compensate the ixgbe driver.

V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT")
V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address
and offset in dma_sync call")

Signed-off-by: Jesper Dangaard Brouer <bro...@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c     |    5 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h        |    4 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |   17 +++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |    1 +
 drivers/net/tun.c                               |    4 ++--
 drivers/net/virtio_net.c                        |    2 +-
 include/net/xdp.h                               |    2 +-
 kernel/bpf/cpumap.c                             |    6 +++---
 net/core/xdp.c                                  |    4 +++-
 9 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 96c54cbfb1f9..c8bf4d35fdea 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -638,8 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct 
i40e_ring *ring,
                if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
                        kfree(tx_buffer->raw_buf);
                else if (ring_is_xdp(ring))
-                       xdp_return_frame(tx_buffer->xdpf->data,
-                                        &tx_buffer->xdpf->mem);
+                       xdp_return_frame(tx_buffer->xdpf);
                else
                        dev_kfree_skb_any(tx_buffer->skb);
                if (dma_unmap_len(tx_buffer, len))
@@ -842,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
                /* free the skb/XDP data */
                if (ring_is_xdp(tx_ring))
-                       xdp_return_frame(tx_buf->xdpf->data, 
&tx_buf->xdpf->mem);
+                       xdp_return_frame(tx_buf->xdpf);
                else
                        napi_consume_skb(tx_buf->skb, napi_budget);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index abb5248e917e..7dd5038cfcc4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
        unsigned long time_stamp;
        union {
                struct sk_buff *skb;
-               /* XDP uses address ptr on irq_clean */
-               void *data;
+               struct xdp_frame *xdpf;
        };
        unsigned int bytecount;
        unsigned short gso_segs;
@@ -250,7 +249,6 @@ struct ixgbe_tx_buffer {
        DEFINE_DMA_UNMAP_ADDR(dma);
        DEFINE_DMA_UNMAP_LEN(len);
        u32 tx_flags;
-       struct xdp_mem_info xdp_mem;
 };
 
 struct ixgbe_rx_buffer {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f10904ec2172..4f2864165723 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector 
*q_vector,
 
                /* free the skb */
                if (ring_is_xdp(tx_ring))
-                       xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
+                       xdp_return_frame(tx_buffer->xdpf);
                else
                        napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2386,6 +2386,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector 
*q_vector,
                        xdp.data_hard_start = xdp.data -
                                              ixgbe_rx_offset(rx_ring);
                        xdp.data_end = xdp.data + size;
+                       prefetchw(xdp.data_hard_start); /* xdp_frame write */
 
                        skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
                }
@@ -5797,7 +5798,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring 
*tx_ring)
 
                /* Free all the Tx ring sk_buffs */
                if (ring_is_xdp(tx_ring))
-                       xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
+                       xdp_return_frame(tx_buffer->xdpf);
                else
                        dev_kfree_skb_any(tx_buffer->skb);
 
@@ -8348,16 +8349,21 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter 
*adapter,
        struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
        struct ixgbe_tx_buffer *tx_buffer;
        union ixgbe_adv_tx_desc *tx_desc;
+       struct xdp_frame *xdpf;
        u32 len, cmd_type;
        dma_addr_t dma;
        u16 i;
 
-       len = xdp->data_end - xdp->data;
+       xdpf = convert_to_xdp_frame(xdp);
+       if (unlikely(!xdpf))
+               return -EOVERFLOW;
+
+       len = xdpf->len;
 
        if (unlikely(!ixgbe_desc_unused(ring)))
                return IXGBE_XDP_CONSUMED;
 
-       dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+       dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
        if (dma_mapping_error(ring->dev, dma))
                return IXGBE_XDP_CONSUMED;
 
@@ -8372,8 +8378,7 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter 
*adapter,
 
        dma_unmap_len_set(tx_buffer, len, len);
        dma_unmap_addr_set(tx_buffer, dma, dma);
-       tx_buffer->data = xdp->data;
-       tx_buffer->xdp_mem = xdp->rxq->mem;
+       tx_buffer->xdpf = xdpf;
 
        tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f42436d7f2d9..7bbf0db27a01 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -890,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct 
mlx5_cqe64 *cqe,
 
        dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset,
                                      frag_size, DMA_FROM_DEVICE);
+       prefetchw(va); /* xdp_frame data area */
        prefetch(data);
        wi->offset += frag_size;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 283bde85c455..bec130cdbd9d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -663,7 +663,7 @@ void tun_ptr_free(void *ptr)
        if (tun_is_xdp_frame(ptr)) {
                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-               xdp_return_frame(xdpf->data, &xdpf->mem);
+               xdp_return_frame(xdpf);
        } else {
                __skb_array_destroy_skb(ptr);
        }
@@ -2196,7 +2196,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct 
tun_file *tfile,
                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
                ret = tun_put_user_xdp(tun, tfile, xdpf, to);
-               xdp_return_frame(xdpf->data, &xdpf->mem);
+               xdp_return_frame(xdpf);
        } else {
                struct sk_buff *skb = ptr;
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 42d338fe9a8d..ab3d7cbc4c49 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -430,7 +430,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 
        /* Free up any pending old buffers before queueing new ones. */
        while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-               xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
+               xdp_return_frame(xdpf_sent);
 
        xdpf = convert_to_xdp_frame(xdp);
        if (unlikely(!xdpf))
diff --git a/include/net/xdp.h b/include/net/xdp.h
index d0ee437753dc..137ad5f9f40f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
        return xdp_frame;
 }
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem);
+void xdp_return_frame(struct xdp_frame *xdpf);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                     struct net_device *dev, u32 queue_index);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index bcdc4dea5ce7..c95b04ec103e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 
        while ((xdpf = ptr_ring_consume(ring)))
                if (WARN_ON_ONCE(xdpf))
-                       xdp_return_frame(xdpf->data, &xdpf->mem);
+                       xdp_return_frame(xdpf);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data)
 
                        skb = cpu_map_build_skb(rcpu, xdpf);
                        if (!skb) {
-                               xdp_return_frame(xdpf->data, &xdpf->mem);
+                               xdp_return_frame(xdpf);
                                continue;
                        }
 
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
                err = __ptr_ring_produce(q, xdpf);
                if (err) {
                        drops++;
-                       xdp_return_frame(xdpf->data, &xdpf->mem);
+                       xdp_return_frame(xdpf);
                }
                processed++;
        }
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 33e382afbd95..0c86b53a3a63 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,9 +308,11 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info 
*xdp_rxq,
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+void xdp_return_frame(struct xdp_frame *xdpf)
 {
+       struct xdp_mem_info *mem = &xdpf->mem;
        struct xdp_mem_allocator *xa;
+       void *data = xdpf->data;
        struct page *page;
 
        switch (mem->type) {

Reply via email to