On 2017年02月03日 11:16, John Fastabend wrote:
Add support for XDP adjust head by allocating a 256B header region
that XDP programs can grow into. This is only enabled when an XDP
program is loaded.

In order to ensure that we do not have to unwind queue headroom, push
queue setup below bpf_prog_add. It reads better to do a prog ref
unwind vs another queue setup call.

At the moment this code must do a full reset to ensure old buffers
without headroom on program add or with headroom on program removal
are not used incorrectly in the datapath. Ideally we would only
have to disable/enable the RX queues being updated, but there is no
API to do this at the moment in virtio, so use the big hammer. In
practice it is likely not that big of a problem, as this will only
happen when XDP is enabled/disabled; changing programs does not
require the reset. There is some risk that the driver may either
have an allocation failure or for some reason fail to correctly
negotiate with the underlying backend; in this case the driver will
be left uninitialized. I have not seen this ever happen on my test
systems and, for what it's worth, this same failure case can occur
from probe and other contexts in the virtio framework.

Signed-off-by: John Fastabend <john.r.fastab...@intel.com>
---
  drivers/net/virtio_net.c |  154 +++++++++++++++++++++++++++++++++++++---------
  1 file changed, 125 insertions(+), 29 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 07f9076..52a18b8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -42,6 +42,9 @@
  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  #define GOOD_COPY_LEN 128
+/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
+#define VIRTIO_XDP_HEADROOM 256
+
  /* RX packet size EWMA. The average packet size is used to determine the 
packet
   * buffer size when refilling RX rings. As the entire RX ring may be refilled
   * at once, the weight is chosen so that the EWMA will be insensitive to 
short-
@@ -368,6 +371,7 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi,
        }
if (vi->mergeable_rx_bufs) {
+               xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
                /* Zero header and leave csum up to XDP layers */
                hdr = xdp->data;
                memset(hdr, 0, vi->hdr_len);
@@ -384,7 +388,9 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi,
                num_sg = 2;
                sg_init_table(sq->sg, 2);
                sg_set_buf(sq->sg, hdr, vi->hdr_len);
-               skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+               skb_to_sgvec(skb, sq->sg + 1,
+                            xdp->data - xdp->data_hard_start,
+                            xdp->data_end - xdp->data);
        }
        err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
                                   data, GFP_ATOMIC);
@@ -412,7 +418,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
        struct bpf_prog *xdp_prog;
len -= vi->hdr_len;
-       skb_trim(skb, len);
rcu_read_lock();
        xdp_prog = rcu_dereference(rq->xdp_prog);
@@ -424,12 +429,16 @@ static struct sk_buff *receive_small(struct net_device 
*dev,
                if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
                        goto err_xdp;
- xdp.data = skb->data;
+               xdp.data_hard_start = skb->data;
+               xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
                xdp.data_end = xdp.data + len;
                act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
                case XDP_PASS:
+                       /* Recalculate length in case bpf program changed it */
+                       __skb_pull(skb, xdp.data - xdp.data_hard_start);

But skb->len is trimmed to len below, which seems wrong.

+                       len = xdp.data_end - xdp.data;
                        break;
                case XDP_TX:
                        if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp, skb)))
@@ -446,6 +455,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
        }
        rcu_read_unlock();
+ skb_trim(skb, len);
        return skb;
err_xdp:
@@ -494,7 +504,7 @@ static struct page *xdp_linearize_page(struct receive_queue 
*rq,
                                       unsigned int *len)
  {
        struct page *page = alloc_page(GFP_ATOMIC);
-       unsigned int page_off = 0;
+       unsigned int page_off = VIRTIO_XDP_HEADROOM;
if (!page)
                return NULL;
@@ -530,7 +540,8 @@ static struct page *xdp_linearize_page(struct receive_queue 
*rq,
                put_page(p);
        }
- *len = page_off;
+       /* Headroom does not contribute to packet length */
+       *len = page_off - VIRTIO_XDP_HEADROOM;
        return page;
  err_buf:
        __free_pages(page, 0);
@@ -569,7 +580,7 @@ static struct sk_buff *receive_mergeable(struct net_device 
*dev,
                                                      page, offset, &len);
                        if (!xdp_page)
                                goto err_xdp;
-                       offset = 0;
+                       offset = VIRTIO_XDP_HEADROOM;
                } else {
                        xdp_page = page;
                }
@@ -582,19 +593,30 @@ static struct sk_buff *receive_mergeable(struct 
net_device *dev,
                if (unlikely(hdr->hdr.gso_type))
                        goto err_xdp;
+ /* Allow consuming headroom but reserve enough space to push
+                * the descriptor on if we get an XDP_TX return code.
+                */
                data = page_address(xdp_page) + offset;
+               xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;

Should be data - VIRTIO_XDP_HEADROOM I think?

                xdp.data = data + vi->hdr_len;
                xdp.data_end = xdp.data + (len - vi->hdr_len);
                act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
                case XDP_PASS:
+                       /* recalculate offset to account for any header
+                        * adjustments. Note other cases do not build an
+                        * skb and avoid using offset
+                        */
+                       offset = xdp.data -
+                                       page_address(xdp_page) - vi->hdr_len;
+
                        /* We can only create skb based on xdp_page. */
                        if (unlikely(xdp_page != page)) {
                                rcu_read_unlock();
                                put_page(page);
                                head_skb = page_to_skb(vi, rq, xdp_page,
-                                                      0, len, PAGE_SIZE);
+                                                      offset, len, PAGE_SIZE);
                                ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
                                return head_skb;
                        }
@@ -761,23 +783,30 @@ static void receive_buf(struct virtnet_info *vi, struct 
receive_queue *rq,
        dev_kfree_skb(skb);
  }
+static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
+{
+       return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
+}
+
  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue 
*rq,
                             gfp_t gfp)
  {
+       int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
+       unsigned int xdp_headroom = virtnet_get_headroom(vi);
        struct sk_buff *skb;
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        int err;
- skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
+       skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
        if (unlikely(!skb))
                return -ENOMEM;
- skb_put(skb, GOOD_PACKET_LEN);
+       skb_put(skb, headroom);
hdr = skb_vnet_hdr(skb);
        sg_init_table(rq->sg, 2);
        sg_set_buf(rq->sg, hdr, vi->hdr_len);
-       skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
+       skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
        if (err < 0)
@@ -845,24 +874,27 @@ static unsigned int get_mergeable_buf_len(struct 
ewma_pkt_len *avg_pkt_len)
        return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
  }
-static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi,
+                                struct receive_queue *rq, gfp_t gfp)
  {
        struct page_frag *alloc_frag = &rq->alloc_frag;
+       unsigned int headroom = virtnet_get_headroom(vi);
        char *buf;
        unsigned long ctx;
        int err;
        unsigned int len, hole;
len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
-       if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
+       if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
                return -ENOMEM;
buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+       buf += headroom; /* advance address leaving hole at front of pkt */

Note: the headroom will reduce the chance of frag coalescing, which may hurt performance to some degree.

[...]

Reply via email to