On Tue, Jan 17, 2017 at 02:22:59PM -0800, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
> 
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
> 
> At the moment this code must do a full reset to ensure old buffers
> without headroom on program add or with headroom on program removal
> are not used incorrectly in the datapath. Ideally we would only
> have to disable/enable the RX queues being updated but there is no
> API to do this at the moment in virtio so use the big hammer. In
> practice it is likely not that big of a problem as this will only
> happen when XDP is enabled/disabled changing programs does not
> require the reset. There is some risk that the driver may either
> have an allocation failure or for some reason fail to correctly
> negotiate with the underlying backend in this case the driver will
> be left uninitialized. I have not seen this ever happen on my test
> systems and for what its worth this same failure case can occur
> from probe and other contexts in virtio framework.
> 
> Signed-off-by: John Fastabend <john.r.fastab...@intel.com>

I've been thinking about it - can't we drop
old buffers without the head room which were posted before
xdp attached?

Avoiding the reset would be much nicer.

Thoughts?

> ---
>  drivers/net/virtio_net.c |  149 
> +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 125 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 62dbf4b..3b129b4 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -41,6 +41,9 @@
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN        128
>  
> +/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
> +#define VIRTIO_XDP_HEADROOM 256
> +
>  /* RX packet size EWMA. The average packet size is used to determine the 
> packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
>   * at once, the weight is chosen so that the EWMA will be insensitive to 
> short-
> @@ -359,6 +362,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>       }
>  
>       if (vi->mergeable_rx_bufs) {
> +             xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>               /* Zero header and leave csum up to XDP layers */
>               hdr = xdp->data;
>               memset(hdr, 0, vi->hdr_len);
> @@ -375,7 +379,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>               num_sg = 2;
>               sg_init_table(sq->sg, 2);
>               sg_set_buf(sq->sg, hdr, vi->hdr_len);
> -             skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
> +             skb_to_sgvec(skb, sq->sg + 1,
> +                          xdp->data - xdp->data_hard_start,
> +                          xdp->data_end - xdp->data);
>       }
>       err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>                                  data, GFP_ATOMIC);
> @@ -401,7 +407,6 @@ static struct sk_buff *receive_small(struct net_device 
> *dev,
>       struct bpf_prog *xdp_prog;
>  
>       len -= vi->hdr_len;
> -     skb_trim(skb, len);
>  
>       rcu_read_lock();
>       xdp_prog = rcu_dereference(rq->xdp_prog);
> @@ -413,11 +418,15 @@ static struct sk_buff *receive_small(struct net_device 
> *dev,
>               if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>                       goto err_xdp;
>  
> -             xdp.data = skb->data;
> +             xdp.data_hard_start = skb->data;
> +             xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
>               xdp.data_end = xdp.data + len;
>               act = bpf_prog_run_xdp(xdp_prog, &xdp);
>               switch (act) {
>               case XDP_PASS:
> +                     /* Recalculate length in case bpf program changed it */
> +                     __skb_pull(skb, xdp.data - xdp.data_hard_start);
> +                     len = xdp.data_end - xdp.data;
>                       break;
>               case XDP_TX:
>                       virtnet_xdp_xmit(vi, rq, &xdp, skb);
> @@ -432,6 +441,7 @@ static struct sk_buff *receive_small(struct net_device 
> *dev,
>       }
>       rcu_read_unlock();
>  
> +     skb_trim(skb, len);
>       return skb;
>  
>  err_xdp:
> @@ -480,7 +490,7 @@ static struct page *xdp_linearize_page(struct 
> receive_queue *rq,
>                                      unsigned int *len)
>  {
>       struct page *page = alloc_page(GFP_ATOMIC);
> -     unsigned int page_off = 0;
> +     unsigned int page_off = VIRTIO_XDP_HEADROOM;
>  
>       if (!page)
>               return NULL;
> @@ -516,7 +526,8 @@ static struct page *xdp_linearize_page(struct 
> receive_queue *rq,
>               put_page(p);
>       }
>  
> -     *len = page_off;
> +     /* Headroom does not contribute to packet length */
> +     *len = page_off - VIRTIO_XDP_HEADROOM;
>       return page;
>  err_buf:
>       __free_pages(page, 0);
> @@ -555,7 +566,7 @@ static struct sk_buff *receive_mergeable(struct 
> net_device *dev,
>                                                     page, offset, &len);
>                       if (!xdp_page)
>                               goto err_xdp;
> -                     offset = 0;
> +                     offset = VIRTIO_XDP_HEADROOM;
>               } else {
>                       xdp_page = page;
>               }
> @@ -568,18 +579,29 @@ static struct sk_buff *receive_mergeable(struct 
> net_device *dev,
>               if (unlikely(hdr->hdr.gso_type))
>                       goto err_xdp;
>  
> +             /* Allow consuming headroom but reserve enough space to push
> +              * the descriptor on if we get an XDP_TX return code.
> +              */
>               data = page_address(xdp_page) + offset;
> +             xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
>               xdp.data = data + vi->hdr_len;
>               xdp.data_end = xdp.data + (len - vi->hdr_len);
>               act = bpf_prog_run_xdp(xdp_prog, &xdp);
>               switch (act) {
>               case XDP_PASS:
> +                     /* recalculate offset to account for any header
> +                      * adjustments. Note other cases do not build an
> +                      * skb and avoid using offset
> +                      */
> +                     offset = xdp.data -
> +                                     page_address(xdp_page) - vi->hdr_len;
> +
>                       /* We can only create skb based on xdp_page. */
>                       if (unlikely(xdp_page != page)) {
>                               rcu_read_unlock();
>                               put_page(page);
>                               head_skb = page_to_skb(vi, rq, xdp_page,
> -                                                    0, len, PAGE_SIZE);
> +                                                    offset, len, PAGE_SIZE);
>                               ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
>                               return head_skb;
>                       }
> @@ -744,23 +766,30 @@ static void receive_buf(struct virtnet_info *vi, struct 
> receive_queue *rq,
>       dev_kfree_skb(skb);
>  }
>  
> +static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> +{
> +     return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
> +}
> +
>  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue 
> *rq,
>                            gfp_t gfp)
>  {
> +     int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
> +     unsigned int xdp_headroom = virtnet_get_headroom(vi);
>       struct sk_buff *skb;
>       struct virtio_net_hdr_mrg_rxbuf *hdr;
>       int err;
>  
> -     skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
> +     skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
>       if (unlikely(!skb))
>               return -ENOMEM;
>  
> -     skb_put(skb, GOOD_PACKET_LEN);
> +     skb_put(skb, headroom);
>  
>       hdr = skb_vnet_hdr(skb);
>       sg_init_table(rq->sg, 2);
>       sg_set_buf(rq->sg, hdr, vi->hdr_len);
> -     skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
> +     skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
>  
>       err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
>       if (err < 0)
> @@ -828,24 +857,27 @@ static unsigned int get_mergeable_buf_len(struct 
> ewma_pkt_len *avg_pkt_len)
>       return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
>  }
>  
> -static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
> +static int add_recvbuf_mergeable(struct virtnet_info *vi,
> +                              struct receive_queue *rq, gfp_t gfp)
>  {
>       struct page_frag *alloc_frag = &rq->alloc_frag;
> +     unsigned int headroom = virtnet_get_headroom(vi);
>       char *buf;
>       unsigned long ctx;
>       int err;
>       unsigned int len, hole;
>  
>       len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
> -     if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
> +     if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
>               return -ENOMEM;
>  
>       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +     buf += headroom; /* advance address leaving hole at front of pkt */
>       ctx = mergeable_buf_to_ctx(buf, len);
>       get_page(alloc_frag->page);
> -     alloc_frag->offset += len;
> +     alloc_frag->offset += len + headroom;
>       hole = alloc_frag->size - alloc_frag->offset;
> -     if (hole < len) {
> +     if (hole < len + headroom) {
>               /* To avoid internal fragmentation, if there is very likely not
>                * enough space for another buffer, add the remaining space to
>                * the current buffer. This extra space is not included in
> @@ -879,7 +911,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct 
> receive_queue *rq,
>       gfp |= __GFP_COLD;
>       do {
>               if (vi->mergeable_rx_bufs)
> -                     err = add_recvbuf_mergeable(rq, gfp);
> +                     err = add_recvbuf_mergeable(vi, rq, gfp);
>               else if (vi->big_packets)
>                       err = add_recvbuf_big(vi, rq, gfp);
>               else
> @@ -1702,6 +1734,7 @@ static void virtnet_freeze_down(struct virtio_device 
> *vdev)
>  }
>  
>  static int init_vqs(struct virtnet_info *vi);
> +static void _remove_vq_common(struct virtnet_info *vi);
>  
>  static int virtnet_restore_up(struct virtio_device *vdev)
>  {
> @@ -1727,12 +1760,45 @@ static int virtnet_restore_up(struct virtio_device 
> *vdev)
>       return err;
>  }
>  
> +static int virtnet_reset(struct virtnet_info *vi)
> +{
> +     struct virtio_device *dev = vi->vdev;
> +     int ret;
> +
> +     virtio_config_disable(dev);
> +     dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
> +     virtnet_freeze_down(dev);
> +     _remove_vq_common(vi);
> +
> +     dev->config->reset(dev);
> +     virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +     virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
> +
> +     ret = virtio_finalize_features(dev);
> +     if (ret)
> +             goto err;
> +
> +     ret = virtnet_restore_up(dev);
> +     if (ret)
> +             goto err;
> +     ret = _virtnet_set_queues(vi, vi->curr_queue_pairs);
> +     if (ret)
> +             goto err;
> +
> +     virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
> +     virtio_config_enable(dev);
> +     return 0;
> +err:
> +     virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
> +     return ret;
> +}
> +
>  static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
>  {
>       unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>       struct virtnet_info *vi = netdev_priv(dev);
>       struct bpf_prog *old_prog;
> -     u16 xdp_qp = 0, curr_qp;
> +     u16 oxdp_qp, xdp_qp = 0, curr_qp;
>       int i, err;
>  
>       if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -1764,21 +1830,32 @@ static int virtnet_xdp_set(struct net_device *dev, 
> struct bpf_prog *prog)
>               return -ENOMEM;
>       }
>  
> +     if (prog) {
> +             prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> +             if (IS_ERR(prog))
> +                     return PTR_ERR(prog);
> +     }
> +
>       err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
>       if (err) {
>               dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
> -             return err;
> +             goto virtio_queue_err;
>       }
>  
> -     if (prog) {
> -             prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> -             if (IS_ERR(prog)) {
> -                     _virtnet_set_queues(vi, curr_qp);
> -                     return PTR_ERR(prog);
> -             }
> +     oxdp_qp = vi->xdp_queue_pairs;
> +
> +     /* Changing the headroom in buffers is a disruptive operation because
> +      * existing buffers must be flushed and reallocated. This will happen
> +      * when a xdp program is initially added or xdp is disabled by removing
> +      * the xdp program resulting in number of XDP queues changing.
> +      */
> +     if (vi->xdp_queue_pairs != xdp_qp) {
> +             vi->xdp_queue_pairs = xdp_qp;
> +             err = virtnet_reset(vi);
> +             if (err)
> +                     goto virtio_reset_err;
>       }
>  
> -     vi->xdp_queue_pairs = xdp_qp;
>       netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
>  
>       for (i = 0; i < vi->max_queue_pairs; i++) {
> @@ -1789,6 +1866,21 @@ static int virtnet_xdp_set(struct net_device *dev, 
> struct bpf_prog *prog)
>       }
>  
>       return 0;
> +
> +virtio_reset_err:
> +     /* On reset error do our best to unwind XDP changes inflight and return
> +      * error up to user space for resolution. The underlying reset hung on
> +      * us so not much we can do here.
> +      */
> +     dev_warn(&dev->dev, "XDP reset failure and queues unstable\n");
> +     vi->xdp_queue_pairs = oxdp_qp;
> +virtio_queue_err:
> +     /* On queue set error we can unwind bpf ref count and user space can
> +      * retry this is most likely an allocation failure.
> +      */
> +     if (prog)
> +             bpf_prog_sub(prog, vi->max_queue_pairs - 1);
> +     return err;
>  }
>  
>  static bool virtnet_xdp_query(struct net_device *dev)
> @@ -2382,6 +2474,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>       return err;
>  }
>  
> +static void _remove_vq_common(struct virtnet_info *vi)
> +{
> +     vi->vdev->config->reset(vi->vdev);
> +     free_unused_bufs(vi);
> +     _free_receive_bufs(vi);
> +     free_receive_page_frags(vi);
> +     virtnet_del_vqs(vi);
> +}
> +
>  static void remove_vq_common(struct virtnet_info *vi)
>  {
>       vi->vdev->config->reset(vi->vdev);

Reply via email to