Re: [PATCH net-next V4 10/10] vhost_net: try batch dequing from skb array

2017-05-10 Thread Jason Wang



On 2017/05/10 20:34, Michael S. Tsirkin wrote:

On Wed, May 10, 2017 at 11:36:22AM +0800, Jason Wang wrote:

We used to dequeue one skb during recvmsg() from skb_array, which could
be inefficient because of the poor cache utilization and the spinlock
taken for each packet. This patch tries to batch them by calling the
batch dequeuing helpers explicitly on the exported skb array and passing
the skb back through msg_control for the underlying socket to finish
the userspace copying.

Batch dequeuing is also a requirement for further batching improvements
on rx.
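
As an illustration of the idea (a minimal standalone C sketch, not the
patch code: ring_consume_batched() below is a made-up stand-in for
skb_array_consume_batched()), the point is to pay the shared-ring lock
once per batch and serve the per-packet peek/consume calls from a
private head/tail buffer that is refilled only when it runs empty:

/*
 * Illustration only -- a user-space model of the head/tail batch buffer,
 * not the vhost code.  ring_consume_batched() pretends a few packets are
 * pending in the shared ring.
 */
#include <stdio.h>

#define BATCH 64

struct batch_buf {
	void *queue[BATCH];
	int head;	/* next entry to hand out */
	int tail;	/* one past the last valid entry */
};

static int pending = 5;	/* pretend 5 packets sit in the shared ring */

/* Stand-in for the single locked, batched dequeue on the shared ring. */
static int ring_consume_batched(void **dst, int n)
{
	int i;

	for (i = 0; i < n && pending > 0; i++, pending--)
		dst[i] = &pending;	/* dummy packet pointer */
	return i;
}

static int batch_buf_empty(const struct batch_buf *b)
{
	return b->head == b->tail;
}

/* Refill the private buffer; returns how many packets were fetched. */
static int batch_buf_produce(struct batch_buf *b)
{
	b->head = 0;
	b->tail = ring_consume_batched(b->queue, BATCH);
	return b->tail;
}

/* Peek at the next packet, touching the shared ring only when empty. */
static void *batch_buf_peek(struct batch_buf *b)
{
	if (batch_buf_empty(b) && !batch_buf_produce(b))
		return NULL;
	return b->queue[b->head];
}

static void *batch_buf_consume(struct batch_buf *b)
{
	return batch_buf_empty(b) ? NULL : b->queue[b->head++];
}

int main(void)
{
	struct batch_buf buf = { .head = 0, .tail = 0 };

	while (batch_buf_peek(&buf))
		printf("consumed packet %p\n", batch_buf_consume(&buf));
	return 0;
}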

Tests were done by pktgen on tap with XDP1 in guest on top of batch
zeroing:

rx batch | pps
---------+-------------------
     256 | 2.41Mpps (+6.16%)
     128 | 2.48Mpps (+8.80%)
      64 | 2.38Mpps (+3.96%) <- Default
      16 | 2.31Mpps (+1.76%)
       4 | 2.31Mpps (+1.76%)
       1 | 2.30Mpps (+1.32%)
       0 | 2.27Mpps (+7.48%)

Signed-off-by: Jason Wang
---
  drivers/vhost/net.c | 117 +---
  1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b51989..fbaecf3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  
  #include 
  
@@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
  };
  
+#define VHOST_RX_BATCH 64
+struct vhost_net_buf {
+   struct sk_buff *queue[VHOST_RX_BATCH];
+   int tail;
+   int head;
+};
+

Do you strictly need to put this inline? This structure is quite big
already. Do you see a measurable difference if you make it

struct sk_buff **queue;
int tail;
int head;

?


I don't.



It will also make it easier to play with the size in the future,
should someone want to see how it works, e.g. for different
ring sizes.



Ok, will do this in the next version

Thanks
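
For reference, a rough sketch of what that change could look like (an
assumption about a future version, not the posted V4 code; the helper
names and the open/release hooks are hypothetical, and the calls assume
linux/slab.h and linux/skbuff.h):

/* Sketch only -- hypothetical follow-up, not the posted V4 patch. */
struct vhost_net_buf {
	struct sk_buff **queue;	/* allocated at open time instead of inline */
	int tail;
	int head;
};

/* Would be called from vhost_net_open(): size the per-vq buffer once. */
static int vhost_net_buf_alloc(struct vhost_net_buf *rxq)
{
	rxq->queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
				   GFP_KERNEL);
	if (!rxq->queue)
		return -ENOMEM;
	rxq->head = rxq->tail = 0;
	return 0;
}

/* Would be called from vhost_net_release(): drop leftovers, free the array. */
static void vhost_net_buf_free(struct vhost_net_buf *rxq)
{
	while (rxq->head != rxq->tail)
		kfree_skb(rxq->queue[rxq->head++]);
	kfree(rxq->queue);
	rxq->queue = NULL;
}

In the driver itself any leftover skbs would more likely be pushed back
with skb_array_unconsume() (as vhost_net_buf_unproduce() already does)
rather than freed; the sketch only shows the allocation lifetime, which
also makes VHOST_RX_BATCH easy to turn into a tunable later.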


Re: [PATCH net-next V4 10/10] vhost_net: try batch dequing from skb array

2017-05-10 Thread Michael S. Tsirkin
On Wed, May 10, 2017 at 11:36:22AM +0800, Jason Wang wrote:
> We used to dequeue one skb during recvmsg() from skb_array, which could
> be inefficient because of the poor cache utilization and the spinlock
> taken for each packet. This patch tries to batch them by calling the
> batch dequeuing helpers explicitly on the exported skb array and passing
> the skb back through msg_control for the underlying socket to finish
> the userspace copying.
> 
> Batch dequeuing is also a requirement for further batching improvements
> on rx.
> 
> Tests were done by pktgen on tap with XDP1 in guest on top of batch
> zeroing:
> 
> rx batch | pps
> ---------+-------------------
>      256 | 2.41Mpps (+6.16%)
>      128 | 2.48Mpps (+8.80%)
>       64 | 2.38Mpps (+3.96%) <- Default
>       16 | 2.31Mpps (+1.76%)
>        4 | 2.31Mpps (+1.76%)
>        1 | 2.30Mpps (+1.32%)
>        0 | 2.27Mpps (+7.48%)
> 
> Signed-off-by: Jason Wang 
> ---
>  drivers/vhost/net.c | 117 +---
>  1 file changed, 111 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9b51989..fbaecf3 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -28,6 +28,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #include 
>  
> @@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
>   struct vhost_virtqueue *vq;
>  };
>  
> +#define VHOST_RX_BATCH 64
> +struct vhost_net_buf {
> + struct sk_buff *queue[VHOST_RX_BATCH];
> + int tail;
> + int head;
> +};
> +

Do you strictly need to put this inline? This structure is quite big
already. Do you see a measurable difference if you make it

struct sk_buff **queue;
int tail;
int head;

?

It will also make it easier to play with the size in the future,
should someone want to see how it works, e.g. for different
ring sizes.

>  struct vhost_net_virtqueue {
>   struct vhost_virtqueue vq;
>   size_t vhost_hlen;
> @@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
>   /* Reference counting for outstanding ubufs.
>* Protected by vq mutex. Writers must also take device mutex. */
>   struct vhost_net_ubuf_ref *ubufs;
> + struct skb_array *rx_array;
> + struct vhost_net_buf rxq;
>  };
>  
>  struct vhost_net {
> @@ -117,6 +128,71 @@ struct vhost_net {
>  
>  static unsigned vhost_net_zcopy_mask __read_mostly;
>  
> +static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
> +{
> + if (rxq->tail != rxq->head)
> + return rxq->queue[rxq->head];
> + else
> + return NULL;
> +}
> +
> +static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
> +{
> + return rxq->tail - rxq->head;
> +}
> +
> +static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
> +{
> + return rxq->tail == rxq->head;
> +}
> +
> +static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
> +{
> + void *ret = vhost_net_buf_get_ptr(rxq);
> + ++rxq->head;
> + return ret;
> +}
> +
> +static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> +{
> + struct vhost_net_buf *rxq = &nvq->rxq;
> +
> + rxq->head = 0;
> + rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
> +   VHOST_RX_BATCH);
> + return rxq->tail;
> +}
> +
> +static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
> +{
> + struct vhost_net_buf *rxq = &nvq->rxq;
> +
> + if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
> + skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
> + vhost_net_buf_get_size(rxq));
> + rxq->head = rxq->tail = 0;
> + }
> +}
> +
> +static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
> +{
> + struct vhost_net_buf *rxq = &nvq->rxq;
> +
> + if (!vhost_net_buf_is_empty(rxq))
> + goto out;
> +
> + if (!vhost_net_buf_produce(nvq))
> + return 0;
> +
> +out:
> + return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
> +}
> +
> +static void vhost_net_buf_init(struct vhost_net_buf *rxq)
> +{
> + rxq->head = rxq->tail = 0;
> +}
> +
>  static void vhost_net_enable_zcopy(int vq)
>  {
>   vhost_net_zcopy_mask |= 0x1 << vq;
> @@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
>   n->vqs[i].ubufs = NULL;
>   n->vqs[i].vhost_hlen = 0;
>   n->vqs[i].sock_hlen = 0;
> + vhost_net_buf_init(&n->vqs[i].rxq);
>   }
>  
>  }
> @@ -503,15 +580,14 @@ static void handle_tx(struct vhost_net *net)
>   mutex_unlock(&vq->mutex);
>  }
>  
> -static int peek_head_len(struct sock *sk)
> +static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>  {
> - struct socket *sock = sk->sk_socket;
>   struct sk_buff *head;
>   int len = 0;
>   unsigned long flags;
>  
> - if (sock->ops->peek_len)
> - return sock->ops->peek_len(sock);
> + if 

[PATCH net-next V4 10/10] vhost_net: try batch dequing from skb array

2017-05-09 Thread Jason Wang
We used to dequeue one skb during recvmsg() from skb_array, which could
be inefficient because of the poor cache utilization and the spinlock
taken for each packet. This patch tries to batch them by calling the
batch dequeuing helpers explicitly on the exported skb array and passing
the skb back through msg_control for the underlying socket to finish
the userspace copying.
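
The msg_control handoff mentioned above, as a rough sketch (rx_one() is
a made-up helper for illustration; in the driver this logic would sit
inline in handle_rx(), and it assumes the rest of the series makes the
tap/tun recvmsg() honor an skb passed in via msg_control):

/*
 * Sketch only: hand the skb already dequeued from the batch buffer to
 * the socket, so tap/tun copy that exact skb to userspace instead of
 * dequeuing again themselves.
 */
static int rx_one(struct vhost_net_virtqueue *nvq, struct socket *sock,
		  struct msghdr *msg, size_t sock_len)
{
	if (nvq->rx_array)
		msg->msg_control = vhost_net_buf_consume(&nvq->rxq);

	return sock->ops->recvmsg(sock, msg, sock_len,
				  MSG_DONTWAIT | MSG_TRUNC);
}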

Batch dequeuing is also a requirement for further batching improvements
on rx.

Tests were done by pktgen on tap with XDP1 in guest on top of batch
zeroing:

rx batch | pps
---------+-------------------
     256 | 2.41Mpps (+6.16%)
     128 | 2.48Mpps (+8.80%)
      64 | 2.38Mpps (+3.96%) <- Default
      16 | 2.31Mpps (+1.76%)
       4 | 2.31Mpps (+1.76%)
       1 | 2.30Mpps (+1.32%)
       0 | 2.27Mpps (+7.48%)

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c | 117 +---
 1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b51989..fbaecf3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -28,6 +28,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -85,6 +87,13 @@ struct vhost_net_ubuf_ref {
struct vhost_virtqueue *vq;
 };
 
+#define VHOST_RX_BATCH 64
+struct vhost_net_buf {
+   struct sk_buff *queue[VHOST_RX_BATCH];
+   int tail;
+   int head;
+};
+
 struct vhost_net_virtqueue {
struct vhost_virtqueue vq;
size_t vhost_hlen;
@@ -99,6 +108,8 @@ struct vhost_net_virtqueue {
/* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
struct vhost_net_ubuf_ref *ubufs;
+   struct skb_array *rx_array;
+   struct vhost_net_buf rxq;
 };
 
 struct vhost_net {
@@ -117,6 +128,71 @@ struct vhost_net {
 
 static unsigned vhost_net_zcopy_mask __read_mostly;
 
+static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
+{
+   if (rxq->tail != rxq->head)
+   return rxq->queue[rxq->head];
+   else
+   return NULL;
+}
+
+static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
+{
+   return rxq->tail - rxq->head;
+}
+
+static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
+{
+   return rxq->tail == rxq->head;
+}
+
+static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
+{
+   void *ret = vhost_net_buf_get_ptr(rxq);
+   ++rxq->head;
+   return ret;
+}
+
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   rxq->head = 0;
+   rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+ VHOST_RX_BATCH);
+   return rxq->tail;
+}
+
+static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
+   skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
+   vhost_net_buf_get_size(rxq));
+   rxq->head = rxq->tail = 0;
+   }
+}
+
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+{
+   struct vhost_net_buf *rxq = &nvq->rxq;
+
+   if (!vhost_net_buf_is_empty(rxq))
+   goto out;
+
+   if (!vhost_net_buf_produce(nvq))
+   return 0;
+
+out:
+   return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+}
+
+static void vhost_net_buf_init(struct vhost_net_buf *rxq)
+{
+   rxq->head = rxq->tail = 0;
+}
+
 static void vhost_net_enable_zcopy(int vq)
 {
vhost_net_zcopy_mask |= 0x1 << vq;
@@ -201,6 +277,7 @@ static void vhost_net_vq_reset(struct vhost_net *n)
n->vqs[i].ubufs = NULL;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
+   vhost_net_buf_init(&n->vqs[i].rxq);
}
 
 }
@@ -503,15 +580,14 @@ static void handle_tx(struct vhost_net *net)
mutex_unlock(&vq->mutex);
 }
 
-static int peek_head_len(struct sock *sk)
+static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 {
-   struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;
 
-   if (sock->ops->peek_len)
-   return sock->ops->peek_len(sock);
+   if (rvq->rx_array)
+   return vhost_net_buf_peek(rvq);
 
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
@@ -537,10 +613,11 @@ static int sk_has_rx_data(struct sock *sk)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
+   struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned long uninitialized_var(endtime);
-   int len = peek_head_len(sk);
+   int len = peek_head_len(rvq, sk);