[PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header

2013-11-12 Thread Michael Dalton
Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page
frag allocators) changed the mergeable receive buffer size from PAGE_SIZE
to MTU-size. However, the merge buffer size does not take into account the
size of the virtio-net header. Consequently, packets that are MTU-size
will take two buffers instead of one (to store the virtio-net header),
substantially decreasing the throughput of MTU-size traffic due to TCP
window / SKB truesize effects.

This commit changes the mergeable buffer size to include the virtio-net
header. The buffer size is cacheline-aligned because skb_page_frag_refill
will not automatically align the requested size.
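
For reference, with the usual values (ETH_HLEN = 14, VLAN_HLEN = 4,
ETH_DATA_LEN = 1500), a 12-byte struct virtio_net_hdr_mrg_rxbuf, and
assuming 64-byte cachelines, the sizes work out roughly as:

    GOOD_PACKET_LEN  = 14 + 4 + 1500 = 1518
    MERGE_BUFFER_LEN = ALIGN(1518 + 12, 64) = ALIGN(1530, 64) = 1536

so an MTU-sized packet plus its header now fits in a single 1536-byte
buffer, whereas the old 1518-byte buffer forced a spill into a second one.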

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
cpuset, using cgroups to ensure that other processes in the system will not
be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
force MTU-sized packets on the receiver.

net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s

Suggested-by: Eric Northup digitale...@google.com
Signed-off-by: Michael Dalton mwdal...@google.com
---
 drivers/net/virtio_net.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 01f4eb5..69fb225 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
-#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
+#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
+#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
+sizeof(struct virtio_net_hdr_mrg_rxbuf), \
+L1_CACHE_BYTES))
 #define GOOD_COPY_LEN  128
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
@@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
head_skb->dev->stats.rx_length_errors++;
return -EINVAL;
}
-   if (unlikely(len > MAX_PACKET_LEN)) {
+   if (unlikely(len > MERGE_BUFFER_LEN)) {
pr_debug("%s: rx error: merge buffer too long\n",
 head_skb->dev->name);
-   len = MAX_PACKET_LEN;
+   len = MERGE_BUFFER_LEN;
}
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
@@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
if (curr_skb != head_skb) {
head_skb->data_len += len;
head_skb->len += len;
-   head_skb->truesize += MAX_PACKET_LEN;
+   head_skb->truesize += MERGE_BUFFER_LEN;
}
page = virt_to_head_page(buf);
offset = buf - (char *)page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);
skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-len, MAX_PACKET_LEN);
+len, MERGE_BUFFER_LEN);
} else {
skb_add_rx_frag(curr_skb, num_skb_frags, page,
-   offset, len,
-   MAX_PACKET_LEN);
+   offset, len, MERGE_BUFFER_LEN);
}
--rq->num;
}
@@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
struct page *page = virt_to_head_page(buf);
skb = page_to_skb(rq, page,
  (char *)buf - (char *)page_address(page),
- len, MAX_PACKET_LEN);
+ len, MERGE_BUFFER_LEN);
if (unlikely(!skb)) {
dev->stats.rx_dropped++;
put_page(page);
@@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
struct skb_vnet_hdr *hdr;
int err;
 
-   skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
+   skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
if (unlikely(!skb))

[PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs

2013-11-12 Thread Michael Dalton
The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
mergeable rx buffer allocations. This commit migrates virtio-net to use
per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
mergeable rx buffer memory allocation, which now will use skb_page_frag_refill()
for both atomic and GFP-WAIT buffer allocations.

To address fragmentation concerns, if after buffer allocation there
is too little space left in the page frag to allocate a subsequent
buffer, the remaining space is added to the current allocated buffer
so that the remaining space can be used to store packet data.
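
The hunk that implements this is truncated below, so here is a rough
sketch of the resulting allocation path, reconstructed from the
description above and from the later hunks quoted in this thread (field
names such as rq->atomic_frag and vi->sleep_frag follow this series;
treat the details as approximate, not the literal hunk):

static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct page_frag *alloc_frag;
	char *buf;
	int err, len, hole;

	/* Atomic refills come from the per-receive-queue frag, sleepable
	 * (__GFP_WAIT) refills from the per-device frag. */
	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	len = MERGE_BUFFER_LEN;
	alloc_frag->offset += len;

	/* If what is left in the page cannot hold another buffer, fold it
	 * into this buffer so it can still carry packet data. */
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < MERGE_BUFFER_LEN) {
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));
	return err;
}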

Signed-off-by: Michael Dalton mwdal...@google.com
---
 drivers/net/virtio_net.c | 70 +++-
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 69fb225..0c93054 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -79,6 +79,9 @@ struct receive_queue {
/* Chain pages by the private ptr. */
struct page *pages;
 
+   /* Page frag for GFP_ATOMIC packet buffer allocation. */
+   struct page_frag atomic_frag;
+
/* RX: fragments + linear part + virtio header */
struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -128,9 +131,9 @@ struct virtnet_info {
struct mutex config_lock;
 
/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-* low on memory.
+* low on memory. May sleep.
 */
-   struct page_frag alloc_frag;
+   struct page_frag sleep_frag;
 
/* Does the affinity hint is set for virtqueues? */
bool affinity_hint_set;
@@ -305,7 +308,7 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
struct sk_buff *curr_skb = head_skb;
char *buf;
struct page *page;
-   int num_buf, len, offset;
+   int num_buf, len, offset, truesize;
 
num_buf = hdr->mhdr.num_buffers;
while (--num_buf) {
@@ -317,11 +320,7 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
head_skb->dev->stats.rx_length_errors++;
return -EINVAL;
}
-   if (unlikely(len > MERGE_BUFFER_LEN)) {
-   pr_debug("%s: rx error: merge buffer too long\n",
-head_skb->dev->name);
-   len = MERGE_BUFFER_LEN;
-   }
+   truesize = max_t(int, len, MERGE_BUFFER_LEN);
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
if (unlikely(!nskb)) {
@@ -339,17 +338,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
if (curr_skb != head_skb) {
head_skb->data_len += len;
head_skb->len += len;
-   head_skb->truesize += MERGE_BUFFER_LEN;
+   head_skb->truesize += truesize;
}
page = virt_to_head_page(buf);
offset = buf - (char *)page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);
skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-len, MERGE_BUFFER_LEN);
+len, truesize);
} else {
skb_add_rx_frag(curr_skb, num_skb_frags, page,
-   offset, len, MERGE_BUFFER_LEN);
+   offset, len, truesize);
}
--rq->num;
}
@@ -383,9 +382,10 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
skb_trim(skb, len);
} else if (vi->mergeable_rx_bufs) {
struct page *page = virt_to_head_page(buf);
+   int truesize = max_t(int, len, MERGE_BUFFER_LEN);
skb = page_to_skb(rq, page,
  (char *)buf - (char *)page_address(page),
- len, MERGE_BUFFER_LEN);
+ len, truesize);
if (unlikely(!skb)) {
dev->stats.rx_dropped++;
put_page(page);
@@ -540,24 +540,24 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
struct virtnet_info *vi = rq->vq->vdev->priv;
-   char *buf = NULL;
-   int err;
+   struct page_frag *alloc_frag;
+   char *buf;
+   int err, len, hole;
 
-   if (gfp & __GFP_WAIT) {
-   if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-gfp)) {
- 

[PATCH net-next 2/4] net: allow 0 order atomic page alloc in skb_page_frag_refill

2013-11-12 Thread Michael Dalton
skb_page_frag_refill currently permits only order-0 page allocs
unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
higher-order page allocations whether or not GFP_WAIT is used. If
memory cannot be allocated, the allocator will fall back to
successively smaller page allocs (down to order-0 page allocs).

This change brings skb_page_frag_refill in line with the existing
page allocation strategy employed by netdev_alloc_frag, which attempts
higher-order page allocations whether or not GFP_WAIT is set, falling
back to successively lower-order page allocations on failure. Part
of migration of virtio-net to per-receive queue page frag allocators.
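
For context, after this change the allocation loop in
skb_page_frag_refill() looks roughly like the following for all callers
(a sketch, assuming the existing best-effort treatment of high-order
attempts is unchanged):

	order = SKB_FRAG_PAGE_ORDER;
	do {
		gfp_t gfp = prio;

		/* High-order attempts are best-effort: no retry, no warning,
		 * fall back to the next smaller order on failure. */
		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
		pfrag->page = alloc_pages(gfp, order);
		if (likely(pfrag->page)) {
			pfrag->offset = 0;
			pfrag->size = PAGE_SIZE << order;
			return true;
		}
	} while (--order >= 0);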

Signed-off-by: Michael Dalton mwdal...@google.com
---
 net/core/sock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ab20ed9..7383d23 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
put_page(pfrag->page);
}
 
-   /* We restrict high order allocations to users that can afford to wait */
-   order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+   order = SKB_FRAG_PAGE_ORDER;
do {
gfp_t gfp = prio;
 
-- 
1.8.4.1



[PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-12 Thread Michael Dalton
Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag
allocators) changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all workloads.
For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
buffers are preferred as larger buffers reduce the TCP window due to SKB
truesize. However, single-stream workloads with large average packet sizes
have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.

This commit auto-tunes the mergeable receiver buffer packet size by choosing
the packet buffer size based on an EWMA of the recent packet sizes for the
receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
len to PAGE_SIZE. This improves throughput for large packet workloads, as
any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
buffers.
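
Concretely, each refill derives its buffer size from the per-queue EWMA
roughly as follows (a sketch of the sizing step, matching the
add_recvbuf_mergeable() hunk quoted later in this thread):

	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	u32 buflen;

	/* Clamp the average packet length to [GOOD_PACKET_LEN,
	 * PAGE_SIZE - hdr_len], add header room, and cacheline-align. */
	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
	buflen = ALIGN(buflen, L1_CACHE_BYTES);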

These optimizations interact positively with recent commit
ba275241030c (virtio-net: coalesce rx frags when possible during rx),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
net-next trunk (MTU-size bufs):  13170.01Gb/s
net-next trunk + auto-tune: 14555.94Gb/s

Signed-off-by: Michael Dalton mwdal...@google.com
---
 drivers/net/virtio_net.c | 73 +++-
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c93054..b1086e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -27,6 +27,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-L1_CACHE_BYTES))
 #define GOOD_COPY_LEN  128
+#define RECEIVE_AVG_WEIGHT 64
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
@@ -79,6 +78,9 @@ struct receive_queue {
/* Chain pages by the private ptr. */
struct page *pages;
 
+   /* Average packet length for mergeable receive buffers. */
+   struct ewma mrg_avg_pkt_len;
+
/* Page frag for GFP_ATOMIC packet buffer allocation. */
struct page_frag atomic_frag;
 
@@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
return skb;
 }
 
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
+struct page *head_page)
 {
struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
struct sk_buff *curr_skb = head_skb;
+   struct page *page = head_page;
char *buf;
-   struct page *page;
-   int num_buf, len, offset, truesize;
+   int num_buf, len, offset;
+   u32 est_buffer_len;
 
+   len = head_skb->len;
num_buf = hdr->mhdr.num_buffers;
while (--num_buf) {
int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
head_skb->dev->stats.rx_length_errors++;
return -EINVAL;
}
-   truesize = max_t(int, len, MERGE_BUFFER_LEN);
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
if (unlikely(!nskb)) {
@@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
if (curr_skb != head_skb) {
head_skb->data_len += len;
head_skb->len += len;
-   head_skb->truesize += truesize;
+   head_skb->truesize += len;
}
page = virt_to_head_page(buf);
offset = buf - (char *)page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);
skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-  

Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header

2013-11-12 Thread Eric Dumazet
On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
 Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page
 frag allocators) changed the mergeable receive buffer size from PAGE_SIZE
 to MTU-size. However, the merge buffer size does not take into account the
 size of the virtio-net header. Consequently, packets that are MTU-size
 will take two buffers instead of one (to store the virtio-net header),
 substantially decreasing the throughput of MTU-size traffic due to TCP
 window / SKB truesize effects.
 
 This commit changes the mergeable buffer size to include the virtio-net
 header. The buffer size is cacheline-aligned because skb_page_frag_refill
 will not automatically align the requested size.
 
 Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
 between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
 vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
 cpuset, using cgroups to ensure that other processes in the system will not
 be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
 buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
 force MTU-sized packets on the receiver.
 
 net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
 net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
 net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
 net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
 
 Suggested-by: Eric Northup digitale...@google.com
 Signed-off-by: Michael Dalton mwdal...@google.com
 ---

Acked-by: Eric Dumazet eduma...@google.com




Re: [PATCH net-next 2/4] net: allow 0 order atomic page alloc in skb_page_frag_refill

2013-11-12 Thread Eric Dumazet
On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
 skb_page_frag_refill currently permits only order-0 page allocs
 unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
 higher-order page allocations whether or not GFP_WAIT is used. If
 memory cannot be allocated, the allocator will fall back to
 successively smaller page allocs (down to order-0 page allocs).
 
 This change brings skb_page_frag_refill in line with the existing
 page allocation strategy employed by netdev_alloc_frag, which attempts
 higher-order page allocations whether or not GFP_WAIT is set, falling
 back to successively lower-order page allocations on failure. Part
 of migration of virtio-net to per-receive queue page frag allocators.
 
 Signed-off-by: Michael Dalton mwdal...@google.com
 ---
  net/core/sock.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)

Acked-by: Eric Dumazet eduma...@google.com





Re: [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs

2013-11-12 Thread Eric Dumazet
On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
 The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
 mergeable rx buffer allocations. This commit migrates virtio-net to use
 per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
 mergeable rx buffer memory allocation, which now will use skb_page_frag_refill()
 for both atomic and GFP-WAIT buffer allocations.
 
 To address fragmentation concerns, if after buffer allocation there
 is too little space left in the page frag to allocate a subsequent
 buffer, the remaining space is added to the current allocated buffer
 so that the remaining space can be used to store packet data.
 
 Signed-off-by: Michael Dalton mwdal...@google.com
 ---

Acked-by: Eric Dumazet eduma...@google.com




Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header

2013-11-12 Thread Jason Wang
On 11/13/2013 06:21 AM, Michael Dalton wrote:
 Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page
 frag allocators) changed the mergeable receive buffer size from PAGE_SIZE
 to MTU-size. However, the merge buffer size does not take into account the
 size of the virtio-net header. Consequently, packets that are MTU-size
 will take two buffers instead of one (to store the virtio-net header),
 substantially decreasing the throughput of MTU-size traffic due to TCP
 window / SKB truesize effects.

 This commit changes the mergeable buffer size to include the virtio-net
 header. The buffer size is cacheline-aligned because skb_page_frag_refill
 will not automatically align the requested size.

 Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
 between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
 vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
 cpuset, using cgroups to ensure that other processes in the system will not
 be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
 buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
 force MTU-sized packets on the receiver.

 net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
 net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
 net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
 net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s

 Suggested-by: Eric Northup digitale...@google.com
 Signed-off-by: Michael Dalton mwdal...@google.com
 ---
  drivers/net/virtio_net.c | 30 --
  1 file changed, 16 insertions(+), 14 deletions(-)

 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 01f4eb5..69fb225 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
  module_param(gso, bool, 0444);
  
  /* FIXME: MTU in config. */
 -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
 +sizeof(struct virtio_net_hdr_mrg_rxbuf), \
 +L1_CACHE_BYTES))
  #define GOOD_COPY_LEN  128
  
  #define VIRTNET_DRIVER_VERSION "1.0.0"
 @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
   head_skb->dev->stats.rx_length_errors++;
   return -EINVAL;
   }
 - if (unlikely(len > MAX_PACKET_LEN)) {
 + if (unlikely(len > MERGE_BUFFER_LEN)) {
   pr_debug("%s: rx error: merge buffer too long\n",
head_skb->dev->name);
 - len = MAX_PACKET_LEN;
 + len = MERGE_BUFFER_LEN;
   }
   if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
   struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
   if (curr_skb != head_skb) {
   head_skb->data_len += len;
   head_skb->len += len;
 - head_skb->truesize += MAX_PACKET_LEN;
 + head_skb->truesize += MERGE_BUFFER_LEN;
   }
   page = virt_to_head_page(buf);
   offset = buf - (char *)page_address(page);
   if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
   put_page(page);
   skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 -  len, MAX_PACKET_LEN);
 +  len, MERGE_BUFFER_LEN);
   } else {
   skb_add_rx_frag(curr_skb, num_skb_frags, page,
 - offset, len,
 - MAX_PACKET_LEN);
 + offset, len, MERGE_BUFFER_LEN);
   }
   --rq->num;
   }
 @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
   struct page *page = virt_to_head_page(buf);
   skb = page_to_skb(rq, page,
 (char *)buf - (char *)page_address(page),
 -   len, MAX_PACKET_LEN);
 +   len, MERGE_BUFFER_LEN);
   if (unlikely(!skb)) {
   dev->stats.rx_dropped++;
   put_page(page);
 @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
   struct skb_vnet_hdr *hdr;
   int err;
  
 - skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
 + skb = __netdev_alloc_skb_ip_align(vi->dev, 

Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-12 Thread Jason Wang
On 11/13/2013 06:21 AM, Michael Dalton wrote:
 Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag
 allocators) changed the mergeable receive buffer size from PAGE_SIZE to
 MTU-size, introducing a single-stream regression for benchmarks with large
 average packet size. There is no single optimal buffer size for all workloads.
 For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
 buffers are preferred as larger buffers reduce the TCP window due to SKB
 truesize. However, single-stream workloads with large average packet sizes
 have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.

 This commit auto-tunes the mergeable receiver buffer packet size by choosing
 the packet buffer size based on an EWMA of the recent packet sizes for the
 receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
 len to PAGE_SIZE. This improves throughput for large packet workloads, as
 any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
 buffers.

Hi Michael:

There's one concern with EWMA. How well does it handle multiple streams,
each with a different packet size? E.g. there may be two flows, one with
256 bytes per packet and another with 64K. It looks like this can result
in us allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad
since the payload/truesize ratio is low) or a 1500+ byte buffer for the
64K flow (which is OK since we can do coalescing).

 These optimizations interact positively with recent commit
 ba275241030c (virtio-net: coalesce rx frags when possible during rx),
 which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
 optimizations benefit buffers of any size.

 Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
 between two QEMU VMs on a single physical machine. Each VM has two VCPUs
 with all offloads & vhost enabled. All VMs and vhost threads run in a
 single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
 in the system will not be scheduled on the benchmark CPUs. Trunk includes
 SKB rx frag coalescing.

 net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
 net-next trunk (MTU-size bufs):  13170.01Gb/s
 net-next trunk + auto-tune: 14555.94Gb/s

Do you have perf numbers without just this patch? We need to know
exactly how much the EWMA helps.

 Signed-off-by: Michael Dalton mwdal...@google.com
 ---
  drivers/net/virtio_net.c | 73 +++-
  1 file changed, 53 insertions(+), 20 deletions(-)

 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 0c93054..b1086e0 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -27,6 +27,7 @@
  #include <linux/if_vlan.h>
  #include <linux/slab.h>
  #include <linux/cpu.h>
 +#include <linux/average.h>
  
  static int napi_weight = NAPI_POLL_WEIGHT;
  module_param(napi_weight, int, 0444);
 @@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
  
  /* FIXME: MTU in config. */
  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
 -sizeof(struct virtio_net_hdr_mrg_rxbuf), \
 -L1_CACHE_BYTES))
  #define GOOD_COPY_LEN  128
 +#define RECEIVE_AVG_WEIGHT 64

Maybe we can make this a module parameter.
  
  #define VIRTNET_DRIVER_VERSION "1.0.0"
  
 @@ -79,6 +78,9 @@ struct receive_queue {
   /* Chain pages by the private ptr. */
   struct page *pages;
  
 + /* Average packet length for mergeable receive buffers. */
 + struct ewma mrg_avg_pkt_len;
 +
   /* Page frag for GFP_ATOMIC packet buffer allocation. */
   struct page_frag atomic_frag;
  
 @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
   return skb;
  }
  
 -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
 +  struct page *head_page)
  {
   struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
   struct sk_buff *curr_skb = head_skb;
 + struct page *page = head_page;
   char *buf;
 - struct page *page;
 - int num_buf, len, offset, truesize;
 + int num_buf, len, offset;
 + u32 est_buffer_len;
  
 + len = head_skb->len;
   num_buf = hdr->mhdr.num_buffers;
   while (--num_buf) {
   int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
 @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
   head_skb->dev->stats.rx_length_errors++;
   return -EINVAL;
   }
 - truesize = max_t(int, len, MERGE_BUFFER_LEN);
   if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
   struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
   if (unlikely(!nskb)) {
 @@ -338,20 +342,38 @@ static int 

Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-12 Thread Eric Dumazet
On Wed, 2013-11-13 at 15:10 +0800, Jason Wang wrote:

 There's one concern with EWMA. How well does it handle multiple streams,
 each with a different packet size? E.g. there may be two flows, one with
 256 bytes per packet and another with 64K. It looks like this can result
 in us allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad
 since the payload/truesize ratio is low) or a 1500+ byte buffer for the
 64K flow (which is OK since we can do coalescing).

It's hard to predict the future ;)

256-byte frames consume 2.5 KB anyway on a traditional NIC.
If it was a concern, we would have it already.

If you receive a mix of big and small frames, there is no win.

  +   if (page) {
  +   est_buffer_len = page_private(page);
  +   if (est_buffer_len > len) {
  +   u32 truesize_delta = est_buffer_len - len;
  +
  +   curr_skb->truesize += truesize_delta;
  +   if (curr_skb != head_skb)
  +   head_skb->truesize += truesize_delta;
  +   }
 
 Is there a chance that est_buffer_len was smaller than or equal to len?

Yes, and in this case we do not really care, see below.

  +   }
  +   ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
  return 0;
   }
   
  @@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
  skb_trim(skb, len);
  } else if (vi->mergeable_rx_bufs) {
  struct page *page = virt_to_head_page(buf);
  -   int truesize = max_t(int, len, MERGE_BUFFER_LEN);
  +   /* Use an initial truesize of 'len' bytes for page_to_skb --
  +* receive_mergeable will fixup the truesize of the last page
  +* frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
  +*/
  skb = page_to_skb(rq, page,
(char *)buf - (char *)page_address(page),
  - len, truesize);
  + len, len);
  if (unlikely(!skb)) {
  dev->stats.rx_dropped++;
  put_page(page);
  return;
  }
  -   if (receive_mergeable(rq, skb)) {
  +   if (!skb_is_nonlinear(skb))
  +   page = NULL;
  +   if (receive_mergeable(rq, skb, page)) {
  dev_kfree_skb(skb);
  return;
  }
  @@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
   static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
   {
  struct virtnet_info *vi = rq->vq->vdev->priv;
  +   const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
  struct page_frag *alloc_frag;
  char *buf;
  -   int err, len, hole;
  +   int err, hole;
  +   u32 buflen;
   
  +   buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
  +  GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
  +   buflen = ALIGN(buflen, L1_CACHE_BYTES);
  alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
  -   if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
  +   if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
  return -ENOMEM;
  buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
  get_page(alloc_frag->page);
  -   len = MERGE_BUFFER_LEN;
  -   alloc_frag->offset += len;
  +   alloc_frag->offset += buflen;
  +   set_page_private(alloc_frag->page, buflen);
 
 Not sure this is accurate, since buflen may change and several frags may
 share a single page. So the est_buffer_len we get in receive_mergeable()
 may not be the correct value.

skb->truesize has to be reasonably accurate.

For example, fast clone storage is not accounted for in TCP skbs stored
in socket write queues. That's ~256 bytes per skb of 'missing'
accounting.

This is about 10% error when TSO/GSO is off.

With this EWMA using a factor of 64, the potential error will be much
less than 10%.
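
To put a number on it, a simplified model of the weight-64 EWMA (ignoring
the fixed-point factor in linux/average.h) updates as

	avg' = (avg * 63 + sample) / 64

so one 256-byte packet against an average of 4096 moves it to
(4096 * 63 + 256) / 64 = 4036, i.e. roughly a 1.5% change per small sample.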

Small frames tend to be consumed quite fast (ACK messages, small UDP
frames) in most cases.

