Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
On 11/13/2013 12:21 AM, Michael Dalton wrote: Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag allocators) changed the mergeable receive buffer size from PAGE_SIZE to MTU-size, introducing a single-stream regression for benchmarks with large average packet size. There is no single optimal buffer size for all workloads. For workloads with packet size = MTU bytes, MTU + virtio-net header-sized buffers are preferred as larger buffers reduce the TCP window due to SKB truesize. However, single-stream workloads with large average packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers are used. This commit auto-tunes the mergeable receiver buffer packet size by choosing the packet buffer size based on an EWMA of the recent packet sizes for the receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header len to PAGE_SIZE. This improves throughput for large packet workloads, as any workload with average packet size = PAGE_SIZE will use PAGE_SIZE buffers. These optimizations interact positively with recent commit ba275241030c (virtio-net: coalesce rx frags when possible during rx), which coalesces adjacent RX SKB fragments in virtio_net. The coalescing optimizations benefit buffers of any size. Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs between two QEMU VMs on a single physical machine. Each VM has two VCPUs with all offloads vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup cpuset, using cgroups to ensure that other processes in the system will not be scheduled on the benchmark CPUs. Trunk includes SKB rx frag coalescing. 
net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s net-next trunk (MTU-size bufs): 13170.01Gb/s net-next trunk + auto-tune: 14555.94Gb/s Signed-off-by: Michael Dalton mwdal...@google.com --- drivers/net/virtio_net.c | 73 +++- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0c93054..b1086e0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -27,6 +27,7 @@ #include linux/if_vlan.h #include linux/slab.h #include linux/cpu.h +#include linux/average.h static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); @@ -37,10 +38,8 @@ module_param(gso, bool, 0444); /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \ -sizeof(struct virtio_net_hdr_mrg_rxbuf), \ -L1_CACHE_BYTES)) #define GOOD_COPY_LEN 128 +#define RECEIVE_AVG_WEIGHT 64 #define VIRTNET_DRIVER_VERSION 1.0.0 @@ -79,6 +78,9 @@ struct receive_queue { /* Chain pages by the private ptr. */ struct page *pages; + /* Average packet length for mergeable receive buffers. */ + struct ewma mrg_avg_pkt_len; + /* Page frag for GFP_ATOMIC packet buffer allocation. 
*/ struct page_frag atomic_frag; @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq, return skb; } -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb, +struct page *head_page) { struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb); struct sk_buff *curr_skb = head_skb; + struct page *page = head_page; char *buf; - struct page *page; - int num_buf, len, offset, truesize; + int num_buf, len, offset; + u32 est_buffer_len; + len = head_skb-len; num_buf = hdr-mhdr.num_buffers; while (--num_buf) { int num_skb_frags = skb_shinfo(curr_skb)-nr_frags; @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) head_skb-dev-stats.rx_length_errors++; return -EINVAL; } - truesize = max_t(int, len, MERGE_BUFFER_LEN); if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); if (unlikely(!nskb)) { @@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) if (curr_skb != head_skb) { head_skb-data_len += len; head_skb-len += len; - head_skb-truesize += truesize; + head_skb-truesize += len; } page = virt_to_head_page(buf); offset = buf - (char *)page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { put_page(page);
Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote: I looked at how ewma works, and although it is computationally efficient, and it does what it is supposed to do, initially (at the first samples) it is strongly biased towards the value that was added at the first ewma_add. I suggest that you print the values of ewma_add() and ewma_read(). If you are happy with the results, then ignore my comments. If you are not, then I can provide a version that does better for the first samples. Unfortunately, it will be slightly less efficient. Value is clamped between (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len), so the initial value is conservative and not really used. Thanks ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
On 11/13/2013 04:19 PM, Eric Dumazet wrote: On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote: I looked at how ewma works, and although it is computationally efficient, and it does what it is supposed to do, initially (at the first samples) it is strongly biased towards the value that was added at the first ewma_add. I suggest that you print the values of ewma_add() and ewma_read(). If you are happy with the results, then ignore my comments. If you are not, then I can provide a version that does better for the first samples. Unfortunately, it will be slightly less efficient. Value is clamped by (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len) So initial value is conservative and not really used. Hi Eric, This initial value, that you do not really want to use, will slowly fade, but it will still pretty much dominate the returned value for the first RECEIVE_AVG_WEIGHT(==64) samples or so (most ewma implementations suffer from this bug). Naturally, it doesn't matter much if you just keep it running forever. However, if you will want to restart the learning process more often, which might make sense upon changes, then the auto-tuning will be very sub-optimal. Ronen. Thanks -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
On Wed, 2013-11-13 at 18:43 +0200, Ronen Hod wrote: This initial value, that you do not really want to use, will slowly fade, but it will still pretty much dominate the returned value for the first RECEIVE_AVG_WEIGHT(==64) samples or so (most ewma implementations suffer from this bug). Naturally, it doesn't matter much if you just keep it running forever. However, if you will want to restart the learning process more often, which might make sense upon changes, then the auto-tuning will be very sub-optimal. Note that we fill a ring buffer at open time (try_fill_recv()), all these buffers will be of the minimal size. By the time we have refilled the ring buffer, the EWMA value will be GOOD_PACKET_LEN. These sizes are a hint, clamped between 1500 and PAGE_SIZE. We do not care about the very first allocated buffers; they are good enough. We only care about the millions of following allocations. Also note the EWMA is per queue, not global to the device. Of course, there is no 'one size' perfect for all usages. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote: Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag allocators) changed the mergeable receive buffer size from PAGE_SIZE to MTU-size. However, the merge buffer size does not take into account the size of the virtio-net header. Consequently, packets that are MTU-size will take two buffers intead of one (to store the virtio-net header), substantially decreasing the throughput of MTU-size traffic due to TCP window / SKB truesize effects. This commit changes the mergeable buffer size to include the virtio-net header. The buffer size is cacheline-aligned because skb_page_frag_refill will not automatically align the requested size. Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs between two QEMU VMs on a single physical machine. Each VM has two VCPUs and vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup cpuset, using cgroups to ensure that other processes in the system will not be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to force MTU-sized packets on the receiver. next-net trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s net-next trunk (MTU 1500- packet uses two buf due to size bug): 4076.62Gb/s net-next trunk (MTU 1480- packet fits in one buf): 6301.34Gb/s net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s Suggested-by: Eric Northup digitale...@google.com Signed-off-by: Michael Dalton mwdal...@google.com Acked-by: Michael S. Tsirkin m...@redhat.com --- drivers/net/virtio_net.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 01f4eb5..69fb225 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -36,7 +36,10 @@ module_param(csum, bool, 0444); module_param(gso, bool, 0444); /* FIXME: MTU in config. 
*/ -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \ +sizeof(struct virtio_net_hdr_mrg_rxbuf), \ +L1_CACHE_BYTES)) #define GOOD_COPY_LEN128 #define VIRTNET_DRIVER_VERSION 1.0.0 @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) head_skb-dev-stats.rx_length_errors++; return -EINVAL; } - if (unlikely(len MAX_PACKET_LEN)) { + if (unlikely(len MERGE_BUFFER_LEN)) { pr_debug(%s: rx error: merge buffer too long\n, head_skb-dev-name); - len = MAX_PACKET_LEN; + len = MERGE_BUFFER_LEN; } if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) if (curr_skb != head_skb) { head_skb-data_len += len; head_skb-len += len; - head_skb-truesize += MAX_PACKET_LEN; + head_skb-truesize += MERGE_BUFFER_LEN; } page = virt_to_head_page(buf); offset = buf - (char *)page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { put_page(page); skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, - len, MAX_PACKET_LEN); + len, MERGE_BUFFER_LEN); } else { skb_add_rx_frag(curr_skb, num_skb_frags, page, - offset, len, - MAX_PACKET_LEN); + offset, len, MERGE_BUFFER_LEN); } --rq-num; } @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) struct page *page = virt_to_head_page(buf); skb = page_to_skb(rq, page, (char *)buf - (char *)page_address(page), - len, MAX_PACKET_LEN); + len, MERGE_BUFFER_LEN); if (unlikely(!skb)) { dev-stats.rx_dropped++; put_page(page); @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) struct skb_vnet_hdr *hdr; int err; - skb = __netdev_alloc_skb_ip_align(vi-dev,
Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
On Wed, Nov 13, 2013 at 03:10:20PM +0800, Jason Wang wrote: On 11/13/2013 06:21 AM, Michael Dalton wrote: Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag allocators) changed the mergeable receive buffer size from PAGE_SIZE to MTU-size, introducing a single-stream regression for benchmarks with large average packet size. There is no single optimal buffer size for all workloads. For workloads with packet size = MTU bytes, MTU + virtio-net header-sized buffers are preferred as larger buffers reduce the TCP window due to SKB truesize. However, single-stream workloads with large average packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers are used. This commit auto-tunes the mergeable receiver buffer packet size by choosing the packet buffer size based on an EWMA of the recent packet sizes for the receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header len to PAGE_SIZE. This improves throughput for large packet workloads, as any workload with average packet size = PAGE_SIZE will use PAGE_SIZE buffers. Hi Michael: There's one concern with EWMA. How well does it handle multiple streams each with different packet size? E.g there may be two flows, one with 256 bytes each packet another is 64K. Looks like it can result we allocate PAGE_SIZE buffer for 256 (which is bad since the payload/truesize is low) bytes or 1500+ for 64K buffer (which is ok since we can do coalescing). These optimizations interact positively with recent commit ba275241030c (virtio-net: coalesce rx frags when possible during rx), which coalesces adjacent RX SKB fragments in virtio_net. The coalescing optimizations benefit buffers of any size. Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs between two QEMU VMs on a single physical machine. Each VM has two VCPUs with all offloads vhost enabled. 
All VMs and vhost threads run in a single 4 CPU cgroup cpuset, using cgroups to ensure that other processes in the system will not be scheduled on the benchmark CPUs. Trunk includes SKB rx frag coalescing. net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s net-next trunk (MTU-size bufs): 13170.01Gb/s net-next trunk + auto-tune: 14555.94Gb/s Do you have perf numbers that just without this patch? We need to know how much EWMA help exactly. Yes I'm curious too. Signed-off-by: Michael Dalton mwdal...@google.com --- drivers/net/virtio_net.c | 73 +++- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0c93054..b1086e0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -27,6 +27,7 @@ #include linux/if_vlan.h #include linux/slab.h #include linux/cpu.h +#include linux/average.h static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); @@ -37,10 +38,8 @@ module_param(gso, bool, 0444); /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \ -sizeof(struct virtio_net_hdr_mrg_rxbuf), \ -L1_CACHE_BYTES)) #define GOOD_COPY_LEN 128 +#define RECEIVE_AVG_WEIGHT 64 Maybe we can make this as a module parameter. I'm not sure it's useful - no one is likely to tune it in practice. But how about a comment explaining how was the number chosen? #define VIRTNET_DRIVER_VERSION 1.0.0 @@ -79,6 +78,9 @@ struct receive_queue { /* Chain pages by the private ptr. */ struct page *pages; + /* Average packet length for mergeable receive buffers. */ + struct ewma mrg_avg_pkt_len; + /* Page frag for GFP_ATOMIC packet buffer allocation. 
*/ struct page_frag atomic_frag; @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq, return skb; } -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb, +struct page *head_page) { struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb); struct sk_buff *curr_skb = head_skb; + struct page *page = head_page; char *buf; - struct page *page; - int num_buf, len, offset, truesize; + int num_buf, len, offset; + u32 est_buffer_len; + len = head_skb-len; num_buf = hdr-mhdr.num_buffers; while (--num_buf) { int num_skb_frags = skb_shinfo(curr_skb)-nr_frags; @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) head_skb-dev-stats.rx_length_errors++; return -EINVAL; }
Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote: Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag allocators) changed the mergeable receive buffer size from PAGE_SIZE to MTU-size. However, the merge buffer size does not take into account the size of the virtio-net header. Consequently, packets that are MTU-size will take two buffers intead of one (to store the virtio-net header), substantially decreasing the throughput of MTU-size traffic due to TCP window / SKB truesize effects. This commit changes the mergeable buffer size to include the virtio-net header. The buffer size is cacheline-aligned because skb_page_frag_refill will not automatically align the requested size. Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs between two QEMU VMs on a single physical machine. Each VM has two VCPUs and vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup cpuset, using cgroups to ensure that other processes in the system will not be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to force MTU-sized packets on the receiver. next-net trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s net-next trunk (MTU 1500- packet uses two buf due to size bug): 4076.62Gb/s net-next trunk (MTU 1480- packet fits in one buf): 6301.34Gb/s net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s Suggested-by: Eric Northup digitale...@google.com Signed-off-by: Michael Dalton mwdal...@google.com Please note this is a bugfix - useful by itself. --- drivers/net/virtio_net.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 01f4eb5..69fb225 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -36,7 +36,10 @@ module_param(csum, bool, 0444); module_param(gso, bool, 0444); /* FIXME: MTU in config. 
*/ -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \ +sizeof(struct virtio_net_hdr_mrg_rxbuf), \ +L1_CACHE_BYTES)) #define GOOD_COPY_LEN128 #define VIRTNET_DRIVER_VERSION 1.0.0 @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) head_skb-dev-stats.rx_length_errors++; return -EINVAL; } - if (unlikely(len MAX_PACKET_LEN)) { + if (unlikely(len MERGE_BUFFER_LEN)) { pr_debug(%s: rx error: merge buffer too long\n, head_skb-dev-name); - len = MAX_PACKET_LEN; + len = MERGE_BUFFER_LEN; } if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb) if (curr_skb != head_skb) { head_skb-data_len += len; head_skb-len += len; - head_skb-truesize += MAX_PACKET_LEN; + head_skb-truesize += MERGE_BUFFER_LEN; } page = virt_to_head_page(buf); offset = buf - (char *)page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { put_page(page); skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, - len, MAX_PACKET_LEN); + len, MERGE_BUFFER_LEN); } else { skb_add_rx_frag(curr_skb, num_skb_frags, page, - offset, len, - MAX_PACKET_LEN); + offset, len, MERGE_BUFFER_LEN); } --rq-num; } @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) struct page *page = virt_to_head_page(buf); skb = page_to_skb(rq, page, (char *)buf - (char *)page_address(page), - len, MAX_PACKET_LEN); + len, MERGE_BUFFER_LEN); if (unlikely(!skb)) { dev-stats.rx_dropped++; put_page(page); @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) struct skb_vnet_hdr *hdr; int err; - skb = __netdev_alloc_skb_ip_align(vi-dev,