Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-13 Thread Ronen Hod

On 11/13/2013 12:21 AM, Michael Dalton wrote:

Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag
allocators) changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all workloads.
For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
buffers are preferred as larger buffers reduce the TCP window due to SKB
truesize. However, single-stream workloads with large average packet sizes
have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.

This commit auto-tunes the mergeable receiver buffer packet size by choosing
the packet buffer size based on an EWMA of the recent packet sizes for the
receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
len to PAGE_SIZE. This improves throughput for large packet workloads, as
any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
buffers.
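
For illustration, a minimal sketch of the mechanism described above, assuming the
linux/average.h EWMA helpers available at the time (ewma_add()/ewma_read()) plus
clamp_t(); the helper names here are illustrative, not the patch's exact code:

/* Per received packet: fold the observed length into the per-queue estimator. */
static void virtnet_record_pkt_len(struct ewma *avg_pkt_len, unsigned int len)
{
	ewma_add(avg_pkt_len, len);
}

/* Per refill: size the next mergeable buffer from the current estimate,
 * clamped between an MTU-sized buffer and one page.
 */
static unsigned int virtnet_mergeable_buf_len(struct ewma *avg_pkt_len,
					      unsigned int hdr_len)
{
	unsigned int len = ewma_read(avg_pkt_len);

	return hdr_len + clamp_t(unsigned int, len,
				 GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
}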

These optimizations interact positively with recent commit
ba275241030c (virtio-net: coalesce rx frags when possible during rx),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
net-next trunk (MTU-size bufs):  13170.01Gb/s
net-next trunk + auto-tune: 14555.94Gb/s

Signed-off-by: Michael Dalton mwdal...@google.com
---
  drivers/net/virtio_net.c | 73 +++-
  1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c93054..b1086e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -27,6 +27,7 @@
  #include <linux/if_vlan.h>
  #include <linux/slab.h>
  #include <linux/cpu.h>
+#include <linux/average.h>
  
  static int napi_weight = NAPI_POLL_WEIGHT;

  module_param(napi_weight, int, 0444);
@@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
  
  /* FIXME: MTU in config. */

  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-L1_CACHE_BYTES))
  #define GOOD_COPY_LEN 128
+#define RECEIVE_AVG_WEIGHT 64
  
  #define VIRTNET_DRIVER_VERSION "1.0.0"
  
@@ -79,6 +78,9 @@ struct receive_queue {

/* Chain pages by the private ptr. */
struct page *pages;
  
+	/* Average packet length for mergeable receive buffers. */

+   struct ewma mrg_avg_pkt_len;
+
/* Page frag for GFP_ATOMIC packet buffer allocation. */
struct page_frag atomic_frag;
  
@@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,

return skb;
  }
  
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)

+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
+			     struct page *head_page)
  {
struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
struct sk_buff *curr_skb = head_skb;
+   struct page *page = head_page;
char *buf;
-   struct page *page;
-   int num_buf, len, offset, truesize;
+   int num_buf, len, offset;
+   u32 est_buffer_len;
  
+	len = head_skb->len;

	num_buf = hdr->mhdr.num_buffers;
	while (--num_buf) {
		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
		head_skb->dev->stats.rx_length_errors++;
return -EINVAL;
}
-   truesize = max_t(int, len, MERGE_BUFFER_LEN);
if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
if (unlikely(!nskb)) {
@@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
-			head_skb->truesize += truesize;
+			head_skb->truesize += len;
}
page = virt_to_head_page(buf);
offset = buf - (char *)page_address(page);
if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
put_page(page);

Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-13 Thread Eric Dumazet
On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote:

 I looked at how ewma works, and although it is computationally efficient,
 and it does what it is supposed to do, initially (at the first samples) it is
 strongly biased towards the value that was added at the first ewma_add.
 I suggest that you print the values of ewma_add() and ewma_read(). If you are
 happy with the results, then ignore my comments. If you are not, then I can
 provide a version that does better for the first samples.
 Unfortunately, it will be slightly less efficient.

Value is clamped by (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len)

So initial value is conservative and not really used.
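
As a hedged illustration of that clamp (assuming the linux/average.h helpers of
the time, where a freshly initialized estimator reads back as 0 until the first
ewma_add()):

	struct ewma avg;

	ewma_init(&avg, 1, RECEIVE_AVG_WEIGHT);

	/* Before any samples arrive, ewma_read() returns 0, so the clamp below
	 * falls back to the conservative MTU-sized buffer length.
	 */
	len = clamp_t(unsigned int, ewma_read(&avg),
		      GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);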

Thanks




Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-13 Thread Ronen Hod

On 11/13/2013 04:19 PM, Eric Dumazet wrote:

On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote:


I looked at how ewma works, and although it is computationally efficient,
and it does what it is supposed to do, initially (at the first samples) it is
strongly biased towards the value that was added at the first ewma_add.
I suggest that you print the values of ewma_add() and ewma_read(). If you are
happy with the results, then ignore my comments. If you are not, then I can
provide a version that does better for the first samples.
Unfortunately, it will be slightly less efficient.

Value is clamped by (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len)

So initial value is conservative and not really used.


Hi Eric,

This initial value, which you do not really want to use, will slowly fade, but it
will still pretty much dominate the returned value for the first
RECEIVE_AVG_WEIGHT (== 64) samples or so (most EWMA implementations suffer from
this bias). Naturally, it doesn't matter much if you just keep it running forever.
However, if you want to restart the learning process more often, which might make
sense upon changes, then the auto-tuning will be very sub-optimal.
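
To make the bias concrete, here is a small userspace sketch that mimics the
kernel's EWMA update rule of that era (the first sample seeds the average, then
new = (old * (weight - 1) + sample) / weight with weight = 64); the packet sizes
are made up for illustration:

#include <stdio.h>

#define WEIGHT 64

int main(void)
{
	double avg;
	int i;

	/* First sample seeds the average, as in the kernel's ewma_add(). */
	avg = 1500.0;

	/* Feed 64 samples of a much larger packet size and watch how slowly
	 * the estimate approaches it.
	 */
	for (i = 1; i <= WEIGHT; i++) {
		avg = (avg * (WEIGHT - 1) + 4096.0) / WEIGHT;
		if (i % 16 == 0)
			printf("after %2d samples: avg = %.0f\n", i, avg);
	}
	return 0;
}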

Ronen.


Thanks






Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-13 Thread Eric Dumazet
On Wed, 2013-11-13 at 18:43 +0200, Ronen Hod wrote:

 
 This initial value, which you do not really want to use, will slowly fade, but
 it will still pretty much dominate the returned value for the first
 RECEIVE_AVG_WEIGHT (== 64) samples or so (most EWMA implementations suffer from
 this bias). Naturally, it doesn't matter much if you just keep it running forever.
 However, if you want to restart the learning process more often, which might
 make sense upon changes, then the auto-tuning will be very sub-optimal.

Note that we fill a ring buffer at open time (try_fill_recv()),
all these buffers will be of the minimal size.

By the time we have refilled the ring buffer, EWMA value will be
GOOD_PACKET_LEN.

These sizes are a hint, clamped between 1500 and PAGE_SIZE.

We do not care about the very first allocated buffers, they are good enough.

We only care about the millions of subsequent allocations.

Also note the EWMA is per queue, not global to the device.

Of course, there is no 'one size' perfect for all usages.
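
(A minimal sketch of the per-queue aspect, assuming the ewma_init() signature of
the time and the mrg_avg_pkt_len field from the patch hunk above; the helper name
is illustrative:)

/* Each receive queue carries its own estimator, seeded at device setup,
 * so each queue's traffic pattern tunes its own buffer size independently.
 */
static void virtnet_init_rq_ewma(struct receive_queue *rq)
{
	ewma_init(&rq->mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
}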




Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header

2013-11-13 Thread Michael S. Tsirkin
On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote:
 Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page
 frag allocators) changed the mergeable receive buffer size from PAGE_SIZE
 to MTU-size. However, the merge buffer size does not take into account the
 size of the virtio-net header. Consequently, packets that are MTU-size
 will take two buffers instead of one (to store the virtio-net header),
 substantially decreasing the throughput of MTU-size traffic due to TCP
 window / SKB truesize effects.
 
 This commit changes the mergeable buffer size to include the virtio-net
 header. The buffer size is cacheline-aligned because skb_page_frag_refill
 will not automatically align the requested size.
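
(As a worked example, assuming a 64-byte L1 cache line and the 12-byte struct
virtio_net_hdr_mrg_rxbuf: GOOD_PACKET_LEN = 14 (ETH_HLEN) + 4 (VLAN_HLEN) +
1500 (ETH_DATA_LEN) = 1518 bytes; adding the header gives 1530; ALIGN(1530, 64)
= 1536, so an MTU-sized packet plus its header fits in a single 1536-byte buffer.)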
 
 Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
 between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
 vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
 cpuset, using cgroups to ensure that other processes in the system will not
 be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
 buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
 force MTU-sized packets on the receiver.
 
 net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
 net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
 net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
 net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
 
 Suggested-by: Eric Northup digitale...@google.com
 Signed-off-by: Michael Dalton mwdal...@google.com

Acked-by: Michael S. Tsirkin m...@redhat.com

 ---
  drivers/net/virtio_net.c | 30 --
  1 file changed, 16 insertions(+), 14 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 01f4eb5..69fb225 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
  module_param(gso, bool, 0444);
  
  /* FIXME: MTU in config. */
 -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
 +sizeof(struct virtio_net_hdr_mrg_rxbuf), \
 +L1_CACHE_BYTES))
   #define GOOD_COPY_LEN	128
  
   #define VIRTNET_DRIVER_VERSION "1.0.0"
 @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
  		head_skb->dev->stats.rx_length_errors++;
  		return -EINVAL;
  	}
 -	if (unlikely(len > MAX_PACKET_LEN)) {
 +	if (unlikely(len > MERGE_BUFFER_LEN)) {
  		pr_debug("%s: rx error: merge buffer too long\n",
  			 head_skb->dev->name);
 -		len = MAX_PACKET_LEN;
 +		len = MERGE_BUFFER_LEN;
   }
   if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
   struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
  		if (curr_skb != head_skb) {
  			head_skb->data_len += len;
  			head_skb->len += len;
 -			head_skb->truesize += MAX_PACKET_LEN;
 +			head_skb->truesize += MERGE_BUFFER_LEN;
   }
   page = virt_to_head_page(buf);
   offset = buf - (char *)page_address(page);
   if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
   put_page(page);
   skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 -  len, MAX_PACKET_LEN);
 +  len, MERGE_BUFFER_LEN);
   } else {
   skb_add_rx_frag(curr_skb, num_skb_frags, page,
 - offset, len,
 - MAX_PACKET_LEN);
 + offset, len, MERGE_BUFFER_LEN);
   }
   --rq-num;
   }
 @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
  		struct page *page = virt_to_head_page(buf);
  		skb = page_to_skb(rq, page,
  				  (char *)buf - (char *)page_address(page),
 -				  len, MAX_PACKET_LEN);
 +				  len, MERGE_BUFFER_LEN);
  		if (unlikely(!skb)) {
  			dev->stats.rx_dropped++;
   put_page(page);
 @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
  	struct skb_vnet_hdr *hdr;
  	int err;
 
 -	skb = __netdev_alloc_skb_ip_align(vi->dev,

Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance

2013-11-13 Thread Michael S. Tsirkin
On Wed, Nov 13, 2013 at 03:10:20PM +0800, Jason Wang wrote:
 On 11/13/2013 06:21 AM, Michael Dalton wrote:
  Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page frag
  allocators) changed the mergeable receive buffer size from PAGE_SIZE to
  MTU-size, introducing a single-stream regression for benchmarks with large
  average packet size. There is no single optimal buffer size for all 
  workloads.
  For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
  buffers are preferred as larger buffers reduce the TCP window due to SKB
  truesize. However, single-stream workloads with large average packet sizes
  have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.
 
  This commit auto-tunes the mergeable receiver buffer packet size by choosing
  the packet buffer size based on an EWMA of the recent packet sizes for the
  receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
  len to PAGE_SIZE. This improves throughput for large packet workloads, as
  any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
  buffers.
 
 Hi Michael:
 
 There's one concern with EWMA. How well does it handle multiple streams,
 each with a different packet size? E.g. there may be two flows, one with
 256-byte packets and another with 64K packets. It looks like this can result
 in allocating a PAGE_SIZE buffer for 256 bytes (which is bad since the
 payload/truesize ratio is low) or a 1500+ byte buffer for 64K packets (which
 is ok since we can do coalescing).
 
  These optimizations interact positively with recent commit
  ba275241030c (virtio-net: coalesce rx frags when possible during rx),
  which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
  optimizations benefit buffers of any size.
 
  Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
  between two QEMU VMs on a single physical machine. Each VM has two VCPUs
  with all offloads & vhost enabled. All VMs and vhost threads run in a
  single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
  in the system will not be scheduled on the benchmark CPUs. Trunk includes
  SKB rx frag coalescing.
 
  net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
  net-next trunk (MTU-size bufs):  13170.01Gb/s
  net-next trunk + auto-tune: 14555.94Gb/s
 
 Do you have perf numbers that just without this patch? We need to know
 how much EWMA help exactly.

Yes I'm curious too.

 
  Signed-off-by: Michael Dalton mwdal...@google.com
  ---
   drivers/net/virtio_net.c | 73 
  +++-
   1 file changed, 53 insertions(+), 20 deletions(-)
 
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index 0c93054..b1086e0 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -27,6 +27,7 @@
    #include <linux/if_vlan.h>
    #include <linux/slab.h>
    #include <linux/cpu.h>
   +#include <linux/average.h>
   
   static int napi_weight = NAPI_POLL_WEIGHT;
   module_param(napi_weight, int, 0444);
  @@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
   
   /* FIXME: MTU in config. */
   #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
  -sizeof(struct virtio_net_hdr_mrg_rxbuf), \
  -L1_CACHE_BYTES))
   #define GOOD_COPY_LEN  128
  +#define RECEIVE_AVG_WEIGHT 64
 
 Maybe we can make this a module parameter.

I'm not sure it's useful - no one is likely to tune it in practice.
But how about a comment explaining how the number was chosen?

   
   #define VIRTNET_DRIVER_VERSION "1.0.0"
   
  @@ -79,6 +78,9 @@ struct receive_queue {
  /* Chain pages by the private ptr. */
  struct page *pages;
   
  +   /* Average packet length for mergeable receive buffers. */
  +   struct ewma mrg_avg_pkt_len;
  +
  /* Page frag for GFP_ATOMIC packet buffer allocation. */
  struct page_frag atomic_frag;
   
  @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
  return skb;
   }
   
  -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
  +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
  +			     struct page *head_page)
   {
  struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
  struct sk_buff *curr_skb = head_skb;
  +   struct page *page = head_page;
  char *buf;
  -   struct page *page;
  -   int num_buf, len, offset, truesize;
  +   int num_buf, len, offset;
  +   u32 est_buffer_len;
   
  +	len = head_skb->len;
  	num_buf = hdr->mhdr.num_buffers;
  	while (--num_buf) {
  		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
  @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
  		head_skb->dev->stats.rx_length_errors++;
  return -EINVAL;
  }
  

Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header

2013-11-13 Thread Michael S. Tsirkin
On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote:
 Commit 2613af0ed18a (virtio_net: migrate mergeable rx buffers to page
 frag allocators) changed the mergeable receive buffer size from PAGE_SIZE
 to MTU-size. However, the merge buffer size does not take into account the
 size of the virtio-net header. Consequently, packets that are MTU-size
 will take two buffers instead of one (to store the virtio-net header),
 substantially decreasing the throughput of MTU-size traffic due to TCP
 window / SKB truesize effects.
 
 This commit changes the mergeable buffer size to include the virtio-net
 header. The buffer size is cacheline-aligned because skb_page_frag_refill
 will not automatically align the requested size.
 
 Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
 between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
 vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
 cpuset, using cgroups to ensure that other processes in the system will not
 be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
 buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
 force MTU-sized packets on the receiver.
 
 net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
 net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
 net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
 net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
 
 Suggested-by: Eric Northup digitale...@google.com
 Signed-off-by: Michael Dalton mwdal...@google.com

Please note this is a bugfix - useful by itself.

 ---
  drivers/net/virtio_net.c | 30 --
  1 file changed, 16 insertions(+), 14 deletions(-)
 
 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index 01f4eb5..69fb225 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
  module_param(gso, bool, 0444);
  
  /* FIXME: MTU in config. */
 -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
 +sizeof(struct virtio_net_hdr_mrg_rxbuf), \
 +L1_CACHE_BYTES))
   #define GOOD_COPY_LEN	128
  
   #define VIRTNET_DRIVER_VERSION "1.0.0"
  @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
   		head_skb->dev->stats.rx_length_errors++;
   		return -EINVAL;
   	}
  -	if (unlikely(len > MAX_PACKET_LEN)) {
  +	if (unlikely(len > MERGE_BUFFER_LEN)) {
   		pr_debug("%s: rx error: merge buffer too long\n",
   			 head_skb->dev->name);
  -		len = MAX_PACKET_LEN;
  +		len = MERGE_BUFFER_LEN;
   }
   if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
   struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
  @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
   		if (curr_skb != head_skb) {
   			head_skb->data_len += len;
   			head_skb->len += len;
  -			head_skb->truesize += MAX_PACKET_LEN;
  +			head_skb->truesize += MERGE_BUFFER_LEN;
   }
   page = virt_to_head_page(buf);
   offset = buf - (char *)page_address(page);
   if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
   put_page(page);
   skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 -  len, MAX_PACKET_LEN);
 +  len, MERGE_BUFFER_LEN);
   } else {
   skb_add_rx_frag(curr_skb, num_skb_frags, page,
 - offset, len,
 - MAX_PACKET_LEN);
 + offset, len, MERGE_BUFFER_LEN);
   }
   --rq-num;
   }
  @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
   		struct page *page = virt_to_head_page(buf);
   		skb = page_to_skb(rq, page,
   				  (char *)buf - (char *)page_address(page),
  -				  len, MAX_PACKET_LEN);
  +				  len, MERGE_BUFFER_LEN);
   		if (unlikely(!skb)) {
   			dev->stats.rx_dropped++;
   put_page(page);
  @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
   	struct skb_vnet_hdr *hdr;
   	int err;
  
  -	skb = __netdev_alloc_skb_ip_align(vi->dev,