This patch changes the runtime behavior of the IP defrag queue:
incoming in-order fragments are added to the end of the current
list/"run" of in-order fragments at the tail.

On some workloads, UDP stream performance is substantially improved:

RX: ./udp_stream -F 10 -T 2 -l 60
TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60

with this patchset applied on a 10Gbps receiver:

  throughput=9524.18
  throughput_units=Mbit/s

upstream (net-next):

  throughput=4608.93
  throughput_units=Mbit/s

Reported-by: Willem de Bruijn <will...@google.com>
Signed-off-by: Peter Oskolkov <p...@google.com>
Cc: Eric Dumazet <eduma...@google.com>
Cc: Florian Westphal <f...@strlen.de>

---
 net/ipv4/inet_fragment.c |   2 +-
 net/ipv4/ip_fragment.c   | 110 ++++++++++++++++++++++++---------------
 2 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6d258a5669e7..bcb11f3a27c0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -146,7 +146,7 @@ void inet_frag_destroy(struct inet_frag_queue *q)
                        fp = xp;
                } while (fp);
        } else {
-               sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+               sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
        }
        sum = sum_truesize + f->qsize;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 26ace9d2d976..88281fbce88c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -126,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
-                        struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+                        struct sk_buff *prev_tail, struct net_device *dev);
 
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -219,7 +219,12 @@ static void ip_expire(struct timer_list *t)
                head = skb_rb_first(&qp->q.rb_fragments);
                if (!head)
                        goto out;
-               rb_erase(&head->rbnode, &qp->q.rb_fragments);
+               if (FRAG_CB(head)->next_frag)
+                       rb_replace_node(&head->rbnode,
+                                       &FRAG_CB(head)->next_frag->rbnode,
+                                       &qp->q.rb_fragments);
+               else
+                       rb_erase(&head->rbnode, &qp->q.rb_fragments);
                memset(&head->rbnode, 0, sizeof(head->rbnode));
                barrier();
        }
@@ -320,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)
                return -ETIMEDOUT;
        }
 
-       sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+       sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
        sub_frag_mem_limit(qp->q.net, sum_truesize);
 
        qp->q.flags = 0;
@@ -329,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp)
        qp->q.fragments = NULL;
        qp->q.rb_fragments = RB_ROOT;
        qp->q.fragments_tail = NULL;
+       qp->q.last_run_head = NULL;
        qp->iif = 0;
        qp->ecn = 0;
 
@@ -340,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
        struct rb_node **rbn, *parent;
-       struct sk_buff *skb1;
+       struct sk_buff *skb1, *prev_tail;
        struct net_device *dev;
        unsigned int fragsize;
        int flags, offset;
@@ -418,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
         */
 
        /* Find out where to put this fragment.  */
-       skb1 = qp->q.fragments_tail;
-       if (!skb1) {
-               /* This is the first fragment we've received. */
-               rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
-               qp->q.fragments_tail = skb;
-       } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
-               /* This is the common/special case: skb goes to the end. */
+       prev_tail = qp->q.fragments_tail;
+       if (!prev_tail)
+               ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
+       else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
+               /* This is the common case: skb goes to the end. */
                /* Detect and discard overlaps. */
-               if (offset < (skb1->ip_defrag_offset + skb1->len))
+               if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
                        goto discard_qp;
-               /* Insert after skb1. */
-               rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
-               qp->q.fragments_tail = skb;
+               if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+                       ip4_frag_append_to_last_run(&qp->q, skb);
+               else
+                       ip4_frag_create_run(&qp->q, skb);
        } else {
-               /* Binary search. Note that skb can become the first fragment, but
-                * not the last (covered above). */
+               /* Binary search. Note that skb can become the first fragment,
+                * but not the last (covered above).
+                */
                rbn = &qp->q.rb_fragments.rb_node;
                do {
                        parent = *rbn;
                        skb1 = rb_to_skb(parent);
                        if (end <= skb1->ip_defrag_offset)
                                rbn = &parent->rb_left;
-                       else if (offset >= skb1->ip_defrag_offset + skb1->len)
+                       else if (offset >= skb1->ip_defrag_offset +
+                                               FRAG_CB(skb1)->frag_run_len)
                                rbn = &parent->rb_right;
                        else /* Found an overlap with skb1. */
                                goto discard_qp;
                } while (*rbn);
                /* Here we have parent properly set, and rbn pointing to
-                * one of its NULL left/right children. Insert skb. */
+                * one of its NULL left/right children. Insert skb.
+                */
+               ip4_frag_init_run(skb);
                rb_link_node(&skb->rbnode, parent, rbn);
+               rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
        }
-       rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
        if (dev)
                qp->iif = dev->ifindex;
@@ -476,7 +485,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
                unsigned long orefdst = skb->_skb_refdst;
 
                skb->_skb_refdst = 0UL;
-               err = ip_frag_reasm(qp, skb, dev);
+               err = ip_frag_reasm(qp, skb, prev_tail, dev);
                skb->_skb_refdst = orefdst;
                return err;
        }
@@ -495,7 +504,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 
 /* Build a new IP datagram from all its fragments. */
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-                        struct net_device *dev)
+                        struct sk_buff *prev_tail, struct net_device *dev)
 {
        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
        struct iphdr *iph;
@@ -519,10 +528,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
                fp = skb_clone(skb, GFP_ATOMIC);
                if (!fp)
                        goto out_nomem;
-               rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+               FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+               if (RB_EMPTY_NODE(&skb->rbnode))
+                       FRAG_CB(prev_tail)->next_frag = fp;
+               else
+                       rb_replace_node(&skb->rbnode, &fp->rbnode,
+                                       &qp->q.rb_fragments);
                if (qp->q.fragments_tail == skb)
                        qp->q.fragments_tail = fp;
                skb_morph(skb, head);
+               FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
                rb_replace_node(&head->rbnode, &skb->rbnode,
                                &qp->q.rb_fragments);
                consume_skb(head);
@@ -558,7 +573,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->len = clone->data_len = head->data_len - plen;
-               skb->truesize += clone->truesize;
+               head->truesize += clone->truesize;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                add_frag_mem_limit(qp->q.net, clone->truesize);
@@ -571,24 +586,36 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
        skb_push(head, head->data - skb_network_header(head));
 
        /* Traverse the tree in order, to build frag_list. */
+       fp = FRAG_CB(head)->next_frag;
        rbn = rb_next(&head->rbnode);
        rb_erase(&head->rbnode, &qp->q.rb_fragments);
-       while (rbn) {
-               struct rb_node *rbnext = rb_next(rbn);
-               fp = rb_to_skb(rbn);
-               rb_erase(rbn, &qp->q.rb_fragments);
-               rbn = rbnext;
-               *nextp = fp;
-               nextp = &fp->next;
-               fp->prev = NULL;
-               memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-               head->data_len += fp->len;
-               head->len += fp->len;
-               if (head->ip_summed != fp->ip_summed)
-                       head->ip_summed = CHECKSUM_NONE;
-               else if (head->ip_summed == CHECKSUM_COMPLETE)
-                       head->csum = csum_add(head->csum, fp->csum);
-               head->truesize += fp->truesize;
+       while (rbn || fp) {
+               /* fp points to the next sk_buff in the current run;
+                * rbn points to the next run.
+                */
+               /* Go through the current run. */
+               while (fp) {
+                       *nextp = fp;
+                       nextp = &fp->next;
+                       fp->prev = NULL;
+                       memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+                       head->data_len += fp->len;
+                       head->len += fp->len;
+                       if (head->ip_summed != fp->ip_summed)
+                               head->ip_summed = CHECKSUM_NONE;
+                       else if (head->ip_summed == CHECKSUM_COMPLETE)
+                               head->csum = csum_add(head->csum, fp->csum);
+                       head->truesize += fp->truesize;
+                       fp = FRAG_CB(fp)->next_frag;
+               }
+               /* Move to the next run. */
+               if (rbn) {
+                       struct rb_node *rbnext = rb_next(rbn);
+
+                       fp = rb_to_skb(rbn);
+                       rb_erase(rbn, &qp->q.rb_fragments);
+                       rbn = rbnext;
+               }
        }
        sub_frag_mem_limit(qp->q.net, head->truesize);
 
@@ -624,6 +651,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
        qp->q.fragments = NULL;
        qp->q.rb_fragments = RB_ROOT;
        qp->q.fragments_tail = NULL;
+       qp->q.last_run_head = NULL;
        return 0;
 
 out_nomem:
-- 
2.18.0.597.ga71716f1ad-goog

Reply via email to