Hi,

markus@ has seen problems with IPv4 fragments on high volume IPsec
tunnels.  The fragment id gets reused to fast, this causes packet
loss and the situation does not recover.  Details are in RFC 4963
"IPv4 Reassembly Errors at High Data Rates".

In ESP IPsec tunnels source/destination address and protocol are
always the same, and IPv4 fragment id has only 16 bit.  To solve
this we have to timeout such fragments more aggressively.

To avoid that high volume IPsec causes aggressive dops for other
fragements, we have to split the data structure into two red-black
trees.  On is holding the address and protocol information and the
other has only the fragment id.  This allows to react for specific
connections.

Adjusted timeouts will follow in the next diff.

ok?

bluhm

Index: net/pf_norm.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf_norm.c,v
retrieving revision 1.206
diff -u -p -r1.206 pf_norm.c
--- net/pf_norm.c       19 Jun 2017 17:58:49 -0000      1.206
+++ net/pf_norm.c       23 Jun 2017 12:25:38 -0000
@@ -76,29 +76,28 @@ struct pf_frent {
        u_int16_t        fe_mff;        /* more fragment flag */
 };
 
-/* keep synced with struct pf_fragment, used in RB_FIND */
-struct pf_fragment_cmp {
-       struct pf_addr  fr_src;
-       struct pf_addr  fr_dst;
-       u_int32_t       fr_id;
-       sa_family_t     fr_af;
-       u_int8_t        fr_proto;
-       u_int8_t        fr_direction;
+RB_HEAD(pf_frag_tree, pf_fragment);
+struct pf_frnode {
+       struct pf_addr  fn_src;         /* ip source address */
+       struct pf_addr  fn_dst;         /* ip destination address */
+       sa_family_t     fn_af;          /* address family */
+       u_int8_t        fn_proto;       /* protocol for fragments in fn_tree */
+       u_int8_t        fn_direction;   /* pf packet direction */
+       u_int32_t       fn_fragments;   /* number of entries in fn_tree */
+
+       RB_ENTRY(pf_frnode) fn_entry;
+       struct pf_frag_tree fn_tree;    /* matching fragments, lookup by id */
 };
 
 struct pf_fragment {
-       struct pf_addr  fr_src;         /* ip source address */
-       struct pf_addr  fr_dst;         /* ip destination address */
        u_int32_t       fr_id;          /* fragment id for reassemble */
-       sa_family_t     fr_af;          /* address family */
-       u_int8_t        fr_proto;       /* protocol of this fragment */
-       u_int8_t        fr_direction;   /* pf packet direction */
 
        RB_ENTRY(pf_fragment) fr_entry;
        TAILQ_ENTRY(pf_fragment) frag_next;
        TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
        int32_t         fr_timeout;
        u_int16_t       fr_maxlen;      /* maximum length of single fragment */
+       struct pf_frnode *fr_node;      /* ip src/dst/proto/af for fragments */
 };
 
 struct pf_fragment_tag {
@@ -109,19 +108,23 @@ struct pf_fragment_tag {
 
 TAILQ_HEAD(pf_fragqueue, pf_fragment)  pf_fragqueue;
 
+static __inline int     pf_frnode_compare(struct pf_frnode *,
+                           struct pf_frnode *);
+RB_HEAD(pf_frnode_tree, pf_frnode)     pf_frnode_tree;
+RB_PROTOTYPE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+RB_GENERATE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+
 static __inline int     pf_frag_compare(struct pf_fragment *,
                            struct pf_fragment *);
-RB_HEAD(pf_frag_tree, pf_fragment)     pf_frag_tree, pf_cache_tree;
 RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
 RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
 
 /* Private prototypes */
 void                    pf_flush_fragments(void);
 void                    pf_free_fragment(struct pf_fragment *);
-struct pf_fragment     *pf_find_fragment(struct pf_fragment_cmp *,
-                           struct pf_frag_tree *);
+struct pf_fragment     *pf_find_fragment(struct pf_frnode *, u_int32_t);
 struct pf_frent                *pf_create_fragment(u_short *);
-struct pf_fragment     *pf_fillup_fragment(struct pf_fragment_cmp *,
+struct pf_fragment     *pf_fillup_fragment(struct pf_frnode *, u_int32_t,
                            struct pf_frent *, u_short *);
 int                     pf_isfull_fragment(struct pf_fragment *);
 struct mbuf            *pf_join_fragment(struct pf_fragment *);
@@ -132,7 +135,7 @@ int                  pf_reassemble6(struct mbuf **, st
 #endif /* INET6 */
 
 /* Globals */
-struct pool             pf_frent_pl, pf_frag_pl;
+struct pool             pf_frent_pl, pf_frag_pl, pf_frnode_pl;
 struct pool             pf_state_scrub_pl;
 int                     pf_nfrents;
 
@@ -153,6 +156,8 @@ pf_normalize_init(void)
 {
        pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0,
            IPL_SOFTNET, 0, "pffrent", NULL);
+       pool_init(&pf_frnode_pl, sizeof(struct pf_frnode), 0,
+           IPL_SOFTNET, 0, "pffrnode", NULL);
        pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0,
            IPL_SOFTNET, 0, "pffrag", NULL);
        pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0,
@@ -167,19 +172,28 @@ pf_normalize_init(void)
 }
 
 static __inline int
-pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+pf_frnode_compare(struct pf_frnode *a, struct pf_frnode *b)
 {
        int     diff;
 
-       if ((diff = a->fr_id - b->fr_id) != 0)
+       if ((diff = a->fn_proto - b->fn_proto) != 0)
                return (diff);
-       if ((diff = a->fr_proto - b->fr_proto) != 0)
+       if ((diff = a->fn_af - b->fn_af) != 0)
                return (diff);
-       if ((diff = a->fr_af - b->fr_af) != 0)
+       if ((diff = pf_addr_compare(&a->fn_src, &b->fn_src, a->fn_af)) != 0)
                return (diff);
-       if ((diff = pf_addr_compare(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
+       if ((diff = pf_addr_compare(&a->fn_dst, &b->fn_dst, a->fn_af)) != 0)
                return (diff);
-       if ((diff = pf_addr_compare(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
+
+       return (0);
+}
+
+static __inline int
+pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+{
+       int     diff;
+
+       if ((diff = a->fr_id - b->fr_id) != 0)
                return (diff);
 
        return (0);
@@ -231,8 +245,16 @@ void
 pf_free_fragment(struct pf_fragment *frag)
 {
        struct pf_frent         *frent;
+       struct pf_frnode        *frnode;
 
-       RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
+       frnode = frag->fr_node;
+       RB_REMOVE(pf_frag_tree, &frnode->fn_tree, frag);
+       KASSERT(frnode->fn_fragments >= 1);
+       frnode->fn_fragments--;
+       if (frnode->fn_fragments == 0) {
+               RB_REMOVE(pf_frnode_tree, &pf_frnode_tree, frnode);
+               pool_put(&pf_frnode_pl, frnode);
+       }
        TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
 
        /* Free all fragment entries */
@@ -246,15 +268,21 @@ pf_free_fragment(struct pf_fragment *fra
 }
 
 struct pf_fragment *
-pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
+pf_find_fragment(struct pf_frnode *key, u_int32_t id)
 {
-       struct pf_fragment      *frag;
+       struct pf_fragment      *frag, idkey;
+       struct pf_frnode        *frnode;
 
-       frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
-       if (frag != NULL) {
-               TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
-               TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
-       }
+       frnode = RB_FIND(pf_frnode_tree, &pf_frnode_tree, key);
+       if (frnode == NULL)
+               return (NULL);
+       KASSERT(frnode->fn_fragments >= 1);
+       idkey.fr_id = id;
+       frag = RB_FIND(pf_frag_tree, &frnode->fn_tree, &idkey);
+       if (frag == NULL)
+               return (NULL);
+       TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
+       TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
 
        return (frag);
 }
@@ -279,11 +307,12 @@ pf_create_fragment(u_short *reason)
 }
 
 struct pf_fragment *
-pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
-    u_short *reason)
+pf_fillup_fragment(struct pf_frnode *key, u_int32_t id,
+    struct pf_frent *frent, u_short *reason)
 {
        struct pf_frent         *after, *next, *prev;
        struct pf_fragment      *frag;
+       struct pf_frnode        *frnode;
        u_int16_t                total;
 
        /* No empty fragments */
@@ -306,12 +335,12 @@ pf_fillup_fragment(struct pf_fragment_cm
                goto bad_fragment;
        }
 
-       DPFPRINTF(LOG_INFO, key->fr_af == AF_INET ?
+       DPFPRINTF(LOG_INFO, key->fn_af == AF_INET ?
            "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d",
-           key->fr_id, frent->fe_off, frent->fe_off + frent->fe_len);
+           id, frent->fe_off, frent->fe_off + frent->fe_len);
 
        /* Fully buffer all of the fragments in this fragment queue */
-       frag = pf_find_fragment(key, &pf_frag_tree);
+       frag = pf_find_fragment(key, id);
 
        /* Create a new reassembly queue for this packet */
        if (frag == NULL) {
@@ -324,13 +353,32 @@ pf_fillup_fragment(struct pf_fragment_cm
                                goto drop_fragment;
                        }
                }
-
-               *(struct pf_fragment_cmp *)frag = *key;
+               frnode = RB_FIND(pf_frnode_tree, &pf_frnode_tree, key);
+               if (frnode == NULL) {
+                       frnode = pool_get(&pf_frnode_pl, PR_NOWAIT);
+                       if (frnode == NULL) {
+                               pf_flush_fragments();
+                               frnode = pool_get(&pf_frnode_pl, PR_NOWAIT);
+                               if (frnode == NULL) {
+                                       REASON_SET(reason, PFRES_MEMORY);
+                                       pool_put(&pf_frag_pl, frag);
+                                       goto drop_fragment;
+                               }
+                       }
+                       *frnode = *key;
+                       RB_INIT(&frnode->fn_tree);
+                       frnode->fn_fragments = 0;
+               }
                TAILQ_INIT(&frag->fr_queue);
                frag->fr_timeout = time_uptime;
                frag->fr_maxlen = frent->fe_len;
-
-               RB_INSERT(pf_frag_tree, &pf_frag_tree, frag);
+               frag->fr_id = id;
+               frag->fr_node = frnode;
+               /* RB_INSERT cannot fail as pf_find_fragment() found nothing */
+               RB_INSERT(pf_frag_tree, &frnode->fn_tree, frag);
+               frnode->fn_fragments++;
+               if (frnode->fn_fragments == 1)
+                       RB_INSERT(pf_frnode_tree, &pf_frnode_tree, frnode);
                TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
 
                /* We do not have a previous fragment */
@@ -340,6 +388,7 @@ pf_fillup_fragment(struct pf_fragment_cm
        }
 
        KASSERT(!TAILQ_EMPTY(&frag->fr_queue));
+       KASSERT(frag->fr_node);
 
        /* Remember maximum fragment len for refragmentation */
        if (frent->fe_len > frag->fr_maxlen)
@@ -377,7 +426,7 @@ pf_fillup_fragment(struct pf_fragment_cm
                u_int16_t       precut;
 
 #ifdef INET6
-               if (frag->fr_af == AF_INET6)
+               if (frag->fr_node->fn_af == AF_INET6)
                        goto free_fragment;
 #endif /* INET6 */
 
@@ -397,7 +446,7 @@ pf_fillup_fragment(struct pf_fragment_cm
                u_int16_t       aftercut;
 
 #ifdef INET6
-               if (frag->fr_af == AF_INET6)
+               if (frag->fr_node->fn_af == AF_INET6)
                        goto free_fragment;
 #endif /* INET6 */
 
@@ -428,7 +477,7 @@ pf_fillup_fragment(struct pf_fragment_cm
 
 free_ipv6_fragment:
 #ifdef INET6
-       if (frag->fr_af == AF_INET)
+       if (frag->fr_node->fn_af == AF_INET)
                goto bad_fragment;
 free_fragment:
        /*
@@ -530,7 +579,7 @@ pf_reassemble(struct mbuf **m0, int dir,
        struct ip               *ip = mtod(m, struct ip *);
        struct pf_frent         *frent;
        struct pf_fragment      *frag;
-       struct pf_fragment_cmp   key;
+       struct pf_frnode         key;
        u_int16_t                total, hdrlen;
 
        /* Get an entry for the fragment queue */
@@ -544,15 +593,15 @@ pf_reassemble(struct mbuf **m0, int dir,
        frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
        frent->fe_mff = ntohs(ip->ip_off) & IP_MF;
 
-       key.fr_src.v4 = ip->ip_src;
-       key.fr_dst.v4 = ip->ip_dst;
-       key.fr_af = AF_INET;
-       key.fr_proto = ip->ip_p;
-       key.fr_id = ip->ip_id;
-       key.fr_direction = dir;
+       key.fn_src.v4 = ip->ip_src;
+       key.fn_dst.v4 = ip->ip_dst;
+       key.fn_af = AF_INET;
+       key.fn_proto = ip->ip_p;
+       key.fn_direction = dir;
 
        PF_FRAG_LOCK();
-       if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+       if ((frag = pf_fillup_fragment(&key, ip->ip_id, frent, reason))
+           == NULL) {
                PF_FRAG_UNLOCK();
                return (PF_DROP);
        }
@@ -611,7 +660,7 @@ pf_reassemble6(struct mbuf **m0, struct 
        struct pf_fragment_tag  *ftag;
        struct pf_frent         *frent;
        struct pf_fragment      *frag;
-       struct pf_fragment_cmp   key;
+       struct pf_frnode         key;
        int                      off;
        u_int16_t                total, maxlen;
        u_int8_t                 proto;
@@ -627,16 +676,16 @@ pf_reassemble6(struct mbuf **m0, struct 
        frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
        frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
 
-       key.fr_src.v6 = ip6->ip6_src;
-       key.fr_dst.v6 = ip6->ip6_dst;
-       key.fr_af = AF_INET6;
+       key.fn_src.v6 = ip6->ip6_src;
+       key.fn_dst.v6 = ip6->ip6_dst;
+       key.fn_af = AF_INET6;
        /* Only the first fragment's protocol is relevant */
-       key.fr_proto = 0;
-       key.fr_id = fraghdr->ip6f_ident;
-       key.fr_direction = dir;
+       key.fn_proto = 0;
+       key.fn_direction = dir;
 
        PF_FRAG_LOCK();
-       if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+       if ((frag = pf_fillup_fragment(&key, fraghdr->ip6f_ident, frent,
+           reason)) == NULL) {
                PF_FRAG_UNLOCK();
                return (PF_DROP);
        }

Reply via email to