Hi, markus@ has seen problems with IPv4 fragments on high volume IPsec tunnels. The fragment id gets reused too fast, this causes packet loss and the situation does not recover. Details are in RFC 4963 "IPv4 Reassembly Errors at High Data Rates".
In ESP IPsec tunnels source/destination address and protocol are always the same, and IPv4 fragment id has only 16 bits. To solve this we have to timeout such fragments more aggressively. To avoid that high volume IPsec causes aggressive drops for other fragments, we have to split the data structure into two red-black trees. One is holding the address and protocol information and the other has only the fragment id. This allows us to react to specific connections. Adjusted timeouts will follow in the next diff. ok? bluhm Index: net/pf_norm.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf_norm.c,v retrieving revision 1.206 diff -u -p -r1.206 pf_norm.c --- net/pf_norm.c 19 Jun 2017 17:58:49 -0000 1.206 +++ net/pf_norm.c 23 Jun 2017 12:25:38 -0000 @@ -76,29 +76,28 @@ struct pf_frent { u_int16_t fe_mff; /* more fragment flag */ }; -/* keep synced with struct pf_fragment, used in RB_FIND */ -struct pf_fragment_cmp { - struct pf_addr fr_src; - struct pf_addr fr_dst; - u_int32_t fr_id; - sa_family_t fr_af; - u_int8_t fr_proto; - u_int8_t fr_direction; +RB_HEAD(pf_frag_tree, pf_fragment); +struct pf_frnode { + struct pf_addr fn_src; /* ip source address */ + struct pf_addr fn_dst; /* ip destination address */ + sa_family_t fn_af; /* address family */ + u_int8_t fn_proto; /* protocol for fragments in fn_tree */ + u_int8_t fn_direction; /* pf packet direction */ + u_int32_t fn_fragments; /* number of entries in fn_tree */ + + RB_ENTRY(pf_frnode) fn_entry; + struct pf_frag_tree fn_tree; /* matching fragments, lookup by id */ }; struct pf_fragment { - struct pf_addr fr_src; /* ip source address */ - struct pf_addr fr_dst; /* ip destination address */ u_int32_t fr_id; /* fragment id for reassemble */ - sa_family_t fr_af; /* address family */ - u_int8_t fr_proto; /* protocol of this fragment */ - u_int8_t fr_direction; /* pf packet direction */ RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; 
TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; int32_t fr_timeout; u_int16_t fr_maxlen; /* maximum length of single fragment */ + struct pf_frnode *fr_node; /* ip src/dst/proto/af for fragments */ }; struct pf_fragment_tag { @@ -109,19 +108,23 @@ struct pf_fragment_tag { TAILQ_HEAD(pf_fragqueue, pf_fragment) pf_fragqueue; +static __inline int pf_frnode_compare(struct pf_frnode *, + struct pf_frnode *); +RB_HEAD(pf_frnode_tree, pf_frnode) pf_frnode_tree; +RB_PROTOTYPE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare); +RB_GENERATE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare); + static __inline int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); -RB_HEAD(pf_frag_tree, pf_fragment) pf_frag_tree, pf_cache_tree; RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); /* Private prototypes */ void pf_flush_fragments(void); void pf_free_fragment(struct pf_fragment *); -struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *, - struct pf_frag_tree *); +struct pf_fragment *pf_find_fragment(struct pf_frnode *, u_int32_t); struct pf_frent *pf_create_fragment(u_short *); -struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, +struct pf_fragment *pf_fillup_fragment(struct pf_frnode *, u_int32_t, struct pf_frent *, u_short *); int pf_isfull_fragment(struct pf_fragment *); struct mbuf *pf_join_fragment(struct pf_fragment *); @@ -132,7 +135,7 @@ int pf_reassemble6(struct mbuf **, st #endif /* INET6 */ /* Globals */ -struct pool pf_frent_pl, pf_frag_pl; +struct pool pf_frent_pl, pf_frag_pl, pf_frnode_pl; struct pool pf_state_scrub_pl; int pf_nfrents; @@ -153,6 +156,8 @@ pf_normalize_init(void) { pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, IPL_SOFTNET, 0, "pffrent", NULL); + pool_init(&pf_frnode_pl, sizeof(struct pf_frnode), 0, + IPL_SOFTNET, 0, "pffrnode", NULL); pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, IPL_SOFTNET, 0, "pffrag", NULL); 
pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, @@ -167,19 +172,28 @@ pf_normalize_init(void) } static __inline int -pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +pf_frnode_compare(struct pf_frnode *a, struct pf_frnode *b) { int diff; - if ((diff = a->fr_id - b->fr_id) != 0) + if ((diff = a->fn_proto - b->fn_proto) != 0) return (diff); - if ((diff = a->fr_proto - b->fr_proto) != 0) + if ((diff = a->fn_af - b->fn_af) != 0) return (diff); - if ((diff = a->fr_af - b->fr_af) != 0) + if ((diff = pf_addr_compare(&a->fn_src, &b->fn_src, a->fn_af)) != 0) return (diff); - if ((diff = pf_addr_compare(&a->fr_src, &b->fr_src, a->fr_af)) != 0) + if ((diff = pf_addr_compare(&a->fn_dst, &b->fn_dst, a->fn_af)) != 0) return (diff); - if ((diff = pf_addr_compare(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) + + return (0); +} + +static __inline int +pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +{ + int diff; + + if ((diff = a->fr_id - b->fr_id) != 0) return (diff); return (0); @@ -231,8 +245,16 @@ void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; + struct pf_frnode *frnode; - RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag); + frnode = frag->fr_node; + RB_REMOVE(pf_frag_tree, &frnode->fn_tree, frag); + KASSERT(frnode->fn_fragments >= 1); + frnode->fn_fragments--; + if (frnode->fn_fragments == 0) { + RB_REMOVE(pf_frnode_tree, &pf_frnode_tree, frnode); + pool_put(&pf_frnode_pl, frnode); + } TAILQ_REMOVE(&pf_fragqueue, frag, frag_next); /* Free all fragment entries */ @@ -246,15 +268,21 @@ pf_free_fragment(struct pf_fragment *fra } struct pf_fragment * -pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) +pf_find_fragment(struct pf_frnode *key, u_int32_t id) { - struct pf_fragment *frag; + struct pf_fragment *frag, idkey; + struct pf_frnode *frnode; - frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); - if (frag != NULL) { - TAILQ_REMOVE(&pf_fragqueue, frag, frag_next); - 
TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next); - } + frnode = RB_FIND(pf_frnode_tree, &pf_frnode_tree, key); + if (frnode == NULL) + return (NULL); + KASSERT(frnode->fn_fragments >= 1); + idkey.fr_id = id; + frag = RB_FIND(pf_frag_tree, &frnode->fn_tree, &idkey); + if (frag == NULL) + return (NULL); + TAILQ_REMOVE(&pf_fragqueue, frag, frag_next); + TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next); return (frag); } @@ -279,11 +307,12 @@ pf_create_fragment(u_short *reason) } struct pf_fragment * -pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, - u_short *reason) +pf_fillup_fragment(struct pf_frnode *key, u_int32_t id, + struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; + struct pf_frnode *frnode; u_int16_t total; /* No empty fragments */ @@ -306,12 +335,12 @@ pf_fillup_fragment(struct pf_fragment_cm goto bad_fragment; } - DPFPRINTF(LOG_INFO, key->fr_af == AF_INET ? + DPFPRINTF(LOG_INFO, key->fn_af == AF_INET ? 
"reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d", - key->fr_id, frent->fe_off, frent->fe_off + frent->fe_len); + id, frent->fe_off, frent->fe_off + frent->fe_len); /* Fully buffer all of the fragments in this fragment queue */ - frag = pf_find_fragment(key, &pf_frag_tree); + frag = pf_find_fragment(key, id); /* Create a new reassembly queue for this packet */ if (frag == NULL) { @@ -324,13 +353,32 @@ pf_fillup_fragment(struct pf_fragment_cm goto drop_fragment; } } - - *(struct pf_fragment_cmp *)frag = *key; + frnode = RB_FIND(pf_frnode_tree, &pf_frnode_tree, key); + if (frnode == NULL) { + frnode = pool_get(&pf_frnode_pl, PR_NOWAIT); + if (frnode == NULL) { + pf_flush_fragments(); + frnode = pool_get(&pf_frnode_pl, PR_NOWAIT); + if (frnode == NULL) { + REASON_SET(reason, PFRES_MEMORY); + pool_put(&pf_frag_pl, frag); + goto drop_fragment; + } + } + *frnode = *key; + RB_INIT(&frnode->fn_tree); + frnode->fn_fragments = 0; + } TAILQ_INIT(&frag->fr_queue); frag->fr_timeout = time_uptime; frag->fr_maxlen = frent->fe_len; - - RB_INSERT(pf_frag_tree, &pf_frag_tree, frag); + frag->fr_id = id; + frag->fr_node = frnode; + /* RB_INSERT cannot fail as pf_find_fragment() found nothing */ + RB_INSERT(pf_frag_tree, &frnode->fn_tree, frag); + frnode->fn_fragments++; + if (frnode->fn_fragments == 1) + RB_INSERT(pf_frnode_tree, &pf_frnode_tree, frnode); TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next); /* We do not have a previous fragment */ @@ -340,6 +388,7 @@ pf_fillup_fragment(struct pf_fragment_cm } KASSERT(!TAILQ_EMPTY(&frag->fr_queue)); + KASSERT(frag->fr_node); /* Remember maximum fragment len for refragmentation */ if (frent->fe_len > frag->fr_maxlen) @@ -377,7 +426,7 @@ pf_fillup_fragment(struct pf_fragment_cm u_int16_t precut; #ifdef INET6 - if (frag->fr_af == AF_INET6) + if (frag->fr_node->fn_af == AF_INET6) goto free_fragment; #endif /* INET6 */ @@ -397,7 +446,7 @@ pf_fillup_fragment(struct pf_fragment_cm u_int16_t aftercut; #ifdef INET6 - if (frag->fr_af == 
AF_INET6) + if (frag->fr_node->fn_af == AF_INET6) goto free_fragment; #endif /* INET6 */ @@ -428,7 +477,7 @@ pf_fillup_fragment(struct pf_fragment_cm free_ipv6_fragment: #ifdef INET6 - if (frag->fr_af == AF_INET) + if (frag->fr_node->fn_af == AF_INET) goto bad_fragment; free_fragment: /* @@ -530,7 +579,7 @@ pf_reassemble(struct mbuf **m0, int dir, struct ip *ip = mtod(m, struct ip *); struct pf_frent *frent; struct pf_fragment *frag; - struct pf_fragment_cmp key; + struct pf_frnode key; u_int16_t total, hdrlen; /* Get an entry for the fragment queue */ @@ -544,15 +593,15 @@ pf_reassemble(struct mbuf **m0, int dir, frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; frent->fe_mff = ntohs(ip->ip_off) & IP_MF; - key.fr_src.v4 = ip->ip_src; - key.fr_dst.v4 = ip->ip_dst; - key.fr_af = AF_INET; - key.fr_proto = ip->ip_p; - key.fr_id = ip->ip_id; - key.fr_direction = dir; + key.fn_src.v4 = ip->ip_src; + key.fn_dst.v4 = ip->ip_dst; + key.fn_af = AF_INET; + key.fn_proto = ip->ip_p; + key.fn_direction = dir; PF_FRAG_LOCK(); - if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { + if ((frag = pf_fillup_fragment(&key, ip->ip_id, frent, reason)) + == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } @@ -611,7 +660,7 @@ pf_reassemble6(struct mbuf **m0, struct struct pf_fragment_tag *ftag; struct pf_frent *frent; struct pf_fragment *frag; - struct pf_fragment_cmp key; + struct pf_frnode key; int off; u_int16_t total, maxlen; u_int8_t proto; @@ -627,16 +676,16 @@ pf_reassemble6(struct mbuf **m0, struct frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; - key.fr_src.v6 = ip6->ip6_src; - key.fr_dst.v6 = ip6->ip6_dst; - key.fr_af = AF_INET6; + key.fn_src.v6 = ip6->ip6_src; + key.fn_dst.v6 = ip6->ip6_dst; + key.fn_af = AF_INET6; /* Only the first fragment's protocol is relevant */ - key.fr_proto = 0; - key.fr_id = fraghdr->ip6f_ident; - key.fr_direction = dir; + key.fn_proto = 0; + key.fn_direction = dir; 
PF_FRAG_LOCK(); - if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { + if ((frag = pf_fillup_fragment(&key, fraghdr->ip6f_ident, frent, + reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); }