On Tue, Jul 12, 2016 at 3:26 PM, Joe Stringer <[email protected]> wrote:
> The core fragmentation handling logic is exported on all supported
> kernels, so it's not necessary to backport the latest version of this.
> This greatly simplifies the code due to inconsistencies between the old
> per-lookup garbage collection and the newer workqueue based garbage
> collection.
>
> As a result of simplifying and removing unnecessary backport code, a few
> bugs are fixed for corner cases such as when some fragments remain in
> the fragment cache when openvswitch is unloaded.
>
> Some backported ip functions need a little extra logic than what is seen
> on the latest code due to this, for instance on kernels <3.17:
> * Call inet_frag_evictor() before defrag
> * Limit hashsize in ip{,6}_fragment logic
>
> The pernet init/exit logic also differs a little from upstream. Upstream
> ipv[46]_defrag logic initializes the various pernet fragment parameters
> and its own global fragments cache. In the OVS backport, the pernet
> parameters are shared while the fragments cache is separate. The
> backport relies upon upstream pernet initialization to perform the
> shared setup, and performs no pernet initialization of its own. When it
> comes to pernet exit however, the backport must ensure that all
> OVS-specific fragment state is cleared, while the shared state remains
> untouched so that the regular ipv[46] logic may do its own cleanup. In
> practice this means that OVS must have its own divergent implementation
> of inet_frags_exit_net().
>
> Fixes the following crash:
>
> Call Trace:
> <IRQ>
> [<ffffffff810744f6>] ? call_timer_fn+0x36/0x100
> [<ffffffff8107548f>] run_timer_softirq+0x1ef/0x2f0
> [<ffffffff8106cccc>] __do_softirq+0xec/0x2c0
> [<ffffffff8106d215>] irq_exit+0x105/0x110
> [<ffffffff81737095>] smp_apic_timer_interrupt+0x45/0x60
> [<ffffffff81735a1d>] apic_timer_interrupt+0x6d/0x80
> <EOI>
> [<ffffffff8104f596>] ? native_safe_halt+0x6/0x10
> [<ffffffff8101cb2f>] default_idle+0x1f/0xc0
> [<ffffffff8101d406>] arch_cpu_idle+0x26/0x30
> [<ffffffff810bf3a5>] cpu_startup_entry+0xc5/0x290
> [<ffffffff810415ed>] start_secondary+0x21d/0x2d0
> Code: Bad RIP value.
> RIP [<ffffffffa0177480>] 0xffffffffa0177480
> RSP <ffff88003f703e78>
> CR2: ffffffffa0177480
> ---[ end trace eb98ca80ba07bd9c ]---
> Kernel panic - not syncing: Fatal exception in interrupt
>
> Signed-off-by: Joe Stringer <[email protected]>
> ---
> I've tested this on CentOS kernel 3.10.0-327 and Ubuntu kernels
> 3.13.0-68, 3.16.0-70, 3.19.0-58, and 4.2.0-35. Some earlier kernel
> versions may still trigger fragmentation-related crashes due to upstream
> bugs, for instance on Ubuntu's 3.13.0-24.
> ---
> acinclude.m4 | 1 +
> datapath/linux/compat/include/net/inet_frag.h | 58 ++-
> datapath/linux/compat/inet_fragment.c | 486
> ++------------------------
> datapath/linux/compat/ip_fragment.c | 36 +-
> datapath/linux/compat/nf_conntrack_reasm.c | 17 +
> 5 files changed, 91 insertions(+), 507 deletions(-)
>
...
> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> - const char *prefix)
> -{
> - static const char msg[] = "inet_frag_find: Fragment hash bucket"
> - " list length grew over limit "
> __stringify(INETFRAGS_MAXDEPTH)
> - ". Dropping fragment.\n";
> + local_bh_disable();
> + inet_frag_evictor(nf, f, true);
> + local_bh_enable();
>
> - if (PTR_ERR(q) == -ENOBUFS)
> - net_dbg_ratelimited("%s%s", prefix, msg);
> + nf->low_thresh = thresh;
> }
Upstream inet_frags_exit_net() and back ported looks the same, we can
just use the exported symbol.
> +#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
>
> #endif /* !HAVE_CORRECT_MRU_HANDLING */
> diff --git a/datapath/linux/compat/ip_fragment.c
> b/datapath/linux/compat/ip_fragment.c
> index 8d01088abc0a..64e2cf23c327 100644
> --- a/datapath/linux/compat/ip_fragment.c
> +++ b/datapath/linux/compat/ip_fragment.c
> @@ -76,12 +76,7 @@ struct ipfrag_skb_cb
>
> /* Describe an entry in the "incomplete datagrams" queue. */
> struct ipq {
> - union {
> - struct inet_frag_queue q;
> -#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
> - struct ovs_inet_frag_queue oq;
> -#endif
> - };
> + struct inet_frag_queue q;
>
> u32 user;
> __be32 saddr;
> @@ -119,6 +114,12 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr,
> __be32 daddr, u8 prot)
> (__force u32)saddr, (__force u32)daddr,
> ip4_frags.rnd);
> }
> +/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
> + * shifted this logic into inet_fragment, but prior kernels still need this.
> + */
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
> +#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ -
> 1))
> +#endif
>
> #ifdef HAVE_INET_FRAGS_CONST
> static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
> @@ -267,6 +268,23 @@ out:
> ipq_put(qp);
> }
>
> +/* Memory limiting on fragments. Evictor trashes the oldest
> + * fragment queue until we are back under the threshold.
> + *
> + * Necessary for kernels earlier than v3.17. Replaced in commit
> + * b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue").
> + */
> +static void ip_evictor(struct net *net)
> +{
> +#ifdef HAVE_INET_FRAG_EVICTOR
> + int evicted;
> +
> + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
> + if (evicted)
> + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
> +#endif
> +}
> +
> /* Find the correct entry in the "incomplete datagrams" queue for
> * this IP datagram, and create new one, if nothing is found.
> */
> @@ -281,6 +299,11 @@ static struct ipq *ip_find(struct net *net, struct iphdr
> *iph,
> arg.user = user;
> arg.vif = vif;
>
> + ip_evictor(net);
> +
IP fragment eviction can be done after this lookup. So that there is
better chance of finding the fragment.
> +#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
> + read_lock(&ip4_frags.lock);
> +#endif
> hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
>
> q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
> @@ -701,7 +724,6 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb,
> u32 user)
> kfree_skb(skb);
> return -ENOMEM;
> }
> -EXPORT_SYMBOL_GPL(rpl_ip_defrag);
>
_______________________________________________
dev mailing list
[email protected]
http://openvswitch.org/mailman/listinfo/dev