Ovs and kernel stack would add frag_queue to same netns_frags list.
As result, ovs and kernel may access the fraq_queue without correct
lock. Also the struct ipq may be different on kernel(older than 4.3),
which leads to invalid pointer access.

The fix creates specific netns_frags for ovs.

Signed-off-by: wangzhike <[email protected]>
---
datapath/datapath.c                                | 22 +++---
datapath/datapath.h                                |  6 ++
datapath/linux/compat/include/net/inet_frag.h      | 18 -----
datapath/linux/compat/include/net/ip.h             |  4 ++
.../include/net/netfilter/ipv6/nf_defrag_ipv6.h    |  4 ++
datapath/linux/compat/inet_fragment.c              | 83 ----------------------
datapath/linux/compat/ip_fragment.c                | 66 ++++++++++++++---
datapath/linux/compat/nf_conntrack_reasm.c         | 58 +++++++++++++--
8 files changed, 138 insertions(+), 123 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index c85029c..82cad74 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -2297,6 +2297,8 @@ static int __net_init ovs_init_net(struct net *net)
       INIT_LIST_HEAD(&ovs_net->dps);
       INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
       ovs_ct_init(net);
+       ovs_netns_frags_init(net);
+       ovs_netns_frags6_init(net);
       return 0;
}
@@ -2332,6 +2334,8 @@ static void __net_exit ovs_exit_net(struct net *dnet)
       struct net *net;
       LIST_HEAD(head);
+       ovs_netns_frags6_exit(dnet);
+       ovs_netns_frags_exit(dnet);
       ovs_ct_exit(dnet);
       ovs_lock();
       list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
@@ -2368,13 +2372,9 @@ static int __init dp_init(void)
        pr_info("Open vSwitch switching datapath %s\n", VERSION);
-        err = compat_init();
-        if (err)
-                 goto error;
-
       err = action_fifos_init();
       if (err)
-                 goto error_compat_exit;
+                goto error;
        err = ovs_internal_dev_rtnl_link_register();
       if (err)
@@ -2392,10 +2392,14 @@ static int __init dp_init(void)
       if (err)
                goto error_vport_exit;
-        err = register_netdevice_notifier(&ovs_dp_device_notifier);
+       err = compat_init();
       if (err)
                goto error_netns_exit;
+       err = register_netdevice_notifier(&ovs_dp_device_notifier);
+       if (err)
+                goto error_compat_exit;
+
       err = ovs_netdev_init();
       if (err)
                goto error_unreg_notifier;
@@ -2410,6 +2414,8 @@ error_unreg_netdev:
       ovs_netdev_exit();
error_unreg_notifier:
       unregister_netdevice_notifier(&ovs_dp_device_notifier);
+error_compat_exit:
+       compat_exit();
error_netns_exit:
       unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
@@ -2420,8 +2426,6 @@ error_unreg_rtnl_link:
       ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
       action_fifos_exit();
-error_compat_exit:
-        compat_exit();
error:
       return err;
}
@@ -2431,13 +2435,13 @@ static void dp_cleanup(void)
       dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
       ovs_netdev_exit();
       unregister_netdevice_notifier(&ovs_dp_device_notifier);
+       compat_exit();
       unregister_pernet_device(&ovs_net_ops);
       rcu_barrier();
       ovs_vport_exit();
       ovs_flow_exit();
       ovs_internal_dev_rtnl_link_unregister();
       action_fifos_exit();
-        compat_exit();
}
 module_init(dp_init);
diff --git a/datapath/datapath.h b/datapath/datapath.h
index b835ada..8849625 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -141,6 +141,12 @@ struct ovs_net {
        /* Module reference for configuring conntrack. */
       bool xt_label;
+
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct net *net;
+       struct netns_frags ipv4_frags;
+       struct netns_frags nf_frags;
+#endif
};
 extern unsigned int ovs_net_id;
diff --git a/datapath/linux/compat/include/net/inet_frag.h 
b/datapath/linux/compat/include/net/inet_frag.h
index 01d79ad..34078c8 100644
--- a/datapath/linux/compat/include/net/inet_frag.h
+++ b/datapath/linux/compat/include/net/inet_frag.h
@@ -52,22 +52,4 @@ static inline int rpl_inet_frags_init(struct inet_frags 
*frags)
#define inet_frags_init rpl_inet_frags_init
#endif
-#ifndef HAVE_CORRECT_MRU_HANDLING
-/* We reuse the upstream inet_fragment.c common code for managing fragment
- * stores, However we actually store the fragments within our own 'inet_frags'
- * structures (in {ip_fragment,nf_conntrack_reasm}.c). When unloading the OVS
- * kernel module, we need to flush all of the remaining fragments from these
- * caches, or else we will panic with the following sequence of events:
- *
- * 1) A fragment for a packet arrives and is cached in inet_frags. This
- *    starts a timer to ensure the fragment does not hang around forever.
- * 2) openvswitch module is unloaded.
- * 3) The timer for the fragment fires, calling into backported OVS code
- *    to free the fragment.
- * 4) BUG: unable to handle kernel paging request at ffffffffc03c01e0
- */
-void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
-#define inet_frags_exit_net rpl_inet_frags_exit_net
-#endif
-
#endif /* inet_frag.h */
diff --git a/datapath/linux/compat/include/net/ip.h 
b/datapath/linux/compat/include/net/ip.h
index b188996..ad5ac33 100644
--- a/datapath/linux/compat/include/net/ip.h
+++ b/datapath/linux/compat/include/net/ip.h
@@ -97,6 +97,8 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 
user);
#define ip_defrag rpl_ip_defrag
int __init rpl_ipfrag_init(void);
void rpl_ipfrag_fini(void);
+void ovs_netns_frags_init(struct net *net);
+void ovs_netns_frags_exit(struct net *net);
 #else /* HAVE_CORRECT_MRU_HANDLING */
@@ -131,6 +133,8 @@ static inline int rpl_ip_defrag(struct net *net, struct 
sk_buff *skb, u32 user)
  * compat_{in,ex}it() can be no-ops. */
static inline int rpl_ipfrag_init(void) { return 0; }
static inline void rpl_ipfrag_fini(void) { }
+static inline void ovs_netns_frags_init(struct net *net) { }
+static inline void ovs_netns_frags_exit(struct net *net) { }
#endif /* HAVE_CORRECT_MRU_HANDLING */
 #define ipfrag_init rpl_ipfrag_init
diff --git a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h 
b/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 2ab6c0a..c4c0f79 100644
--- a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -28,9 +28,13 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff 
*skb, u32 user);
  */
int __init rpl_nf_ct_frag6_init(void);
void rpl_nf_ct_frag6_cleanup(void);
+void ovs_netns_frags6_init(struct net *net);
+void ovs_netns_frags6_exit(struct net *net);
#else /* !OVS_NF_DEFRAG6_BACKPORT */
static inline int __init rpl_nf_ct_frag6_init(void) { return 0; }
static inline void rpl_nf_ct_frag6_cleanup(void) { }
+static inline void ovs_netns_frags6_init(struct net *net) { }
+static inline void ovs_netns_frags6_exit(struct net *net) { }
#endif /* OVS_NF_DEFRAG6_BACKPORT */
#define nf_ct_frag6_init rpl_nf_ct_frag6_init
#define nf_ct_frag6_cleanup rpl_nf_ct_frag6_cleanup
diff --git a/datapath/linux/compat/inet_fragment.c 
b/datapath/linux/compat/inet_fragment.c
index f05e617..21736e6 100644
--- a/datapath/linux/compat/inet_fragment.c
+++ b/datapath/linux/compat/inet_fragment.c
@@ -27,88 +27,5 @@
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
-{
-        return q->net->low_thresh == 0 ||
-               frag_mem_limit(q->net) >= q->net->low_thresh;
-}
-
-static unsigned int
-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
-{
-        struct inet_frag_queue *fq;
-        struct hlist_node *n;
-        unsigned int evicted = 0;
-        HLIST_HEAD(expired);
-
-        spin_lock(&hb->chain_lock);
-
-        hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
-                 if (!inet_fragq_should_evict(fq))
-                           continue;
-
-                 if (!del_timer(&fq->timer))
-                           continue;
-
-#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-                 hlist_add_head(&fq->list_evictor, &expired);
-#else
-                 hlist_del(&fq->list);
-                 hlist_add_head(&fq->list, &expired);
-#endif
-                 ++evicted;
-        }
-
-        spin_unlock(&hb->chain_lock);
-
-#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-        hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-#else
-        hlist_for_each_entry_safe(fq, n, &expired, list)
-#endif
-                 f->frag_expire((unsigned long) fq);
-
-        return evicted;
-}
-
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-        int thresh = nf->low_thresh;
-        unsigned int seq;
-        int i;
-
-        nf->low_thresh = 0;
-
-evict_again:
-        local_bh_disable();
-        seq = read_seqbegin(&f->rnd_seqlock);
-
-        for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-                 inet_evict_bucket(f, &f->hash[i]);
-
-        local_bh_enable();
-        cond_resched();
-
-        if (read_seqretry(&f->rnd_seqlock, seq) ||
-            percpu_counter_sum(&nf->mem))
-                 goto evict_again;
-
-        nf->low_thresh = thresh;
-}
-#else /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-        int thresh = nf->low_thresh;
-
-        nf->low_thresh = 0;
-
-        local_bh_disable();
-        inet_frag_evictor(nf, f, true);
-        local_bh_enable();
-
-        nf->low_thresh = thresh;
-}
-#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
diff --git a/datapath/linux/compat/ip_fragment.c 
b/datapath/linux/compat/ip_fragment.c
index 47b51b5..8f2012b 100644
--- a/datapath/linux/compat/ip_fragment.c
+++ b/datapath/linux/compat/ip_fragment.c
@@ -57,6 +57,8 @@
#include <net/inet_ecn.h>
#include <net/vrf.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netns/generic.h>
+#include "datapath.h"
 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -107,6 +109,51 @@ struct ip4_create_arg {
       int vif;
};
+static struct netns_frags *get_netns_frags_from_net(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+       return &(ovs_net->ipv4_frags);
+#else
+       return &(net->ipv4.frags);
+#endif
+}
+
+static struct net *get_net_from_netns_frags(struct netns_frags *frags)
+{
+       struct net *net;
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net;
+
+       ovs_net = container_of(frags, struct ovs_net, ipv4_frags);
+       net = ovs_net->net;
+#else
+       net = container_of(frags, struct net, ipv4.frags);
+#endif
+       return net;
+}
+
+void ovs_netns_frags_init(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+       ovs_net->ipv4_frags.high_thresh = 4 * 1024 * 1024;
+       ovs_net->ipv4_frags.low_thresh = 3 * 1024 * 1024;
+       ovs_net->ipv4_frags.timeout = IP_FRAG_TIME;
+       inet_frags_init_net(&(ovs_net->ipv4_frags));
+       ovs_net->net = net;
+#endif
+}
+
+void ovs_netns_frags_exit(struct net *net)
+{
+       struct netns_frags *frags;
+
+       frags = get_netns_frags_from_net(net);
+       inet_frags_exit_net(frags, &ip4_frags);
+}
+
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
       net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
@@ -158,9 +205,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void 
*a)
#endif
{
       struct ipq *qp = container_of(q, struct ipq, q);
-        struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
-                                                    frags);
-        struct net *net = container_of(ipv4, struct net, ipv4);
+       struct net *net = get_net_from_netns_frags(q->net);
        const struct ip4_create_arg *arg = a;
@@ -219,7 +264,7 @@ static void ip_expire(unsigned long arg)
       struct net *net;
        qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
-        net = container_of(qp->q.net, struct net, ipv4.frags);
+       net = get_net_from_netns_frags(qp->q.net);
        spin_lock(&qp->q.lock);
@@ -278,8 +323,10 @@ out:
static void ip_evictor(struct net *net)
{
       int evicted;
+       struct netns_frags *frags;
-        evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
+       frags = get_netns_frags_from_net(net);
+       evicted = inet_frag_evictor(frags, &ip4_frags, false);
       if (evicted)
                IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
}
@@ -294,6 +341,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr 
*iph,
       struct inet_frag_queue *q;
       struct ip4_create_arg arg;
       unsigned int hash;
+       struct netns_frags *frags;
        arg.iph = iph;
       arg.user = user;
@@ -304,7 +352,8 @@ static struct ipq *ip_find(struct net *net, struct iphdr 
*iph,
#endif
       hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
-        q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+       frags = get_netns_frags_from_net(net);
+       q = inet_frag_find(frags, &ip4_frags, &arg, hash);
       if (IS_ERR_OR_NULL(q)) {
                inet_frag_maybe_warn_overflow(q, pr_fmt());
                return NULL;
@@ -333,7 +382,7 @@ static int ip_frag_too_far(struct ipq *qp)
       if (rc) {
                struct net *net;
-                 net = container_of(qp->q.net, struct net, ipv4.frags);
+                net = get_net_from_netns_frags(qp->q.net);
                IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
       }
@@ -566,7 +615,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                           struct net_device *dev)
{
-        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+       struct net *net = get_net_from_netns_frags(qp->q.net);
       struct iphdr *iph;
       struct sk_buff *fp, *head = qp->q.fragments;
       int len;
@@ -738,7 +787,6 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 static void __net_exit ipv4_frags_exit_net(struct net *net)
{
-        inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}
 static struct pernet_operations ip4_frags_ops = {
diff --git a/datapath/linux/compat/nf_conntrack_reasm.c 
b/datapath/linux/compat/nf_conntrack_reasm.c
index 0da9463..ea153c3 100644
--- a/datapath/linux/compat/nf_conntrack_reasm.c
+++ b/datapath/linux/compat/nf_conntrack_reasm.c
@@ -53,6 +53,8 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netns/generic.h>
+#include "datapath.h"
 #ifdef OVS_NF_DEFRAG6_BACKPORT
@@ -68,6 +70,30 @@ struct nf_ct_frag6_skb_cb
 static struct inet_frags nf_frags;
+static struct netns_frags *get_netns_frags6_from_net(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+       return &(ovs_net->nf_frags);
+#else
+       return &(net->nf_frag.frags);
+#endif
+}
+
+static struct net *get_net_from_netns_frags6(struct netns_frags *frags)
+{
+       struct net *net;
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net;
+
+       ovs_net = container_of(frags, struct ovs_net, nf_frags);
+       net = ovs_net->net;
+#else
+       net = container_of(frags, struct net, nf_frag.frags);
+#endif
+       return net;
+}
+
static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
       return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
@@ -105,7 +131,7 @@ static void nf_ct_frag6_expire(unsigned long data)
       struct net *net;
        fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
-        net = container_of(fq->q.net, struct net, nf_frag.frags);
+       net = get_net_from_netns_frags6(fq->q.net);
        ip6_expire_frag_queue(net, fq, &nf_frags);
}
@@ -118,6 +144,7 @@ static inline struct frag_queue *fq_find(struct net *net, 
__be32 id,
       struct inet_frag_queue *q;
       struct ip6_create_arg arg;
       unsigned int hash;
+       struct netns_frags *frags;
        arg.id = id;
       arg.user = user;
@@ -132,7 +159,8 @@ static inline struct frag_queue *fq_find(struct net *net, 
__be32 id,
#endif
       hash = nf_hash_frag(id, src, dst);
-        q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
+       frags = get_netns_frags6_from_net(net);
+       q = inet_frag_find(frags, &nf_frags, &arg, hash);
       local_bh_enable();
       if (IS_ERR_OR_NULL(q)) {
                inet_frag_maybe_warn_overflow(q, pr_fmt());
@@ -506,6 +534,7 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff 
*skb, u32 user)
       struct frag_queue *fq;
       struct ipv6hdr *hdr;
       u8 prevhdr;
+       struct netns_frags *frags;
        /* Jumbo payload inhibits frag. header */
       if (ipv6_hdr(skb)->payload_len == 0) {
@@ -524,9 +553,10 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff 
*skb, u32 user)
       fhdr = (struct frag_hdr *)skb_transport_header(skb);
 /* See ip_evictor(). */
+       frags = get_netns_frags6_from_net(net);
#ifdef HAVE_INET_FRAG_EVICTOR
       local_bh_disable();
-        inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
+       inet_frag_evictor(frags, &nf_frags, false);
       local_bh_enable();
#endif
@@ -567,7 +597,27 @@ static int nf_ct_net_init(struct net *net)
 static void nf_ct_net_exit(struct net *net)
{
-        inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+}
+
+void ovs_netns_frags6_init(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+       ovs_net->nf_frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       ovs_net->nf_frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       ovs_net->nf_frags.timeout = IPV6_FRAG_TIMEOUT;
+
+       inet_frags_init_net(&(ovs_net->nf_frags));
+#endif
+}
+
+void ovs_netns_frags6_exit(struct net *net)
+{
+       struct netns_frags *frags;
+
+       frags = get_netns_frags6_from_net(net);
+       inet_frags_exit_net(frags, &nf_frags);
}
 static struct pernet_operations nf_ct_net_ops = {
--
1.8.3.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to