This patch backports several critical bug fixes related to locking and data consistency in nf_conncount code.
This backport is based on the following upstream net-next upstream commits. 4cd273b ("netfilter: nf_conncount: don't skip eviction when age is negative") d4e7df1 ("netfilter: nf_conncount: use rb_link_node_rcu() instead of rb_link_node()") 53ca0f2 ("netfilter: nf_conncount: remove wrong condition check routine") 3c5cdb1 ("netfilter: nf_conncount: fix unexpected permanent node of list.") 31568ec ("netfilter: nf_conncount: fix list_del corruption in conn_free") fd3e71a ("netfilter: nf_conncount: use spin_lock_bh instead of spin_lock") This patch also added additional compat code so that it can build on all supported kernel versions. Travis tests are at https://travis-ci.org/yifsun/ovs-travis/builds/568603796 VMware-BZ: #2396471 CC: Taehee Yoo <ap420...@gmail.com> Signed-off-by: Yifeng Sun <pkusunyif...@gmail.com> --- v1->v2: Add fixes to support old kernel versions. Thanks YiHung for reviewing. acinclude.m4 | 2 ++ datapath/linux/Modules.mk | 3 +- datapath/linux/compat/include/linux/rbtree.h | 19 ++++++++++++ datapath/linux/compat/nf_conncount.c | 46 ++++++++++++++++++---------- 4 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 datapath/linux/compat/include/linux/rbtree.h diff --git a/acinclude.m4 b/acinclude.m4 index 116ffcf9096d..f8e856d3303f 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -1012,6 +1012,8 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ [OVS_DEFINE([HAVE_GRE_CALC_HLEN])]) OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [ip_gre_calc_hlen], [OVS_DEFINE([HAVE_IP_GRE_CALC_HLEN])]) + OVS_GREP_IFELSE([$KSRC/include/linux/rbtree.h], [rb_link_node_rcu], + [OVS_DEFINE([HAVE_RBTREE_RB_LINK_NODE_RCU])]) if cmp -s datapath/linux/kcompat.h.new \ datapath/linux/kcompat.h >/dev/null 2>&1; then diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index cbb29f1c69d0..69d7faeac414 100644 --- a/datapath/linux/Modules.mk +++ b/datapath/linux/Modules.mk @@ -116,5 +116,6 @@ openvswitch_headers += \ linux/compat/include/uapi/linux/netfilter.h \ linux/compat/include/linux/mm.h \ linux/compat/include/linux/netfilter.h \ - linux/compat/include/linux/overflow.h + linux/compat/include/linux/overflow.h \ + linux/compat/include/linux/rbtree.h EXTRA_DIST += linux/compat/build-aux/export-check-whitelist diff --git a/datapath/linux/compat/include/linux/rbtree.h b/datapath/linux/compat/include/linux/rbtree.h new file mode 100644 index 000000000000..dbf20ff0e0b8 --- /dev/null +++ b/datapath/linux/compat/include/linux/rbtree.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_RBTREE_WRAPPER_H +#define __LINUX_RBTREE_WRAPPER_H 1 + +#include_next <linux/rbtree.h> + +#ifndef HAVE_RBTREE_RB_LINK_NODE_RCU +#include <linux/rcupdate.h> + +static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + node->__rb_parent_color = (unsigned long)parent; + node->rb_left = node->rb_right = NULL; + + rcu_assign_pointer(*rb_link, node); +} +#endif + +#endif /* __LINUX_RBTREE_WRAPPER_H */ diff --git a/datapath/linux/compat/nf_conncount.c b/datapath/linux/compat/nf_conncount.c index eeae440f872d..6a4d058e7fac 100644 --- a/datapath/linux/compat/nf_conncount.c +++ b/datapath/linux/compat/nf_conncount.c @@ -54,6 +54,7 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + bool dead; struct rcu_head rcu_head; }; @@ -111,15 +112,16 @@ nf_conncount_add(struct nf_conncount_list *list, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - spin_lock(&list->list_lock); + conn->dead = false; + spin_lock_bh(&list->list_lock); if (list->dead == true) { kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_SKIP; } list_add_tail(&conn->node, &list->head); list->count++; - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_ADDED; } @@ -136,19 +138,22 @@ static bool conn_free(struct nf_conncount_list *list, { bool free_entry = false; - spin_lock(&list->list_lock); + spin_lock_bh(&list->list_lock); - if (list->count == 0) { - spin_unlock(&list->list_lock); + if (conn->dead) { + spin_unlock_bh(&list->list_lock); return free_entry; } list->count--; + conn->dead = true; list_del_rcu(&conn->node); - if (list->count == 0) + if (list->count == 0) { + list->dead = true; free_entry = true; + } - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); call_rcu(&conn->rcu_head, __conn_free); return free_entry; } @@ -160,7 +165,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); - __s32 age; + u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) @@ -248,7 +253,7 @@ static void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); - list->count = 1; + list->count = 0; list->dead = false; } @@ -261,6 +266,7 @@ static bool nf_conncount_gc_list(struct net *net, struct nf_conn *found_ct; unsigned int collected = 0; bool free_entry = false; + bool ret = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); @@ -290,7 +296,15 @@ static bool nf_conncount_gc_list(struct net *net, if (collected > CONNCOUNT_GC_MAX_NODES) return false; } - return false; + + spin_lock_bh(&list->list_lock); + if (!list->count) { + list->dead = true; + ret = true; + } + spin_unlock_bh(&list->list_lock); + + return ret; } static void __tree_nodes_free(struct rcu_head *h) @@ -310,11 +324,8 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - if (rbconn->list.count == 0 && rbconn->list.dead == false) { - rbconn->list.dead = true; - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); - } + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); spin_unlock(&rbconn->list.list_lock); } } @@ -415,8 +426,9 @@ insert_tree(struct net *net, nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; + rbconn->list.count = count; - rb_link_node(&rbconn->node, parent, rbnode); + rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); -- 2.7.4 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev