From: Florian Westphal <f...@strlen.de>

Add garbage collection worker.

Move the rbnode reclaim logic from the tree search path (count_tree) to
the tree insert path (insert_tree) to avoid a race condition.

Signed-off-by: Florian Westphal <f...@strlen.de>
Signed-off-by: Yi-Hung Wei <yihung....@gmail.com>
---
 include/net/netfilter/nf_conntrack_count.h |  14 ++-
 net/netfilter/nf_conncount.c               | 163 +++++++++++++++++++++--------
 net/netfilter/nft_connlimit.c              |   2 +-
 3 files changed, 132 insertions(+), 47 deletions(-)
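
Note for reviewers (not part of the commit message): below is a condensed
sketch of the deferred-gc pattern this patch wires up. The packet path only
marks a tree slot as pending and kicks the worker; the worker walks one slot
at a time and requeues itself while other slots remain pending. All demo_*
names are made up for illustration and the per-tree locking around the walk
is omitted; see schedule_gc_worker() and tree_gc_worker() in the diff for
the real code.

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

#define DEMO_SLOTS	256U

struct demo_gc {
	struct work_struct gc_work;
	/* one bit per tree slot that still needs a gc pass */
	unsigned long pending[BITS_TO_LONGS(DEMO_SLOTS)];
	unsigned int gc_slot;
};

/* packet path: O(1), the expensive tree walk is deferred to process context */
static void demo_schedule_gc(struct demo_gc *gc, unsigned int slot)
{
	set_bit(slot, gc->pending);
	schedule_work(&gc->gc_work);
}

static void demo_gc_worker(struct work_struct *work)
{
	struct demo_gc *gc = container_of(work, struct demo_gc, gc_work);
	unsigned int slot = gc->gc_slot % DEMO_SLOTS;
	unsigned int next;

	/* ... walk tree 'slot' and free nodes whose list became empty ... */

	clear_bit(slot, gc->pending);

	/* pick the next pending slot, if any, and requeue ourselves */
	next = find_next_bit(gc->pending, DEMO_SLOTS, (slot + 1) % DEMO_SLOTS);
	if (next < DEMO_SLOTS) {
		gc->gc_slot = next;
		schedule_work(work);
	}
}

INIT_WORK() at setup and cancel_work_sync() at teardown complete the
lifecycle, mirroring what the patch does in nf_conncount_init() and
nf_conncount_destroy().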

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index af0c1c95eccd..4b2b2baf8ab4 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -5,10 +5,17 @@
 
 struct nf_conncount_data;
 
+enum nf_conncount_list_add {
+       NF_CONNCOUNT_ADDED,     /* list add was ok */
+       NF_CONNCOUNT_ERR,       /* -ENOMEM, must drop skb */
+       NF_CONNCOUNT_SKIP,      /* list is already reclaimed by gc */
+};
+
 struct nf_conncount_list {
        spinlock_t list_lock;
        struct list_head head;  /* connections with the same filtering key */
        unsigned int count;     /* length of list */
+       bool dead;
 };
 
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
@@ -29,9 +36,10 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
 
 void nf_conncount_list_init(struct nf_conncount_list *list);
 
-bool nf_conncount_add(struct nf_conncount_list *list,
-                     const struct nf_conntrack_tuple *tuple,
-                     const struct nf_conntrack_zone *zone);
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+                const struct nf_conntrack_tuple *tuple,
+                const struct nf_conntrack_zone *zone);
 
 bool nf_conncount_gc_list(struct net *net,
                          struct nf_conncount_list *list);
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 760ba24db735..7e93487e5856 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -62,6 +62,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp
 struct nf_conncount_data {
        unsigned int keylen;
        struct rb_root root[CONNCOUNT_SLOTS];
+       struct net *net;
+       struct work_struct gc_work;
+       unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
+       unsigned int gc_tree;
 };
 
 static u_int32_t conncount_rnd __read_mostly;
@@ -82,25 +86,32 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
        return memcmp(a, b, klen * sizeof(u32));
 }
 
-bool nf_conncount_add(struct nf_conncount_list *list,
-                     const struct nf_conntrack_tuple *tuple,
-                     const struct nf_conntrack_zone *zone)
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+                const struct nf_conntrack_tuple *tuple,
+                const struct nf_conntrack_zone *zone)
 {
        struct nf_conncount_tuple *conn;
 
        if (WARN_ON_ONCE(list->count > INT_MAX))
-               return false;
+               return NF_CONNCOUNT_ERR;
 
        conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
        if (conn == NULL)
-               return false;
+               return NF_CONNCOUNT_ERR;
+
        conn->tuple = *tuple;
        conn->zone = *zone;
        spin_lock(&list->list_lock);
+       if (list->dead == true) {
+               kmem_cache_free(conncount_conn_cachep, conn);
+               spin_unlock(&list->list_lock);
+               return NF_CONNCOUNT_SKIP;
+       }
        list_add_tail(&conn->node, &list->head);
        list->count++;
        spin_unlock(&list->list_lock);
-       return true;
+       return NF_CONNCOUNT_ADDED;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_add);
 
@@ -192,6 +203,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
        spin_lock_init(&list->list_lock);
        INIT_LIST_HEAD(&list->head);
        list->count = 1;
+       list->dead = false;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_list_init);
 
@@ -250,23 +262,39 @@ static void tree_nodes_free(struct rb_root *root,
 
        while (gc_count) {
                rbconn = gc_nodes[--gc_count];
-               rb_erase(&rbconn->node, root);
-               call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+               spin_lock(&rbconn->list.list_lock);
+               if (rbconn->list.count == 0 && rbconn->list.dead == false) {
+                       rbconn->list.dead = true;
+                       rb_erase(&rbconn->node, root);
+                       call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+               }
+               spin_unlock(&rbconn->list.list_lock);
        }
 }
 
+static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
+{
+       set_bit(tree, data->pending_trees);
+       schedule_work(&data->gc_work);
+}
+
 static unsigned int
-insert_tree(struct rb_root *root,
+insert_tree(struct net *net,
+           struct nf_conncount_data *data,
+           struct rb_root *root,
            unsigned int hash,
            const u32 *key,
            u8 keylen,
            const struct nf_conntrack_tuple *tuple,
            const struct nf_conntrack_zone *zone)
 {
+       enum nf_conncount_list_add ret;
+       struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
        struct rb_node **rbnode, *parent;
        struct nf_conncount_rb *rbconn;
        struct nf_conncount_tuple *conn;
-       unsigned int count = 0;
+       unsigned int count = 0, gc_count = 0;
+       bool node_found = false;
 
        spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
@@ -284,16 +312,44 @@ insert_tree(struct rb_root *root,
                        rbnode = &((*rbnode)->rb_right);
                } else {
                        /* unlikely: other cpu added node already */
-                       if (!nf_conncount_add(&rbconn->list, tuple, zone)) {
+                       node_found = true;
+                       ret = nf_conncount_add(&rbconn->list, tuple, zone);
+                       if (ret == NF_CONNCOUNT_ERR) {
                                count = 0; /* hotdrop */
-                               goto out_unlock;
+                       } else if (ret == NF_CONNCOUNT_ADDED) {
+                               count = rbconn->list.count;
+                       } else {
+                               /* NF_CONNCOUNT_SKIP, rbconn is already
+                                * reclaimed by gc, insert a new tree node
+                                */
+                               node_found = false;
                        }
-
-                       count = rbconn->list.count;
-                       goto out_unlock;
+                       break;
                }
+
+               if (gc_count >= ARRAY_SIZE(gc_nodes))
+                       continue;
+
+               if (nf_conncount_gc_list(net, &rbconn->list))
+                       gc_nodes[gc_count++] = rbconn;
        }
 
+       if (gc_count) {
+               tree_nodes_free(root, gc_nodes, gc_count);
+               /* tree_node_free before new allocation permits
+                * allocator to re-use newly free'd object.
+                *
+                * This is a rare event; in most cases we will find
+                * existing node to re-use. (or gc_count is 0).
+                */
+
+               if (gc_count >= ARRAY_SIZE(gc_nodes))
+                       schedule_gc_worker(data, hash);
+       }
+
+       if (node_found)
+               goto out_unlock;
+
        /* expected case: match, insert new node */
        rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
        if (rbconn == NULL)
@@ -327,19 +383,16 @@ count_tree(struct net *net,
           const struct nf_conntrack_tuple *tuple,
           const struct nf_conntrack_zone *zone)
 {
-       struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+       enum nf_conncount_list_add ret;
        struct rb_root *root;
        struct rb_node *parent;
        struct nf_conncount_rb *rbconn;
-       unsigned int gc_count, hash;
-       bool no_gc = false;
+       unsigned int hash;
        u8 keylen = data->keylen;
 
        hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
        root = &data->root[hash];
 
- restart:
-       gc_count = 0;
        parent = rcu_dereference_raw(root->rb_node);
        while (parent) {
                int diff;
@@ -357,44 +410,65 @@ count_tree(struct net *net,
                        nf_conncount_lookup(net, &rbconn->list, tuple, zone,
                                            &addit);
 
-                       spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-                       tree_nodes_free(root, gc_nodes, gc_count);
-                       spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-
                        if (!addit)
                                return rbconn->list.count;
 
-                       if (!nf_conncount_add(&rbconn->list, tuple, zone))
+                       ret = nf_conncount_add(&rbconn->list, tuple, zone);
+                       if (ret == NF_CONNCOUNT_ERR) {
                                return 0; /* hotdrop */
-
-                       return rbconn->list.count;
+                       } else if (ret == NF_CONNCOUNT_ADDED) {
+                               return rbconn->list.count;
+                       } else {
+                               /* NF_CONNCOUNT_SKIP, rbconn is already
+                                * reclaimed by gc, insert a new tree node
+                                */
+                               break;
+                       }
                }
+       }
 
-               if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
-                       continue;
+       if (!tuple)
+               return 0;
 
-               if (nf_conncount_gc_list(net, &rbconn->list))
+       return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
+}
+
+static void tree_gc_worker(struct work_struct *work)
+{
+       struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
+       struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+       struct rb_root *root;
+       struct rb_node *node;
+       unsigned int tree, next_tree, gc_count = 0;
+
+       tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
+       root = &data->root[tree];
+
+       rcu_read_lock();
+       for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+               rbconn = rb_entry(node, struct nf_conncount_rb, node);
+               if (nf_conncount_gc_list(data->net, &rbconn->list))
                        gc_nodes[gc_count++] = rbconn;
        }
+       rcu_read_unlock();
+
+       spin_lock_bh(&nf_conncount_locks[tree]);
 
        if (gc_count) {
-               no_gc = true;
-               spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
                tree_nodes_free(root, gc_nodes, gc_count);
-               spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-               /* tree_node_free before new allocation permits
-                * allocator to re-use newly free'd object.
-                *
-                * This is a rare event; in most cases we will find
-                * existing node to re-use. (or gc_count is 0).
-                */
-               goto restart;
        }
 
-       if (!tuple)
-               return 0;
+       clear_bit(tree, data->pending_trees);
+
+       next_tree = (tree + 1) % CONNCOUNT_SLOTS;
+       next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);
+
+       if (next_tree < CONNCOUNT_SLOTS) {
+               data->gc_tree = next_tree;
+               schedule_work(work);
+       }
 
-       return insert_tree(root, hash, key, keylen, tuple, zone);
+       spin_unlock_bh(&nf_conncount_locks[tree]);
 }
 
 /* Count and return number of conntrack entries in 'net' with particular 'key'.
@@ -438,6 +512,8 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
                data->root[i] = RB_ROOT;
 
        data->keylen = keylen / sizeof(u32);
+       data->net = net;
+       INIT_WORK(&data->gc_work, tree_gc_worker);
 
        return data;
 }
@@ -473,6 +549,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
 {
        unsigned int i;
 
+       cancel_work_sync(&data->gc_work);
        nf_ct_netns_put(net, family);
 
        for (i = 0; i < ARRAY_SIZE(data->root); ++i)
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 3dbc737915da..b90d96ba4a12 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -51,7 +51,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
        if (!addit)
                goto out;
 
-       if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) {
+       if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
                regs->verdict.code = NF_DROP;
                return;
        }
-- 
2.7.4
