Convert FIB Trie to use RCU for the normal lookup fast
path. Use simple spin_lock for updates and dump access.
This needs more testing on systems with lareg numbers of routes
Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>
Index: fib-2.6.13-rc5/net/ipv4/fib_trie.c
===================================================================
--- fib-2.6.13-rc5.orig/net/ipv4/fib_trie.c
+++ fib-2.6.13-rc5/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#define VERSION "0.325"
+#define VERSION "0.326"
#include <linux/config.h>
#include <asm/uaccess.h>
@@ -62,6 +62,7 @@
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
@@ -81,7 +82,7 @@
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH
- bits) >> offset))
-static DEFINE_RWLOCK(fib_lock);
+static DEFINE_SPINLOCK(fib_lock);
typedef unsigned int t_key;
@@ -90,6 +91,7 @@ typedef unsigned int t_key;
#define NODE_TYPE_MASK 0x1UL
#define NODE_PARENT(_node) \
((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
+
#define NODE_SET_PARENT(_node, _ptr) \
((_node)->_parent = (((unsigned long)(_ptr)) | \
((_node)->_parent & NODE_TYPE_MASK)))
@@ -110,10 +112,12 @@ struct leaf {
t_key key;
unsigned long _parent;
struct hlist_head list;
+ struct rcu_head rcu;
};
struct leaf_info {
struct hlist_node hlist;
+ struct rcu_head rcu;
int plen;
struct list_head falh;
};
@@ -125,6 +129,7 @@ struct tnode {
unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
unsigned short full_children; /* KEYLENGTH bits needed */
unsigned short empty_children; /* KEYLENGTH bits needed */
+ struct rcu_head rcu;
struct node *child[0];
};
@@ -177,7 +182,7 @@ static inline struct node *tnode_get_chi
{
BUG_ON(i >= 1<<tn->bits);
- return tn->child[i];
+ return rcu_dereference(tn->child[i]);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -314,48 +319,44 @@ static inline void check_tnode(const str
static int halve_threshold = 25;
static int inflate_threshold = 50;
-static struct leaf *leaf_new(void)
+
+static void __leaf_free_rcu(struct rcu_head *head)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
- if (l) {
- NODE_INIT_PARENT(l, T_LEAF);
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
+ kfree(container_of(head, struct leaf, rcu));
}
-static struct leaf_info *leaf_info_new(int plen)
+static inline void free_leaf(struct leaf *leaf)
{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ call_rcu(&leaf->rcu, __leaf_free_rcu);
}
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
{
- kfree(l);
+ kfree(container_of(head, struct leaf_info, rcu));
}
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
{
- kfree(li);
+ call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
static struct tnode *tnode_alloc(unsigned int size)
{
- if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
- } else {
- return (struct tnode *)
- __get_free_pages(GFP_KERNEL, get_order(size));
- }
+ struct page *pages;
+
+ if (size <= PAGE_SIZE)
+ return kcalloc(size, 1, GFP_KERNEL);
+
+ pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+ if (!pages)
+ return NULL;
+
+ return page_address(pages);
}
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
{
+ struct tnode *tn = container_of(head, struct tnode, rcu);
unsigned int size = sizeof(struct tnode) +
(1<<tn->bits) * sizeof(struct node *);
@@ -365,6 +366,31 @@ static void __tnode_free(struct tnode *t
free_pages((unsigned long)tn, get_order(size));
}
+static inline void tnode_free(struct tnode *tn)
+{
+ call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+ struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ if (l) {
+ NODE_INIT_PARENT(l, T_LEAF);
+ INIT_HLIST_HEAD(&l->list);
+ }
+ return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+ struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
+ if (li) {
+ li->plen = plen;
+ INIT_LIST_HEAD(&li->falh);
+ }
+ return li;
+}
+
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
int nchildren = 1<<bits;
@@ -372,7 +398,6 @@ static struct tnode* tnode_new(t_key key
struct tnode *tn = tnode_alloc(sz);
if (tn) {
- memset(tn, 0, sz);
NODE_INIT_PARENT(tn, T_TNODE);
tn->pos = pos;
tn->bits = bits;
@@ -386,20 +411,6 @@ static struct tnode* tnode_new(t_key key
return tn;
}
-static void tnode_free(struct tnode *tn)
-{
- BUG_ON(!tn);
-
- if (IS_LEAF(tn)) {
- free_leaf((struct leaf *)tn);
- pr_debug("FL %p \n", tn);
- }
- else if (IS_TNODE(tn)) {
- __tnode_free(tn);
- pr_debug("FT %p \n", tn);
- }
-}
-
/*
* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
@@ -430,7 +441,7 @@ static void tnode_put_child_reorg(struct
BUG_ON(i >= 1<<tn->bits);
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
chi = tn->child[i];
/* update emptyChildren */
@@ -449,11 +460,13 @@ static void tnode_put_child_reorg(struct
else if (!wasfull && isfull)
tn->full_children++;
+
+ smp_wmb();
if (n)
NODE_SET_PARENT(n, tn);
- tn->child[i] = n;
- write_unlock_bh(&fib_lock);
+ rcu_assign_pointer(tn->child[i], n);
+ spin_unlock_bh(&fib_lock);
}
static struct node *resize(struct trie *t, struct tnode *tn)
@@ -476,7 +489,7 @@ static struct node *resize(struct trie *
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
if (tn->child[i] != NULL) {
/* compress one level */
@@ -484,11 +497,11 @@ static struct node *resize(struct trie *
if (n)
NODE_INIT_PARENT(n, NODE_TYPE(n));
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
tnode_free(tn);
return n;
}
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
}
/*
* Double as long as the resulting node has a number of
@@ -597,7 +610,7 @@ static struct node *resize(struct trie *
if (tn->empty_children == tnode_child_length(tn) - 1)
for (i = 0; i < tnode_child_length(tn); i++) {
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
if (tn->child[i] != NULL) {
/* compress one level */
struct node *n = tn->child[i];
@@ -605,11 +618,11 @@ static struct node *resize(struct trie *
if (n)
NODE_INIT_PARENT(n, NODE_TYPE(n));
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
tnode_free(tn);
return n;
}
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
}
return (struct node *) tn;
@@ -857,7 +870,7 @@ static struct leaf_info *find_leaf_info(
struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry_rcu(li, node, head, hlist) {
if (li->plen == plen)
return li;
}
@@ -880,10 +893,10 @@ static void insert_leaf_info(struct hlis
struct leaf_info *li = NULL, *last = NULL;
struct hlist_node *node, *tmp;
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
if (hlist_empty(head))
- hlist_add_head(&new->hlist, head);
+ hlist_add_head_rcu(&new->hlist, head);
else {
hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
@@ -892,12 +905,13 @@ static void insert_leaf_info(struct hlis
last = li;
}
+
if (last)
- hlist_add_after(&last->hlist, &new->hlist);
+ hlist_add_after_rcu(&last->hlist, &new->hlist);
else
- hlist_add_before(&new->hlist, &li->hlist);
+ hlist_add_before_rcu(&new->hlist, &li->hlist);
}
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
}
static struct leaf *
@@ -1195,7 +1209,7 @@ fn_trie_insert(struct fib_table *tb, str
struct fib_info *fi_drop;
u8 state;
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
fi_drop = fa->fa_info;
fa->fa_info = fi;
@@ -1204,7 +1218,7 @@ fn_trie_insert(struct fib_table *tb, str
state = fa->fa_state;
fa->fa_state &= ~FA_S_ACCESSED;
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
@@ -1259,12 +1273,12 @@ fn_trie_insert(struct fib_table *tb, str
goto out_free_new_fa;
}
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
list_add_tail(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
rt_cache_flush(-1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr,
req);
@@ -1288,7 +1302,7 @@ static inline int check_leaf(struct trie
struct hlist_head *hhead = &l->list;
struct hlist_node *node;
- hlist_for_each_entry(li, node, hhead, hlist) {
+ hlist_for_each_entry_rcu(li, node, hhead, hlist) {
i = li->plen;
mask = ntohl(inet_make_mask(i));
@@ -1323,7 +1337,7 @@ fn_trie_lookup(struct fib_table *tb, con
int current_prefix_length = KEYLENGTH;
n = t->trie;
- read_lock(&fib_lock);
+ rcu_read_lock();
if (!n)
goto failed;
@@ -1495,7 +1509,7 @@ backtrace:
failed:
ret = 1;
found:
- read_unlock(&fib_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1616,7 +1630,7 @@ fn_trie_delete(struct fib_table *tb, str
l = fib_find_node(t, key);
li = find_leaf_info(&l->list, plen);
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
list_del(&fa->fa_list);
@@ -1624,7 +1638,7 @@ fn_trie_delete(struct fib_table *tb, str
hlist_del(&li->hlist);
kill_li = 1;
}
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
if (kill_li)
free_leaf_info(li);
@@ -1651,9 +1665,9 @@ static int trie_flush_list(struct trie *
if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
list_del(&fa->fa_list);
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
fn_free_alias(fa);
found++;
@@ -1675,9 +1689,9 @@ static int trie_flush_leaf(struct trie *
if (list_empty(&li->falh)) {
- write_lock_bh(&fib_lock);
+ spin_lock_bh(&fib_lock);
hlist_del(&li->hlist);
- write_unlock_bh(&fib_lock);
+ spin_unlock_bh(&fib_lock);
free_leaf_info(li);
}
@@ -1714,23 +1728,25 @@ static struct leaf *nextleaf(struct trie
last = 1 << p->bits;
for(idx = pos; idx < last ; idx++) {
- if (p->child[idx]) {
-
+ c = rcu_dereference(p->child[idx]);
+ if (c) {
/* Decend if tnode */
-
- while (IS_TNODE(p->child[idx])) {
- p = (struct tnode*) p->child[idx];
+ while (IS_TNODE(c)) {
+ p = (struct tnode *) c;
idx = 0;
/* Rightmost non-NULL branch */
if (p && IS_TNODE(p))
- while (p->child[idx] == NULL &&
idx < (1 << p->bits)) idx++;
+ while (!(c =
rcu_dereference(p->child[idx]))
+ && idx < (1<<p->bits))
+ idx++;
/* Done with this tnode? */
- if (idx >= (1 << p->bits) ||
p->child[idx] == NULL )
+ if (idx >= (1 << p->bits) || !c)
goto up;
}
- return (struct leaf*) p->child[idx];
+
+ return (struct leaf *) c;
}
}
up:
@@ -1738,6 +1754,7 @@ up:
c = (struct node*) p;
p = (struct tnode *) NODE_PARENT(p);
}
+
return NULL; /* Ready. Root of trie */
}
@@ -1781,7 +1798,7 @@ fn_trie_select_default(struct fib_table
last_resort = NULL;
order = -1;
- read_lock(&fib_lock);
+ rcu_read_lock();
l = fib_find_node(t, 0);
if (!l)
@@ -1845,7 +1862,7 @@ fn_trie_select_default(struct fib_table
}
trie_last_dflt = last_idx;
out:;
- read_unlock(&fib_lock);
+ rcu_read_unlock();
}
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct
fib_table *tb,
@@ -1927,14 +1944,15 @@ static int fn_trie_dump_plen(struct trie
return skb->len;
}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct
netlink_callback *cb)
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
int m, s_m;
struct trie *t = (struct trie *) tb->tb_data;
s_m = cb->args[1];
- read_lock(&fib_lock);
+ spin_lock_bh(&fib_lock);
for (m=0; m<=32; m++) {
if (m < s_m)
@@ -1948,11 +1966,12 @@ static int fn_trie_dump(struct fib_table
goto out;
}
}
- read_unlock(&fib_lock);
+ spin_unlock_bh(&fib_lock);
cb->args[1] = m;
return skb->len;
+
out:
- read_unlock(&fib_lock);
+ spin_unlock_bh(&fib_lock);
return -1;
}
@@ -2003,7 +2022,9 @@ struct fib_table * __init fib_hash_init(
}
#ifdef CONFIG_PROC_FS
-/* Depth first Trie walk iterator */
+/* Depth first Trie walk iterator
+ * Assumes fib_lock is held because need to backtrack (no RCU)
+ */
struct fib_trie_iter {
struct tnode *tnode;
unsigned index;
@@ -2074,7 +2095,7 @@ static void trie_collect_stats(struct tr
memset(s, 0, sizeof(*s));
- read_lock(&fib_lock);
+ spin_lock_bh(&fib_lock);
for (n = fib_trie_get_first(&iter, t); n;
n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
@@ -2094,7 +2115,7 @@ static void trie_collect_stats(struct tr
}
}
- read_unlock(&fib_lock);
+ spin_unlock_bh(&fib_lock);
}
/*
@@ -2212,7 +2233,7 @@ static struct node *fib_trie_get_idx(str
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&fib_lock);
+ spin_lock_bh(&fib_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
return fib_trie_get_idx(seq->private, *pos - 1);
@@ -2241,7 +2262,7 @@ static void *fib_trie_seq_next(struct se
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&fib_lock);
+ spin_unlock_bh(&fib_lock);
}
static void seq_indent(struct seq_file *seq, int n)
Index: fib-2.6.13-rc5/include/linux/list.h
===================================================================
--- fib-2.6.13-rc5.orig/include/linux/list.h
+++ fib-2.6.13-rc5/include/linux/list.h
@@ -620,6 +620,28 @@ static inline void hlist_add_after(struc
next->next->pprev = &next->next;
}
+static inline void hlist_add_before_rcu(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ n->pprev = next->pprev;
+ n->next = next;
+ smp_wmb();
+ next->pprev = &n->next;
+ *(n->pprev) = n;
+}
+
+static inline void hlist_add_after_rcu(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ next->next = n->next;
+ n->next = next;
+ smp_wmb();
+ next->pprev = &n->next;
+
+ if (next->next)
+ next->next->pprev = &next->next;
+}
+
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
#define hlist_for_each(pos, head) \
--
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html