Convert FIB Trie to use RCU for the normal lookup fast
path. Use simple spin_lock for updates and dump access.

This needs more testing on systems with lareg numbers of routes

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

Index: fib-2.6.13-rc5/net/ipv4/fib_trie.c
===================================================================
--- fib-2.6.13-rc5.orig/net/ipv4/fib_trie.c
+++ fib-2.6.13-rc5/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *             2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.325"
+#define VERSION "0.326"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -62,6 +62,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
@@ -81,7 +82,7 @@
 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH 
- bits) >> offset))
 
-static DEFINE_RWLOCK(fib_lock);
+static DEFINE_SPINLOCK(fib_lock);
 
 typedef unsigned int t_key;
 
@@ -90,6 +91,7 @@ typedef unsigned int t_key;
 #define NODE_TYPE_MASK 0x1UL
 #define NODE_PARENT(_node) \
        ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
+
 #define NODE_SET_PARENT(_node, _ptr) \
        ((_node)->_parent = (((unsigned long)(_ptr)) | \
                      ((_node)->_parent & NODE_TYPE_MASK)))
@@ -110,10 +112,12 @@ struct leaf {
         t_key key;
        unsigned long _parent;
        struct hlist_head list;
+       struct rcu_head rcu;
 };
 
 struct leaf_info {
        struct hlist_node hlist;
+       struct rcu_head rcu;
        int plen;
        struct list_head falh;
 };
@@ -125,6 +129,7 @@ struct tnode {
         unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
         unsigned short full_children;  /* KEYLENGTH bits needed */
         unsigned short empty_children; /* KEYLENGTH bits needed */
+       struct rcu_head rcu;
         struct node *child[0];
 };
 
@@ -177,7 +182,7 @@ static inline struct node *tnode_get_chi
 {
        BUG_ON(i >= 1<<tn->bits);
 
-        return tn->child[i];
+        return rcu_dereference(tn->child[i]);
 }
 
 static inline int tnode_child_length(const struct tnode *tn)
@@ -314,48 +319,44 @@ static inline void check_tnode(const str
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
 
-static struct leaf *leaf_new(void)
+
+static void __leaf_free_rcu(struct rcu_head *head)
 {
-       struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
-       if (l) {
-               NODE_INIT_PARENT(l, T_LEAF);
-               INIT_HLIST_HEAD(&l->list);
-       }
-       return l;
+       kfree(container_of(head, struct leaf, rcu));
 }
 
-static struct leaf_info *leaf_info_new(int plen)
+static inline void free_leaf(struct leaf *leaf)
 {
-       struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-       if (li) {
-               li->plen = plen;
-               INIT_LIST_HEAD(&li->falh);
-       }
-       return li;
+       call_rcu(&leaf->rcu, __leaf_free_rcu);
 }
 
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
 {
-       kfree(l);
+       kfree(container_of(head, struct leaf_info, rcu));
 }
 
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
 {
-       kfree(li);
+       call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 }
 
 static struct tnode *tnode_alloc(unsigned int size)
 {
-       if (size <= PAGE_SIZE) {
-               return kmalloc(size, GFP_KERNEL);
-       } else {
-               return (struct tnode *)
-                       __get_free_pages(GFP_KERNEL, get_order(size));
-       }
+       struct page *pages;
+
+       if (size <= PAGE_SIZE)
+               return kcalloc(size, 1, GFP_KERNEL);
+
+       pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+       if (!pages)
+               return NULL;
+
+       return page_address(pages);
 }
 
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
 {
+       struct tnode *tn = container_of(head, struct tnode, rcu);
        unsigned int size = sizeof(struct tnode) +
                            (1<<tn->bits) * sizeof(struct node *);
 
@@ -365,6 +366,31 @@ static void __tnode_free(struct tnode *t
                free_pages((unsigned long)tn, get_order(size));
 }
 
+static inline void tnode_free(struct tnode *tn)
+{
+       call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+       struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+       if (l) {
+               NODE_INIT_PARENT(l, T_LEAF);
+               INIT_HLIST_HEAD(&l->list);
+       }
+       return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+       struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+       if (li) {
+               li->plen = plen;
+               INIT_LIST_HEAD(&li->falh);
+       }
+       return li;
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
        int nchildren = 1<<bits;
@@ -372,7 +398,6 @@ static struct tnode* tnode_new(t_key key
        struct tnode *tn = tnode_alloc(sz);
 
        if (tn)  {
-               memset(tn, 0, sz);
                NODE_INIT_PARENT(tn, T_TNODE);
                tn->pos = pos;
                tn->bits = bits;
@@ -386,20 +411,6 @@ static struct tnode* tnode_new(t_key key
        return tn;
 }
 
-static void tnode_free(struct tnode *tn)
-{
-       BUG_ON(!tn);
-
-       if (IS_LEAF(tn)) {
-               free_leaf((struct leaf *)tn);
-               pr_debug("FL %p \n", tn);
-       }
-       else if (IS_TNODE(tn)) {
-               __tnode_free(tn);
-               pr_debug("FT %p \n", tn);
-       }
-}
-
 /*
  * Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
@@ -430,7 +441,7 @@ static void tnode_put_child_reorg(struct
 
        BUG_ON(i >= 1<<tn->bits);
 
-       write_lock_bh(&fib_lock);
+       spin_lock_bh(&fib_lock);
        chi = tn->child[i];
 
        /* update emptyChildren */
@@ -449,11 +460,13 @@ static void tnode_put_child_reorg(struct
 
        else if (!wasfull && isfull)
                tn->full_children++;
+
+       smp_wmb();
        if (n)
                NODE_SET_PARENT(n, tn);
 
-       tn->child[i] = n;
-       write_unlock_bh(&fib_lock);
+       rcu_assign_pointer(tn->child[i], n);
+       spin_unlock_bh(&fib_lock);
 }
 
 static struct node *resize(struct trie *t, struct tnode *tn)
@@ -476,7 +489,7 @@ static struct node *resize(struct trie *
        if (tn->empty_children == tnode_child_length(tn) - 1)
                for (i = 0; i < tnode_child_length(tn); i++) {
 
-                       write_lock_bh(&fib_lock);
+                       spin_lock_bh(&fib_lock);
                        if (tn->child[i] != NULL) {
 
                                /* compress one level */
@@ -484,11 +497,11 @@ static struct node *resize(struct trie *
                                if (n)
                                        NODE_INIT_PARENT(n, NODE_TYPE(n));
 
-                               write_unlock_bh(&fib_lock);
+                               spin_unlock_bh(&fib_lock);
                                tnode_free(tn);
                                return n;
                        }
-                       write_unlock_bh(&fib_lock);
+                       spin_unlock_bh(&fib_lock);
                }
        /*
         * Double as long as the resulting node has a number of
@@ -597,7 +610,7 @@ static struct node *resize(struct trie *
        if (tn->empty_children == tnode_child_length(tn) - 1)
                for (i = 0; i < tnode_child_length(tn); i++) {
                
-                       write_lock_bh(&fib_lock);
+                       spin_lock_bh(&fib_lock);
                        if (tn->child[i] != NULL) {
                                /* compress one level */
                                struct node *n = tn->child[i];
@@ -605,11 +618,11 @@ static struct node *resize(struct trie *
                                if (n)
                                        NODE_INIT_PARENT(n, NODE_TYPE(n));
 
-                               write_unlock_bh(&fib_lock);
+                               spin_unlock_bh(&fib_lock);
                                tnode_free(tn);
                                return n;
                        }
-                       write_unlock_bh(&fib_lock);
+                       spin_unlock_bh(&fib_lock);
                }
 
        return (struct node *) tn;
@@ -857,7 +870,7 @@ static struct leaf_info *find_leaf_info(
        struct hlist_node *node;
        struct leaf_info *li;
 
-       hlist_for_each_entry(li, node, head, hlist) {
+       hlist_for_each_entry_rcu(li, node, head, hlist) {
                if (li->plen == plen)
                        return li;
        }
@@ -880,10 +893,10 @@ static void insert_leaf_info(struct hlis
        struct leaf_info *li = NULL, *last = NULL;
        struct hlist_node *node, *tmp;
 
-       write_lock_bh(&fib_lock);
+       spin_lock_bh(&fib_lock);
 
        if (hlist_empty(head))
-               hlist_add_head(&new->hlist, head);
+               hlist_add_head_rcu(&new->hlist, head);
        else {
                hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
                
@@ -892,12 +905,13 @@ static void insert_leaf_info(struct hlis
                
                        last = li;
                }
+
                if (last)
-                       hlist_add_after(&last->hlist, &new->hlist);
+                       hlist_add_after_rcu(&last->hlist, &new->hlist);
                else
-                       hlist_add_before(&new->hlist, &li->hlist);
+                       hlist_add_before_rcu(&new->hlist, &li->hlist);
        }
-       write_unlock_bh(&fib_lock);
+       spin_unlock_bh(&fib_lock);
 }
 
 static struct leaf *
@@ -1195,7 +1209,7 @@ fn_trie_insert(struct fib_table *tb, str
                        struct fib_info *fi_drop;
                        u8 state;
 
-                       write_lock_bh(&fib_lock);
+                       spin_lock_bh(&fib_lock);
 
                        fi_drop = fa->fa_info;
                        fa->fa_info = fi;
@@ -1204,7 +1218,7 @@ fn_trie_insert(struct fib_table *tb, str
                        state = fa->fa_state;
                        fa->fa_state &= ~FA_S_ACCESSED;
 
-                       write_unlock_bh(&fib_lock);
+                       spin_unlock_bh(&fib_lock);
 
                        fib_release_info(fi_drop);
                        if (state & FA_S_ACCESSED)
@@ -1259,12 +1273,12 @@ fn_trie_insert(struct fib_table *tb, str
                        goto out_free_new_fa;
        }
 
-       write_lock_bh(&fib_lock);
+       spin_lock_bh(&fib_lock);
 
        list_add_tail(&new_fa->fa_list,
                 (fa ? &fa->fa_list : fa_head));
 
-       write_unlock_bh(&fib_lock);
+       spin_unlock_bh(&fib_lock);
 
        rt_cache_flush(-1);
        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, 
req);
@@ -1288,7 +1302,7 @@ static inline int check_leaf(struct trie
        struct hlist_head *hhead = &l->list;
        struct hlist_node *node;
 
-       hlist_for_each_entry(li, node, hhead, hlist) {
+       hlist_for_each_entry_rcu(li, node, hhead, hlist) {
 
                i = li->plen;
                mask = ntohl(inet_make_mask(i));
@@ -1323,7 +1337,7 @@ fn_trie_lookup(struct fib_table *tb, con
        int current_prefix_length = KEYLENGTH;
        n = t->trie;
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
        if (!n)
                goto failed;
 
@@ -1495,7 +1509,7 @@ backtrace:
 failed:
        ret = 1;
 found:
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
        return ret;
 }
 
@@ -1616,7 +1630,7 @@ fn_trie_delete(struct fib_table *tb, str
                l = fib_find_node(t, key);
                li = find_leaf_info(&l->list, plen);
 
-               write_lock_bh(&fib_lock);
+               spin_lock_bh(&fib_lock);
 
                list_del(&fa->fa_list);
 
@@ -1624,7 +1638,7 @@ fn_trie_delete(struct fib_table *tb, str
                        hlist_del(&li->hlist);
                        kill_li = 1;
                }
-               write_unlock_bh(&fib_lock);
+               spin_unlock_bh(&fib_lock);
        
                if (kill_li)
                        free_leaf_info(li);
@@ -1651,9 +1665,9 @@ static int trie_flush_list(struct trie *
        
                if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
 
-                       write_lock_bh(&fib_lock);
+                       spin_lock_bh(&fib_lock);
                        list_del(&fa->fa_list);
-                       write_unlock_bh(&fib_lock);
+                       spin_unlock_bh(&fib_lock);
 
                        fn_free_alias(fa);
                        found++;
@@ -1675,9 +1689,9 @@ static int trie_flush_leaf(struct trie *
 
                if (list_empty(&li->falh)) {
 
-                       write_lock_bh(&fib_lock);
+                       spin_lock_bh(&fib_lock);
                        hlist_del(&li->hlist);
-                       write_unlock_bh(&fib_lock);
+                       spin_unlock_bh(&fib_lock);
 
                        free_leaf_info(li);
                }
@@ -1714,23 +1728,25 @@ static struct leaf *nextleaf(struct trie
 
                last = 1 << p->bits;
                for(idx = pos; idx < last ; idx++) {
-                       if (p->child[idx]) {
-
+                       c = rcu_dereference(p->child[idx]);
+                       if (c) {
                                /* Decend if tnode */
-
-                               while (IS_TNODE(p->child[idx])) {
-                                       p = (struct tnode*) p->child[idx];
+                               while (IS_TNODE(c)) {
+                                       p = (struct tnode *) c;
                                        idx = 0;
                                
                                        /* Rightmost non-NULL branch */
                                        if (p && IS_TNODE(p))
-                                               while (p->child[idx] == NULL && 
idx < (1 << p->bits)) idx++;
+                                               while (!(c = 
rcu_dereference(p->child[idx]))
+                                                      && idx < (1<<p->bits))
+                                                      idx++;
 
                                        /* Done with this tnode? */
-                                       if (idx >= (1 << p->bits) || 
p->child[idx] == NULL )
+                                       if (idx >= (1 << p->bits) || !c)
                                                goto up;
                                }
-                               return (struct leaf*) p->child[idx];
+
+                               return (struct leaf *) c;
                        }
                }
 up:
@@ -1738,6 +1754,7 @@ up:
                c = (struct node*) p;
                p = (struct tnode *) NODE_PARENT(p);
        }
+
        return NULL; /* Ready. Root of trie */
 }
 
@@ -1781,7 +1798,7 @@ fn_trie_select_default(struct fib_table 
        last_resort = NULL;
        order = -1;
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
 
        l = fib_find_node(t, 0);
        if (!l)
@@ -1845,7 +1862,7 @@ fn_trie_select_default(struct fib_table 
        }
        trie_last_dflt = last_idx;
  out:;
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
 }
 
 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct 
fib_table *tb,
@@ -1927,14 +1944,15 @@ static int fn_trie_dump_plen(struct trie
        return skb->len;
 }
 
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct 
netlink_callback *cb)
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
+                       struct netlink_callback *cb)
 {
        int m, s_m;
        struct trie *t = (struct trie *) tb->tb_data;
 
        s_m = cb->args[1];
 
-       read_lock(&fib_lock);
+       spin_lock_bh(&fib_lock);
        for (m=0; m<=32; m++) {
 
                if (m < s_m)
@@ -1948,11 +1966,12 @@ static int fn_trie_dump(struct fib_table
                        goto out;
                }
        }
-       read_unlock(&fib_lock);
+       spin_unlock_bh(&fib_lock);
        cb->args[1] = m;
        return skb->len;
+
  out:
-       read_unlock(&fib_lock);
+       spin_unlock_bh(&fib_lock);
        return -1;
 }
 
@@ -2003,7 +2022,9 @@ struct fib_table * __init fib_hash_init(
 }
 
 #ifdef CONFIG_PROC_FS
-/* Depth first Trie walk iterator */
+/* Depth first Trie walk iterator
+ * Assumes fib_lock is held because need to backtrack (no RCU)
+ */
 struct fib_trie_iter {
        struct tnode *tnode;
        unsigned index;
@@ -2074,7 +2095,7 @@ static void trie_collect_stats(struct tr
 
        memset(s, 0, sizeof(*s));
 
-       read_lock(&fib_lock);
+       spin_lock_bh(&fib_lock);
        for (n = fib_trie_get_first(&iter, t); n;
             n = fib_trie_get_next(&iter)) {
                if (IS_LEAF(n)) {
@@ -2094,7 +2115,7 @@ static void trie_collect_stats(struct tr
                }
        }
 
-       read_unlock(&fib_lock);
+       spin_unlock_bh(&fib_lock);
 }
 
 /*
@@ -2212,7 +2233,7 @@ static struct node *fib_trie_get_idx(str
 
 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
 {
-       read_lock(&fib_lock);
+       spin_lock_bh(&fib_lock);
        if (*pos == 0)
                return SEQ_START_TOKEN;
        return fib_trie_get_idx(seq->private, *pos - 1);
@@ -2241,7 +2262,7 @@ static void *fib_trie_seq_next(struct se
 
 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 {
-       read_unlock(&fib_lock);
+       spin_unlock_bh(&fib_lock);
 }
 
 static void seq_indent(struct seq_file *seq, int n)
Index: fib-2.6.13-rc5/include/linux/list.h
===================================================================
--- fib-2.6.13-rc5.orig/include/linux/list.h
+++ fib-2.6.13-rc5/include/linux/list.h
@@ -620,6 +620,28 @@ static inline void hlist_add_after(struc
                next->next->pprev  = &next->next;
 }
 
+static inline void hlist_add_before_rcu(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       n->pprev = next->pprev;
+       n->next = next;
+       smp_wmb();
+       next->pprev = &n->next;
+       *(n->pprev) = n;
+}
+
+static inline void hlist_add_after_rcu(struct hlist_node *n,
+                                      struct hlist_node *next)
+{
+       next->next = n->next;
+       n->next = next;
+       smp_wmb();
+       next->pprev = &n->next;
+
+       if (next->next)
+               next->next->pprev  = &next->next;
+}
+
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
 
 #define hlist_for_each(pos, head) \

--

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to