Stephen Hemminger writes:
 > Convert FIB Trie to use RCU for the normal lookup fast
 > path. Use simple spin_lock for updates and dump access.
 
 Hello!

 Yes spent some days to get a complete RCU for the trie. Discussed  with 
 Patrick at OLS. This includes your start and also RCU versions of fib_alias 
 handling and removes the fib_lock totally etc.

 Replace of fib_alias needs to be proctected via RCU and should be added.

> This needs more testing on systems with lareg numbers of routes
 
 Testing and verification is absolutely crucial... and is often much more 
 work then doing a patch. And really testing & verification is a iterative
 part of the design.

 The patch below is current work it is tested/iterated with rDoS injection
 during both insert, delete and dump of full bgp on SMP machine.

 Also now insert and delete performance seems now acceptable. (destruction 
 of nested RCU  lists) leaf -> list of leaf_info -> list of fib_alias
 And lookup performance is interesting w. SMP :)

 I've asked Patrick to put on his argus eyes to spot any potential problems 
 that might be missed.

 I'm on vacation so email will be sporadic...
 

 Cheers.
                                        --ro


diff --git a/include/linux/list.h b/include/linux/list.h
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -620,6 +620,28 @@ static inline void hlist_add_after(struc
                next->next->pprev  = &next->next;
 }
 
+static inline void hlist_add_before_rcu(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       n->pprev = next->pprev;
+       n->next = next;
+       smp_wmb();
+       next->pprev = &n->next;
+       *(n->pprev) = n;
+}
+
+static inline void hlist_add_after_rcu(struct hlist_node *n,
+                                      struct hlist_node *next)
+{
+       next->next = n->next;
+       n->next = next;
+       smp_wmb();
+       next->pprev = &n->next;
+
+       if (next->next)
+               next->next->pprev  = &next->next;
+}
+
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
 
 #define hlist_for_each(pos, head) \
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -847,6 +847,102 @@ failure:
        return NULL;
 }
 
+/* Deal with better RCU integration later....  */
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+struct fib_alias *fib_find_alias_rcu(struct list_head *fah, u8 tos, u32 prio)
+{
+       if (fah) {
+               struct fib_alias *fa;
+               list_for_each_entry_rcu(fa, fah, fa_list) {
+                       if (fa->fa_tos > tos)
+                               continue;
+                       if (fa->fa_info->fib_priority >= prio ||
+                           fa->fa_tos < tos)
+                               return fa;
+               }
+       }
+       return NULL;
+}
+
+int fib_semantic_match_rcu(struct list_head *head, const struct flowi *flp,
+                      struct fib_result *res, __u32 zone, __u32 mask, 
+                       int prefixlen)
+{
+       struct fib_alias *fa;
+       int nh_sel = 0;
+
+       list_for_each_entry_rcu(fa, head, fa_list) {
+               int err;
+
+               if (fa->fa_tos &&
+                   fa->fa_tos != flp->fl4_tos)
+                       continue;
+
+               if (fa->fa_scope < flp->fl4_scope)
+                       continue;
+
+               fa->fa_state |= FA_S_ACCESSED;
+
+               err = fib_props[fa->fa_type].error;
+               if (err == 0) {
+                       struct fib_info *fi = fa->fa_info;
+
+                       if (fi->fib_flags & RTNH_F_DEAD)
+                               continue;
+
+                       switch (fa->fa_type) {
+                       case RTN_UNICAST:
+                       case RTN_LOCAL:
+                       case RTN_BROADCAST:
+                       case RTN_ANYCAST:
+                       case RTN_MULTICAST:
+                               for_nexthops(fi) {
+                                       if (nh->nh_flags&RTNH_F_DEAD)
+                                               continue;
+                                       if (!flp->oif || flp->oif == nh->nh_oif)
+                                               break;
+                               }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+                               if (nhsel < fi->fib_nhs) {
+                                       nh_sel = nhsel;
+                                       goto out_fill_res;
+                               }
+#else
+                               if (nhsel < 1) {
+                                       goto out_fill_res;
+                               }
+#endif
+                               endfor_nexthops(fi);
+                               continue;
+
+                       default:
+                               printk(KERN_DEBUG "impossible 102\n");
+                               return -EINVAL;
+                       };
+               }       
+               return err;
+       }
+       return 1;
+
+out_fill_res:
+       res->prefixlen = prefixlen;
+       res->nh_sel = nh_sel;
+       res->type = fa->fa_type;
+       res->scope = fa->fa_scope;
+       res->fi = fa->fa_info;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+       res->netmask = mask;
+       res->network = zone &
+               (0xFFFFFFFF >> (32 - prefixlen));
+#endif
+       atomic_inc(&res->fi->fib_clntref);
+       return 0;
+}
+
+
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
                       struct fib_result *res, __u32 zone, __u32 mask, 
                        int prefixlen)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *             2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.325"
+#define VERSION "0.333"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -62,6 +62,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
@@ -82,22 +83,23 @@
 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH 
- bits) >> offset))
 
-static DEFINE_RWLOCK(fib_lock);
-
 typedef unsigned int t_key;
 
 #define T_TNODE 0
 #define T_LEAF  1
 #define NODE_TYPE_MASK 0x1UL
 #define NODE_PARENT(_node) \
-       ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
+       ((struct tnode *) rcu_dereference(((_node)->_parent & ~NODE_TYPE_MASK)))
+
 #define NODE_SET_PARENT(_node, _ptr) \
        ((_node)->_parent = (((unsigned long)(_ptr)) | \
                      ((_node)->_parent & NODE_TYPE_MASK)))
+
 #define NODE_INIT_PARENT(_node, _type) \
-       ((_node)->_parent = (_type))
+       (rcu_assign_pointer ((_node)->_parent, _type))
+
 #define NODE_TYPE(_node) \
-       ((_node)->_parent & NODE_TYPE_MASK)
+       (rcu_dereference(((_node)->_parent) & NODE_TYPE_MASK))
 
 #define IS_TNODE(n) (!(n->_parent & T_LEAF))
 #define IS_LEAF(n) (n->_parent & T_LEAF)
@@ -111,10 +113,12 @@ struct leaf {
         t_key key;
        unsigned long _parent;
        struct hlist_head list;
+       struct rcu_head rcu;
 };
 
 struct leaf_info {
        struct hlist_node hlist;
+       struct rcu_head rcu;
        int plen;
        struct list_head falh;
 };
@@ -126,6 +130,7 @@ struct tnode {
         unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
         unsigned short full_children;  /* KEYLENGTH bits needed */
         unsigned short empty_children; /* KEYLENGTH bits needed */
+       struct rcu_head rcu;
         struct node *child[0];
 };
 
@@ -163,13 +168,12 @@ static int trie_debug = 0;
 static int tnode_full(struct tnode *tn, struct node *n);
 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int 
wasfull);
-static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
 static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
 static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 
prio);
+extern struct fib_alias *fib_find_alias_rcu(struct list_head *fah, u8 tos, u32 
prio);
 extern int fib_detect_death(struct fib_info *fi, int order,
                             struct fib_info **last_resort, int *last_idx, int 
*dflt);
 
@@ -190,10 +194,10 @@ static inline struct node *tnode_get_chi
         if (i >= 1<<tn->bits)
                 trie_bug("tnode_get_child");
 
-        return tn->child[i];
+        return rcu_dereference(tn->child[i]);
 }
 
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
 {
         return 1<<tn->bits;
 }
@@ -249,14 +253,6 @@ static inline int tkey_mismatch(t_key a,
        return i;
 }
 
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
-       fib_release_info(fa->fa_info);
-       kmem_cache_free(fn_alias_kmem, fa);
-}
-
 /*
   To understand this stuff, an understanding of keys and all their bits is 
   necessary. Every node in the trie has a key associated with it, but not 
@@ -330,48 +326,55 @@ static void check_tnode(struct tnode *tn
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
 
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
 {
-       struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
-       if (l) {
-               NODE_INIT_PARENT(l, T_LEAF);
-               INIT_HLIST_HEAD(&l->list);
-       }
-       return l;
+       struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+       kmem_cache_free(fn_alias_kmem, fa);
 }
 
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
 {
-       struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-       if (li) {
-               li->plen = plen;
-               INIT_LIST_HEAD(&li->falh);
-       }
-       return li;
+       call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+       kfree(container_of(head, struct leaf, rcu));
 }
 
-static inline void free_leaf(struct leaf *l)
+static inline void free_leaf(struct leaf *leaf)
 {
-       kfree(l);
+       call_rcu(&leaf->rcu, __leaf_free_rcu);
 }
 
-static inline void free_leaf_info(struct leaf_info *li)
+static void __leaf_info_free_rcu(struct rcu_head *head)
 {
-       kfree(li);
+       kfree(container_of(head, struct leaf_info, rcu));
+}
+
+static inline void free_leaf_info(struct leaf_info *leaf)
+{
+       call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 }
 
 static struct tnode *tnode_alloc(unsigned int size)
 {
-       if (size <= PAGE_SIZE) {
-               return kmalloc(size, GFP_KERNEL);
-       } else {
-               return (struct tnode *)
-                       __get_free_pages(GFP_KERNEL, get_order(size));
-       }
+       struct page *pages;
+
+       if (size <= PAGE_SIZE) 
+               return kcalloc(size, 1, GFP_KERNEL);
+
+       pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+       if (!pages)
+               return NULL;
+
+       return page_address(pages);
 }
 
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
 {
+       struct tnode *tn = container_of(head, struct tnode, rcu);
        unsigned int size = sizeof(struct tnode) +
                            (1<<tn->bits) * sizeof(struct node *);
 
@@ -381,6 +384,31 @@ static void __tnode_free(struct tnode *t
                free_pages((unsigned long)tn, get_order(size));
 }
 
+static inline void tnode_free(struct tnode *tn)
+{
+       call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+       struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+       if (l) {
+               NODE_INIT_PARENT(l, T_LEAF);
+               INIT_HLIST_HEAD(&l->list);
+       }
+       return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+       struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+       if (li) {
+               li->plen = plen;
+               INIT_LIST_HEAD(&li->falh);
+       }
+       return li;
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
        int nchildren = 1<<bits;
@@ -388,7 +416,6 @@ static struct tnode* tnode_new(t_key key
        struct tnode *tn = tnode_alloc(sz);
 
        if (tn)  {
-               memset(tn, 0, sz);
                NODE_INIT_PARENT(tn, T_TNODE);
                tn->pos = pos;
                tn->bits = bits;
@@ -403,26 +430,6 @@ static struct tnode* tnode_new(t_key key
        return tn;
 }
 
-static void tnode_free(struct tnode *tn)
-{
-       if (!tn) {
-               trie_bug("tnode_free\n");
-       }
-       if (IS_LEAF(tn)) {
-               free_leaf((struct leaf *)tn);
-               if (trie_debug > 0 )
-                       printk("FL %p \n", tn);
-       }
-       else if (IS_TNODE(tn)) {
-               __tnode_free(tn);
-               if (trie_debug > 0 )
-                       printk("FT %p \n", tn);
-       }
-       else {
-               trie_bug("tnode_free\n");
-       }
-}
-
 /*
  * Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
@@ -455,7 +462,7 @@ static void tnode_put_child_reorg(struct
                printk("bits=%d, i=%d\n", tn->bits, i);
                trie_bug("tnode_put_child_reorg bits");
        }
-       write_lock_bh(&fib_lock);
+
        chi = tn->child[i];
 
        /* update emptyChildren */
@@ -477,8 +484,7 @@ static void tnode_put_child_reorg(struct
        if (n)
                NODE_SET_PARENT(n, tn);
 
-       tn->child[i] = n;
-       write_unlock_bh(&fib_lock);
+       rcu_assign_pointer(tn->child[i], n);
 }
 
 static struct node *resize(struct trie *t, struct tnode *tn)
@@ -502,7 +508,6 @@ static struct node *resize(struct trie *
        if (tn->empty_children == tnode_child_length(tn) - 1)
                for (i = 0; i < tnode_child_length(tn); i++) {
 
-                       write_lock_bh(&fib_lock);
                        if (tn->child[i] != NULL) {
 
                                /* compress one level */
@@ -510,11 +515,9 @@ static struct node *resize(struct trie *
                                if (n)
                                        NODE_INIT_PARENT(n, NODE_TYPE(n));
 
-                               write_unlock_bh(&fib_lock);
                                tnode_free(tn);
                                return n;
                        }
-                       write_unlock_bh(&fib_lock);
                }
        /*
         * Double as long as the resulting node has a number of
@@ -625,7 +628,6 @@ static struct node *resize(struct trie *
        if (tn->empty_children == tnode_child_length(tn) - 1)
                for (i = 0; i < tnode_child_length(tn); i++) {
                
-                       write_lock_bh(&fib_lock);
                        if (tn->child[i] != NULL) {
                                /* compress one level */
                                struct node *n = tn->child[i];
@@ -633,11 +635,9 @@ static struct node *resize(struct trie *
                                if (n)
                                        NODE_INIT_PARENT(n, NODE_TYPE(n));
 
-                               write_unlock_bh(&fib_lock);
                                tnode_free(tn);
                                return n;
                        }
-                       write_unlock_bh(&fib_lock);
                }
 
        return (struct node *) tn;
@@ -883,7 +883,7 @@ static void *trie_init(struct trie *t)
 {
        if (t) {
                t->size = 0;
-               t->trie = NULL;
+               rcu_assign_pointer(t->trie, NULL);
                t->revision = 0;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
                        memset(&t->stats, 0, sizeof(struct trie_use_stats));
@@ -897,7 +897,7 @@ static struct leaf_info *find_leaf_info(
        struct hlist_node *node;
        struct leaf_info *li;
 
-       hlist_for_each_entry(li, node, head, hlist) {
+       hlist_for_each_entry_rcu(li, node, head, hlist) {
                if (li->plen == plen)
                        return li;
        }
@@ -918,26 +918,25 @@ static inline struct list_head * get_fa_
 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 {
        struct leaf_info *li = NULL, *last = NULL;
-       struct hlist_node *node, *tmp;
-
-       write_lock_bh(&fib_lock);
+       struct hlist_node *node;
 
        if (hlist_empty(head))
-               hlist_add_head(&new->hlist, head);
+               hlist_add_head_rcu(&new->hlist, head);
        else {
-               hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
+//             hlist_for_each_entry_safe_rcu(li, node, tmp, head, hlist) { 
//NEW
+               hlist_for_each_entry_rcu(li, node, head, hlist) {
                
                        if (new->plen > li->plen)
                                break;
                
                        last = li;
                }
+
                if (last)
-                       hlist_add_after(&last->hlist, &new->hlist);
+                       hlist_add_after_rcu(&last->hlist, &new->hlist);
                else
-                       hlist_add_before(&new->hlist, &li->hlist);
+                       hlist_add_before_rcu(&new->hlist, &li->hlist);
        }
-       write_unlock_bh(&fib_lock);
 }
 
 static struct leaf *
@@ -948,7 +947,7 @@ fib_find_node(struct trie *t, u32 key)
        struct node *n;
 
        pos = 0;
-       n = t->trie;
+       n = rcu_dereference(t->trie);
 
        while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
                tn = (struct tnode *) n;
@@ -1000,8 +999,10 @@ static struct node *trie_rebalance(struc
                tp = NODE_PARENT(tn);
                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
+//             rcu_read_lock();
                tn = (struct tnode *) resize (t, (struct tnode *)tn);
                tnode_put_child_reorg((struct tnode *)tp, cindex,(struct 
node*)tn, wasfull);
+//             rcu_read_unlock();
        
                if (!NODE_PARENT(tn))
                        break;
@@ -1028,7 +1029,7 @@ fib_insert_node(struct trie *t, int *err
        t_key cindex;
 
        pos = 0;
-       n = t->trie;
+       n = rcu_dereference(t->trie);
 
        /* If we point to NULL, stop. Either the tree is empty and we should
         * just put a new leaf in if, or we have reached an empty child slot,
@@ -1114,7 +1115,7 @@ fib_insert_node(struct trie *t, int *err
        insert_leaf_info(&l->list, li);
 
        /* Case 2: n is NULL, and will just insert a new leaf */
-       if (t->trie && n == NULL) {
+       if (rcu_dereference(t->trie) && n == NULL) {
 
                NODE_SET_PARENT(l, tp);
        
@@ -1164,7 +1165,7 @@ fib_insert_node(struct trie *t, int *err
                        put_child(t, (struct tnode *)tp, cindex, (struct node 
*)tn);
                }
                else {
-                       t->trie = (struct node*) tn; /* First tnode */
+                       rcu_assign_pointer(t->trie, (struct node *)tn); /* 
First tnode */
                        tp = tn;
                }
        }
@@ -1173,7 +1174,8 @@ fib_insert_node(struct trie *t, int *err
                       tp, tp->pos, tp->bits, key, plen);
        }
        /* Rebalance the trie */
-       t->trie = trie_rebalance(t, tp);
+
+       rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); 
 done:
        t->revision++;
 err:;
@@ -1222,7 +1224,7 @@ fn_trie_insert(struct fib_table *tb, str
 
        if (l) {
                fa_head = get_fa_head(l, plen);
-               fa = fib_find_alias(fa_head, tos, fi->fib_priority);
+               fa = fib_find_alias_rcu(fa_head, tos, fi->fib_priority);
        }
 
        /* Now fa, if non-NULL, points to the first fib alias
@@ -1248,7 +1250,7 @@ fn_trie_insert(struct fib_table *tb, str
                        struct fib_info *fi_drop;
                        u8 state;
 
-                       write_lock_bh(&fib_lock);
+                       rcu_read_lock();   // call_rcu replace_alias
 
                        fi_drop = fa->fa_info;
                        fa->fa_info = fi;
@@ -1257,7 +1259,7 @@ fn_trie_insert(struct fib_table *tb, str
                        state = fa->fa_state;
                        fa->fa_state &= ~FA_S_ACCESSED;
 
-                       write_unlock_bh(&fib_lock);
+                       rcu_read_unlock();
 
                        fib_release_info(fi_drop);
                        if (state & FA_S_ACCESSED)
@@ -1270,7 +1272,7 @@ fn_trie_insert(struct fib_table *tb, str
                 * information.
                 */
                fa_orig = fa;
-               list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
+               list_for_each_entry_rcu(fa, fa_orig->fa_list.prev, fa_list) {
                        if (fa->fa_tos != tos)
                                break;
                        if (fa->fa_info->fib_priority != fi->fib_priority)
@@ -1312,13 +1314,9 @@ fn_trie_insert(struct fib_table *tb, str
                        goto out_free_new_fa;
        }
 
-       write_lock_bh(&fib_lock);
-
-       list_add_tail(&new_fa->fa_list,
+       list_add_tail_rcu(&new_fa->fa_list,
                 (fa ? &fa->fa_list : fa_head));
 
-       write_unlock_bh(&fib_lock);
-
        rt_cache_flush(-1);
        rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, 
req);
 succeeded:
@@ -1341,14 +1339,14 @@ static inline int check_leaf(struct trie
        struct hlist_head *hhead = &l->list;
        struct hlist_node *node;
 
-       hlist_for_each_entry(li, node, hhead, hlist) {
+       hlist_for_each_entry_rcu(li, node, hhead, hlist) {
 
                i = li->plen;
                mask = ntohl(inet_make_mask(i));
                if (l->key != (key & mask))
                        continue;
 
-               if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, 
mask, i)) == 0) {
+               if (((*err) = fib_semantic_match_rcu(&li->falh, flp, res, 
l->key, mask, i)) == 0) {
                        *plen = i;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
                        t->stats.semantic_match_passed++;
@@ -1374,9 +1372,9 @@ fn_trie_lookup(struct fib_table *tb, con
        int chopped_off;
        t_key cindex = 0;
        int current_prefix_length = KEYLENGTH;
-       n = t->trie;
+       n = rcu_dereference(t->trie);
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
        if (!n)
                goto failed;
 
@@ -1548,7 +1546,7 @@ backtrace:
 failed:
        ret = 1;
 found:
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
        return ret;
 }
 
@@ -1556,7 +1554,7 @@ static int trie_leaf_remove(struct trie 
 {
        t_key cindex;
        struct tnode *tp = NULL;
-       struct node *n = t->trie;
+       struct node *n = rcu_dereference(t->trie);
        struct leaf *l;
 
        if (trie_debug)
@@ -1590,20 +1588,26 @@ static int trie_leaf_remove(struct trie 
        t->revision++;
        t->size--;
 
+       rcu_read_lock();
+//     must not preempt before child pointer is set to NULL
+
        tp = NODE_PARENT(n);
        tnode_free((struct tnode *) n);
 
        if (tp) {
                cindex = tkey_extract_bits(key, tp->pos, tp->bits);
                put_child(t, (struct tnode *)tp, cindex, NULL);
-               t->trie = trie_rebalance(t, tp);
+
+               rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
        }
-       else
-               t->trie = NULL;
+       else 
+               rcu_assign_pointer(t->trie, NULL);
 
+       rcu_read_unlock();
        return 1;
 }
 
+
 static int
 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
               struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
@@ -1636,7 +1640,7 @@ fn_trie_delete(struct fib_table *tb, str
                return -ESRCH;
 
        fa_head = get_fa_head(l, plen);
-       fa = fib_find_alias(fa_head, tos, 0);
+       fa = fib_find_alias_rcu(fa_head, tos, 0);
 
        if (!fa)
                return -ESRCH;
@@ -1646,7 +1650,7 @@ fn_trie_delete(struct fib_table *tb, str
 
        fa_to_delete = NULL;
        fa_head = fa->fa_list.prev;
-       list_for_each_entry(fa, fa_head, fa_list) {
+       list_for_each_entry_rcu(fa, fa_head, fa_list) {
                struct fib_info *fi = fa->fa_info;
 
                if (fa->fa_tos != tos)
@@ -1665,35 +1669,33 @@ fn_trie_delete(struct fib_table *tb, str
        }
 
        if (fa_to_delete) {
-               int kill_li = 0;
                struct leaf_info *li;
 
                fa = fa_to_delete;
                rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, 
req);
 
+
                l = fib_find_node(t, key);
                li = find_leaf_info(&l->list, plen);
 
-               write_lock_bh(&fib_lock);
-
-               list_del(&fa->fa_list);
+               list_del_rcu(&fa->fa_list);
 
                if (list_empty(fa_head)) {
-                       hlist_del(&li->hlist);
-                       kill_li = 1;
-               }
-               write_unlock_bh(&fib_lock);
-       
-               if (kill_li)
+                       hlist_del_rcu(&li->hlist);
                        free_leaf_info(li);
+               }
 
-               if (hlist_empty(&l->list))
+               if (hlist_empty(&l->list)) {
                        trie_leaf_remove(t, key);
+               }
 
-               if (fa->fa_state & FA_S_ACCESSED)
+               if (fa->fa_state & FA_S_ACCESSED) {
                        rt_cache_flush(-1);
+               }
+
+               fib_release_info(fa->fa_info);
+               alias_free_mem_rcu(fa);
 
-               fn_free_alias(fa);
                return 0;
        }
        return -ESRCH;
@@ -1701,19 +1703,17 @@ fn_trie_delete(struct fib_table *tb, str
 
 static int trie_flush_list(struct trie *t, struct list_head *head)
 {
-       struct fib_alias *fa, *fa_node;
+       struct fib_alias *fa;
        int found = 0;
 
-       list_for_each_entry_safe(fa, fa_node, head, fa_list) {
+       list_for_each_entry_rcu(fa, head, fa_list) {
                struct fib_info *fi = fa->fa_info;
        
                if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
 
-                       write_lock_bh(&fib_lock);
-                       list_del(&fa->fa_list);
-                       write_unlock_bh(&fib_lock);
-
-                       fn_free_alias(fa);
+                       list_del_rcu(&fa->fa_list);
+                       fib_release_info(fa->fa_info);
+                       alias_free_mem_rcu(fa); 
                        found++;
                }
        }
@@ -1724,18 +1724,16 @@ static int trie_flush_leaf(struct trie *
 {
        int found = 0;
        struct hlist_head *lih = &l->list;
-       struct hlist_node *node, *tmp;
+       struct hlist_node *node;
        struct leaf_info *li = NULL;
 
-       hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+       hlist_for_each_entry_rcu(li, node, lih, hlist) {
                
                found += trie_flush_list(t, &li->falh);
 
                if (list_empty(&li->falh)) {
 
-                       write_lock_bh(&fib_lock);
-                       hlist_del(&li->hlist);
-                       write_unlock_bh(&fib_lock);
+                       hlist_del_rcu(&li->hlist);
 
                        free_leaf_info(li);
                }
@@ -1748,15 +1746,16 @@ static struct leaf *nextleaf(struct trie
        struct node *c = (struct node *) thisleaf;
        struct tnode *p;
        int idx;
+       struct node *trie = rcu_dereference(t->trie);
 
        if (c == NULL) {
-               if (t->trie == NULL)
+               if (trie == NULL) 
                        return NULL;
 
-               if (IS_LEAF(t->trie))          /* trie w. just a leaf */
-                       return (struct leaf *) t->trie;
+               if (IS_LEAF(trie))          /* trie w. just a leaf */ 
+                       return (struct leaf *) trie;
 
-               p = (struct tnode*) t->trie;  /* Start */
+               p = (struct tnode*) trie;  /* Start */
        }
        else
                p = (struct tnode *) NODE_PARENT(c);
@@ -1772,30 +1771,33 @@ static struct leaf *nextleaf(struct trie
 
                last = 1 << p->bits;
                for(idx = pos; idx < last ; idx++) {
-                       if (p->child[idx]) {
-
+                       c = rcu_dereference(p->child[idx]);
+                       if (c) {
                                /* Decend if tnode */
-
-                               while (IS_TNODE(p->child[idx])) {
-                                       p = (struct tnode*) p->child[idx];
+                               while (IS_TNODE(c)) {
+                                       p = (struct tnode *) c;
                                        idx = 0;
                                
                                        /* Rightmost non-NULL branch */
                                        if (p && IS_TNODE(p))
-                                               while (p->child[idx] == NULL && 
idx < (1 << p->bits)) idx++;
+                                               while (!(c = 
rcu_dereference(p->child[idx]))
+                                                      && idx < (1<<p->bits))
+                                                      idx++;
 
                                        /* Done with this tnode? */
-                                       if (idx >= (1 << p->bits) || 
p->child[idx] == NULL )
+                                       if (idx >= (1 << p->bits) || !c)
                                                goto up;
                                }
-                               return (struct leaf*) p->child[idx];
+
+                               return (struct leaf *) c;
                        }
                }
 up:
                /* No more children go up one step  */
-               c = (struct node*) p;
+               c = (struct node *) p;
                p = (struct tnode *) NODE_PARENT(p);
        }
+
        return NULL; /* Ready. Root of trie */
 }
 
@@ -1840,7 +1842,7 @@ fn_trie_select_default(struct fib_table 
        last_resort = NULL;
        order = -1;
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
 
        l = fib_find_node(t, 0);
        if (!l)
@@ -1853,7 +1855,7 @@ fn_trie_select_default(struct fib_table 
        if (list_empty(fa_head))
                goto out;
 
-       list_for_each_entry(fa, fa_head, fa_list) {
+       list_for_each_entry_rcu(fa, fa_head, fa_list) {
                struct fib_info *next_fi = fa->fa_info;
        
                if (fa->fa_scope != res->scope ||
@@ -1904,7 +1906,7 @@ fn_trie_select_default(struct fib_table 
        }
        trie_last_dflt = last_idx;
  out:;
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
 }
 
 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct 
fib_table *tb,
@@ -1918,7 +1920,7 @@ static int fn_trie_dump_fa(t_key key, in
        s_i=cb->args[3];
        i = 0;
 
-       list_for_each_entry(fa, fah, fa_list) {
+       list_for_each_entry_rcu(fa, fah, fa_list) {
                if (i < s_i) {
                        i++;
                        continue;
@@ -1993,7 +1995,7 @@ static int fn_trie_dump(struct fib_table
 
        s_m = cb->args[1];
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
        for (m=0; m<=32; m++) {
 
                if (m < s_m)
@@ -2007,11 +2009,11 @@ static int fn_trie_dump(struct fib_table
                        goto out;
                }
        }
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
        cb->args[1] = m;
        return skb->len;
  out:
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
        return -1;
 }
 
@@ -2119,7 +2121,7 @@ static void printnode_seq(struct seq_fil
                                seq_printf(seq, "{/%d...dumping}\n", i);
 
 
-                               list_for_each_entry(fa, fa_head, fa_list) {
+                               list_for_each_entry_rcu(fa, fa_head, fa_list) {
                                        putspace_seq(seq, indent+2);
                                        if (fa->fa_info->fib_nh == NULL) {
                                                seq_printf(seq, "Error 
_fib_nh=NULL\n");
@@ -2154,13 +2156,13 @@ static void printnode_seq(struct seq_fil
 
 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
 {
-       struct node *n = t->trie;
+       struct node *n = rcu_dereference(t->trie);
        int cindex=0;
        int indent=1;
        int pend=0;
        int depth = 0;
 
-       read_lock(&fib_lock);
+       rcu_read_lock();
 
        seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
        if (n) {
@@ -2173,38 +2175,37 @@ static void trie_dump_seq(struct seq_fil
                        depth++;
 
                        while (tn && cindex < (1 << tn->bits)) {
-                               if (tn->child[cindex]) {
-                               
+                               struct node *child = 
rcu_dereference(tn->child[cindex]);
+                               if (!child) 
+                                       cindex++;
+                               else {
                                        /* Got a child */
-                               
-                                       printnode_seq(seq, indent, 
tn->child[cindex], pend, cindex, tn->bits);
-                                       if (IS_LEAF(tn->child[cindex])) {
+                                       printnode_seq(seq, indent, child, pend,
+                                                     cindex, tn->bits);
+
+                                       if (IS_LEAF(child))
                                                cindex++;
-                                       
-                                       }
                                        else {
                                                /*
                                                 * New tnode. Decend one level
                                                 */
                                        
                                                depth++;
-                                               n = tn->child[cindex];
+                                               n = child;
                                                tn = (struct tnode *)n;
                                                pend = tn->pos+tn->bits;
-                                               putspace_seq(seq, indent); 
seq_printf(seq, "\\--\n");
-                                               indent+=3;
-                                               cindex=0;
+                                               putspace_seq(seq, indent);
+                                               seq_printf(seq, "\\--\n");
+                                               indent += 3;
+                                               cindex = 0;
                                        }
                                }
-                               else
-                                       cindex++;
 
                                /*
                                 * Test if we are done
                                 */
                        
                                while (cindex >= (1 << tn->bits)) {
-
                                        /*
                                         * Move upwards and test for root
                                         * pop off all traversed  nodes
@@ -2214,9 +2215,9 @@ static void trie_dump_seq(struct seq_fil
                                                tn = NULL;
                                                n = NULL;
                                                break;
-                                       }
-                                       else {
-                                               cindex = 
tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+                                       } else {
+                                               cindex = 
tkey_extract_bits(tn->key,
+                                                                          
NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
                                                tn = NODE_PARENT(tn);
                                                cindex++;
                                                n = (struct node *)tn;
@@ -2227,11 +2228,13 @@ static void trie_dump_seq(struct seq_fil
                                }
                        }
                }
-               else n = NULL;
+               else
+                       n = NULL;
        }
-       else seq_printf(seq, "------ trie is empty\n");
+       else
+               seq_printf(seq, "------ trie is empty\n");
 
-       read_unlock(&fib_lock);
+       rcu_read_unlock();
 }
 
 static struct trie_stat *trie_stat_new(void)
@@ -2254,14 +2257,14 @@ static struct trie_stat *trie_stat_new(v
 
 static struct trie_stat *trie_collect_stats(struct trie *t)
 {
-       struct node *n = t->trie;
+       struct node *n = rcu_dereference(t->trie);
        struct trie_stat *s = trie_stat_new();
        int cindex = 0;
        int indent = 1;
        int pend = 0;
        int depth = 0;
 
-       read_lock(&fib_lock);   
+       rcu_read_lock();        
 
        if (s) {
                if (n) {
@@ -2273,10 +2276,11 @@ static struct trie_stat *trie_collect_st
                                depth++;
 
                                while (tn && cindex < (1 << tn->bits)) {
-                                       if (tn->child[cindex]) {
+                                       struct node *ch
+                                               = 
rcu_dereference(tn->child[cindex]);
+                                       if (ch) {
                                                /* Got a child */
-                               
-                                               if (IS_LEAF(tn->child[cindex])) 
{
+                                               if (IS_LEAF(ch)) {
                                                        cindex++;
                                        
                                                        /* stats */
@@ -2284,9 +2288,7 @@ static struct trie_stat *trie_collect_st
                                                                s->maxdepth = 
depth;
                                                        s->totdepth += depth;
                                                        s->leaves++;
-                                               }
-                               
-                                               else {
+                                               } else {
                                                        /*
                                                         * New tnode. Decend 
one level
                                                         */
@@ -2295,15 +2297,14 @@ static struct trie_stat *trie_collect_st
                                                        
s->nodesizes[tn->bits]++;
                                                        depth++;
                                        
-                                                       n = tn->child[cindex];
+                                                       n = ch;
                                                        tn = (struct tnode *)n;
                                                        pend = tn->pos+tn->bits;
 
                                                        indent += 3;
                                                        cindex = 0;
                                                }
-                                       }
-                                       else {
+                                       } else {
                                                cindex++;
                                                s->nullpointers++;
                                        }
@@ -2341,7 +2342,7 @@ static struct trie_stat *trie_collect_st
                }
        }
 
-       read_unlock(&fib_lock); 
+       rcu_read_unlock();      
        return s;
 }
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
 
 struct fib_alias {
        struct list_head        fa_list;
+       struct rcu_head rcu;
        struct fib_info         *fa_info;
        u8                      fa_tos;
        u8                      fa_type;
@@ -21,6 +22,10 @@ extern int fib_semantic_match(struct lis
                              const struct flowi *flp,
                              struct fib_result *res, __u32 zone, __u32 mask,
                                int prefixlen);
+extern int fib_semantic_match_rcu(struct list_head *head,
+                                 const struct flowi *flp,
+                                 struct fib_result *res, __u32 zone, __u32 
mask,
+                                 int prefixlen);
 extern void fib_release_info(struct fib_info *);
 extern struct fib_info *fib_create_info(const struct rtmsg *r,
                                        struct kern_rta *rta,


 

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to