On 13/05/14(Tue) 19:25, Claudio Jeker wrote:
> The last three hackathons I got sucked into one ugly dark corner of the
> network stack. Our radix tree implementation one one particular bug in it
> that caused bgpd and ospfd to freak out on semi regular basis.

Do you know why they would freak out?

> After a fair amount of versions tested by myself and benno@ I think it is
> time to send this to a larger audience. This affacts all our radix
> consumers (inclusive NFS and pf tables) so better test now then suffer
> later.

I'm putting it on my machines right now.

> I would be happy if people could throw this code on some of their (test)
> systems.  In general there should be no visible difference to the current
> state.  The one case that is no fixed has todo with routing priorities and
> the link state tracking of routes.  In short this should fix the bgpd "mpath
> route not found" error caused by a link state flap.
> 
> The diff is hard to read because I refactored and split the radix.c code
> into smaller, independent and hopefully more digestable bits.  It may be
> easier to read the end result for those brave ones who want to review the
> diff.  I added some extra comments to help me to keep track of what is
> going on but it is probably still very terse.

Indeed it's really hard to read and I hope you won't get hit by a bus
soon :)

> I would like to commit this rather soon so that it gets enough exposure
> before the next release.

I'd really appreciate if you can split this diff in various commits.  I
see that there's a lot of refactoring or documentation bits that could
easily be committed before.  I like the functions you introduce to
replace some label/gotos and they make the code easier to understand.
I'd bet they can be committed before too.

I understand that this is not fun and that this code is already scary as
it is, but if you are able to document the various steps you did to fix
this bug, I'm sure it will be helpful to anybody trying to understand
and/or contribute to this code.

> -- 
> :wq Claudio
> 
> Index: net/radix.c
> ===================================================================
> RCS file: /cvs/src/sys/net/radix.c,v
> retrieving revision 1.39
> diff -u -p -r1.39 radix.c
> --- net/radix.c       22 Jan 2014 10:17:59 -0000      1.39
> +++ net/radix.c       9 May 2014 17:01:23 -0000
> @@ -71,6 +71,13 @@ struct radix_node *rn_newpair(void *, in
>  
>  static inline struct radix_node *rn_search(void *, struct radix_node *);
>  struct radix_node *rn_search_m(void *, struct radix_node *, void *);
> +int rn_add_dupedkey(struct radix_node *, struct radix_node_head *,
> +    struct radix_node [2], u_int8_t);
> +void rn_fixup_nodes(struct radix_node *);
> +static inline struct radix_node *rn_lift_node(struct radix_node *);
> +void rn_add_radix_mask(struct radix_node *, int);
> +int rn_del_radix_mask(struct radix_node *);
> +static inline void rn_swap_nodes(struct radix_node *, struct radix_node *);
>  
>  /*
>   * The data structure for the keys is a radix tree with one way
> @@ -167,6 +174,7 @@ rn_refines(void *m_arg, void *n_arg)
>       return (!masks_are_equal);
>  }
>  
> +/* return a perfect match if m_arg is set, else do a regular rn_match */
>  struct radix_node *
>  rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
>  {
> @@ -201,10 +209,15 @@ rn_satisfies_leaf(char *trial, struct ra
>               cp3 = rn_ones;
>       else
>               length = min(length, *(u_char *)cp3);
> -     cplim = cp + length; cp3 += skip; cp2 += skip;
> -     for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
> +     cplim = cp + length;
> +     cp += skip;
> +     cp2 += skip;
> +     cp3 += skip;
> +     while (cp < cplim) {
>               if ((*cp ^ *cp2) & *cp3)
>                       return 0;
> +             cp++, cp2++, cp3++;
> +     }
>       return 1;
>  }
>  
> @@ -395,7 +408,7 @@ rn_addmask(void *n_arg, int search, int 
>       if (skip == 0)
>               skip = 1;
>       if (mlen <= skip)
> -             return (mask_rnhead->rnh_nodes);
> +             return (mask_rnhead->rnh_nodes);        /* rn_zero root node */
>       if (skip > 1)
>               memcpy(addmask_key + 1, rn_ones + 1, skip - 1);
>       if ((m0 = mlen) > skip)
> @@ -432,7 +445,7 @@ rn_addmask(void *n_arg, int search, int 
>               return (tm);
>       }
>       /*
> -      * Calculate index of mask, and check for normalcy.
> +      * Calculate index of mask, and check for normalicy.
>        */
>       cplim = netmask + mlen;
>       isnormal = 1;
> @@ -495,388 +508,556 @@ rn_new_radix_mask(struct radix_node *tt,
>       return m;
>  }
>  
> -struct radix_node *
> -rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
> -    struct radix_node treenodes[2], u_int8_t prio)
> +/*
> + * Find the point where the rn_mklist needs to be changed.
> + */
> +static inline struct radix_node *
> +rn_lift_node(struct radix_node *t)
>  {
> -     caddr_t v = v_arg;
> -     caddr_t netmask = n_arg;
> -     struct radix_node *top = head->rnh_treetop;
> -     struct radix_node *t, *tt, *tm = NULL, *x;
> -     struct radix_node *saved_tt;
> -     short b = 0, b_leaf = 0;
> -     int keyduplicated, prioinv = -1;
> -     caddr_t mmask;
> -     struct radix_mask *m, **mp;
> -
> -     /*
> -      * In dealing with non-contiguous masks, there may be
> -      * many different routes which have the same mask.
> -      * We will find it useful to have a unique pointer to
> -      * the mask to speed avoiding duplicate references at
> -      * nodes and possibly save time in calculating indices.
> -      */
> -     if (netmask)  {
> -             if ((tm = rn_addmask(netmask, 0, top->rn_off)) == 0)
> -                     return (0);
> -             b_leaf = tm->rn_b;
> -             b = -1 - tm->rn_b;
> -             netmask = tm->rn_key;
> -     }
> +     struct radix_node *x = t;
> +     int b = -1 - t->rn_b;
>  
> -     saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
> -
> -     /*
> -      * Deal with duplicated keys: attach node to previous instance
> -      */
> -     if (keyduplicated) {
> -             for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
> -#ifndef SMALL_KERNEL
> -                     /* permit multipath, if enabled for the family */
> -                     if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
> -                             int mid;
> -                             /*
> -                              * Try to insert the new node in the middle
> -                              * of the list of any preexisting multipaths,
> -                              * to reduce the number of path disruptions
> -                              * that occur as a result of an insertion,
> -                              * per RFC2992.
> -                              * Additionally keep the list sorted by route
> -                              * priority.
> -                              */
> -                             prioinv = 0;
> -                             tt = rn_mpath_prio(tt, prio);
> -                             if (((struct rtentry *)tt)->rt_priority !=
> -                                 prio) {
> -                                     /*
> -                                      * rn_mpath_prio returns the previous
> -                                      * element if no element with the 
> -                                      * requested priority exists. It could
> -                                      * be that the previous element comes
> -                                      * with a bigger priority.
> -                                      */
> -                                     if (((struct rtentry *)tt)->
> -                                         rt_priority > prio)
> -                                             prioinv = 1;
> -                                     t = tt;
> -                                     break;
> -                             }
> +     /* rewind possible dupedkey list to head */
> +     while (t->rn_b < 0)
> +             t = t->rn_p;
>  
> -                             mid = rn_mpath_active_count(tt) / 2;
> -                             do {
> -                                     t = tt;
> -                                     tt = rn_mpath_next(tt, 0);
> -                             } while (tt && --mid > 0);
> -                             break;
> -                     }
> -#endif
> -                     if (tt->rn_mask == netmask)
> -                             return (0);
> -                     if (netmask == 0 ||
> -                         (tt->rn_mask &&
> -                          ((b_leaf < tt->rn_b) || /* index(netmask) > node */
> -                            rn_refines(netmask, tt->rn_mask) ||
> -                            rn_lexobetter(netmask, tt->rn_mask))))
> -                             break;
> -             }
> -             /*
> -              * If the mask is not duplicated, we wouldn't
> -              * find it among possible duplicate key entries
> -              * anyway, so the above test doesn't hurt.
> -              *
> -              * We sort the masks for a duplicated key the same way as
> -              * in a masklist -- most specific to least specific.
> -              * This may require the unfortunate nuisance of relocating
> -              * the head of the list.
> -              *
> -              * We also reverse, or doubly link the list through the
> -              * parent pointer.
> -              */
> -             if (tt == saved_tt && prioinv) {
> -                     struct  radix_node *xx;
> -                     /* link in at head of list */
> -                     (tt = treenodes)->rn_dupedkey = t;
> -                     tt->rn_flags = t->rn_flags;
> -                     tt->rn_p = xx = t->rn_p;
> -                     t->rn_p = tt;
> -                     if (xx->rn_l == t)
> -                             xx->rn_l = tt;
> -                     else
> -                             xx->rn_r = tt;
> -                     saved_tt = tt;
> -             } else if (prioinv == 1) {
> -                     (tt = treenodes)->rn_dupedkey = t;
> -                     if (t->rn_p == NULL)
> -                             panic("rn_addroute: t->rn_p is NULL");
> -                     t->rn_p->rn_dupedkey = tt;
> -                     tt->rn_p = t->rn_p;
> -                     t->rn_p = tt;
> -             } else {
> -                     (tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
> -                     t->rn_dupedkey = tt;
> -                     tt->rn_p = t;
> -                     if (tt->rn_dupedkey)
> -                             tt->rn_dupedkey->rn_p = tt;
> -             }
> -             tt->rn_key = (caddr_t) v;
> -             tt->rn_b = -1;
> -             tt->rn_flags = RNF_ACTIVE;
> -     }
> +     /* can't lift node above head of dupedkey list, give up */
> +     if (b > t->rn_b)
> +             return (NULL);
>  
> -     /*
> -      * Put mask in tree.
> -      */
> -     if (netmask) {
> -             tt->rn_mask = netmask;
> -             tt->rn_b = tm->rn_b;
> -             tt->rn_flags |= tm->rn_flags & RNF_NORMAL;
> -     }
> -     t = saved_tt->rn_p;
> -     if (keyduplicated)
> -             goto on2;
> -     b_leaf = -1 - t->rn_b;
> -     if (t->rn_r == saved_tt)
> -             x = t->rn_l;
> -     else
> -             x = t->rn_r;
> -     /* Promote general routes from below */
> -     if (x->rn_b < 0) {
> -         struct      radix_node *xx = NULL;
> -         for (mp = &t->rn_mklist; x; xx = x, x = x->rn_dupedkey) {
> -             if (xx && xx->rn_mklist && xx->rn_mask == x->rn_mask &&
> -                 x->rn_mklist == 0) {
> -                     /* multipath route, bump refcount on first mklist */
> -                     x->rn_mklist = xx->rn_mklist;
> -                     x->rn_mklist->rm_refs++;
> -             }
> -             if (x->rn_mask && (x->rn_b >= b_leaf) && x->rn_mklist == 0) {
> -                     *mp = m = rn_new_radix_mask(x, 0);
> -                     if (m)
> -                             mp = &m->rm_mklist;
> -             }
> -         }
> -     } else if (x->rn_mklist) {
> -             /*
> -              * Skip over masks whose index is > that of new node
> -              */
> -             for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
> -                     if (m->rm_b >= b_leaf)
> -                             break;
> -             t->rn_mklist = m;
> -             *mp = 0;
> -     }
> -on2:
> -     /* Add new route to highest possible ancestor's list */
> -     if ((netmask == 0) || (b > t->rn_b ))
> -             return tt; /* can't lift at all */
> -     b_leaf = tt->rn_b;
>       do {
>               x = t;
>               t = t->rn_p;
> -     } while (b <= t->rn_b && x != top);
> +     } while (b <= t->rn_b && x != t);
> +
> +     return (x);
> +}
> +
> +void
> +rn_add_radix_mask(struct radix_node *tt, int keyduplicated)
> +{
> +     caddr_t netmask, mmask;
> +     struct radix_node *x;
> +     struct radix_mask *m, **mp;
> +     int b_leaf = tt->rn_b;
> +
> +     /* Add new route to highest possible ancestor's list */
> +     if (tt->rn_mask == NULL)
> +             return; /* can't lift at all */
> +     x = rn_lift_node(tt);
> +     if (x == NULL)
> +             return; /* didn't lift either */
> +
>       /*
>        * Search through routes associated with node to
>        * insert new route according to index.
>        * Need same criteria as when sorting dupedkeys to avoid
>        * double loop on deletion.
>        */
> +     netmask = tt->rn_mask;
>       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) {
>               if (m->rm_b < b_leaf)
>                       continue;
>               if (m->rm_b > b_leaf)
>                       break;
>               if (m->rm_flags & RNF_NORMAL) {
> -                     mmask = m->rm_leaf->rn_mask;
>                       if (keyduplicated) {
>                               if (m->rm_leaf->rn_p == tt)
>                                       /* new route is better */
>                                       m->rm_leaf = tt;
>  #ifdef DIAGNOSTIC
>                               else {
> -                                     for (t = m->rm_leaf; t;
> -                                             t = t->rn_dupedkey)
> +                                     struct radix_node *t;
> +
> +                                     for (t = m->rm_leaf;
> +                                         t && t->rn_mklist == m;
> +                                         t = t->rn_dupedkey)
>                                               if (t == tt)
>                                                       break;
>                                       if (t == NULL) {
>                                               log(LOG_ERR, "Non-unique "
>                                                   "normal route on dupedkey, "
>                                                   "mask not entered\n");
> -                                             return tt;
> +                                             return;
>                                       }
>                               }
>  #endif
>                               m->rm_refs++;
>                               tt->rn_mklist = m;
> -                             return tt;
> +                             return;
>                       } else if (tt->rn_flags & RNF_NORMAL) {
>                               log(LOG_ERR, "Non-unique normal route,"
>                                   " mask not entered\n");
> -                             return tt;
> +                             return;
>                       }
> +                     mmask = m->rm_leaf->rn_mask;
>               } else
>                       mmask = m->rm_mask;
>               if (mmask == netmask) {
>                       m->rm_refs++;
>                       tt->rn_mklist = m;
> -                     return tt;
> +                     return;
>               }
>               if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask))
>                       break;
>       }
>       *mp = rn_new_radix_mask(tt, *mp);
> -     return tt;
> +}
> +
> +int
> +rn_add_dupedkey(struct radix_node *saved_tt, struct radix_node_head *head,
> +    struct radix_node *tt, u_int8_t prio)
> +{
> +     caddr_t netmask = tt->rn_mask;
> +     struct radix_node *x = saved_tt, *xp;
> +#ifndef SMALL_KERNEL
> +     struct radix_node *dupedkey_tt = NULL;
> +#endif
> +     int before = -1;
> +     int b_leaf = 0;
> +
> +     if (netmask)
> +             b_leaf = tt->rn_b;
> +
> +     for (xp = x; x; xp = x, x = x->rn_dupedkey) {
> +#ifndef SMALL_KERNEL
> +             /* permit multipath, if enabled for the family */
> +             if (rn_mpath_capable(head) && netmask == x->rn_mask) {
> +                     int mid;
> +                     /*
> +                      * Try to insert the new node in the middle
> +                      * of the list of any preexisting multipaths,
> +                      * to reduce the number of path disruptions
> +                      * that occur as a result of an insertion,
> +                      * per RFC2992.
> +                      * Additionally keep the list sorted by route
> +                      * priority.
> +                      */
> +                     before = 0;
> +
> +                     dupedkey_tt = x;
> +                     x = rn_mpath_prio(x, prio);
> +                     if (((struct rtentry *)x)->rt_priority !=
> +                         prio) {
> +                             /*
> +                              * rn_mpath_prio returns the previous
> +                              * element if no element with the 
> +                              * requested priority exists. It could
> +                              * be that the previous element comes
> +                              * with a bigger priority.
> +                              */
> +                             if (((struct rtentry *)x)->rt_priority > prio)
> +                                     before = 1;
> +                             xp = x;
> +                             break;
> +                     }
> +
> +                     mid = rn_mpath_active_count(x) / 2;
> +                     do {
> +                             xp = x;
> +                             x = rn_mpath_next(x, RMP_MODE_BYPRIO);
> +                     } while (x && --mid > 0);
> +                     break;
> +             }
> +#endif
> +             if (x->rn_mask == netmask)
> +                     return (-1);
> +             if (netmask == NULL ||
> +                 (x->rn_mask &&
> +                  ((b_leaf < x->rn_b) || /* index(netmask) > node */
> +                    rn_refines(netmask, x->rn_mask) ||
> +                    rn_lexobetter(netmask, x->rn_mask))))
> +                     break;
> +     }
> +     /*
> +      * If the mask is not duplicated, we wouldn't
> +      * find it among possible duplicate key entries
> +      * anyway, so the above test doesn't hurt.
> +      *
> +      * We sort the masks for a duplicated key the same way as
> +      * in a masklist -- most specific to least specific.
> +      * This may require the unfortunate nuisance of relocating
> +      * the head of the list.
> +      *
> +      * We also reverse, or doubly link the list through the
> +      * parent pointer.
> +      */
> +
> +     if ((x == saved_tt && before) || before == 1)
> +             before = 1;
> +     else
> +             before = 0;
> +     rn_link_dupedkey(tt, xp, before);
> +
> +
> +#ifndef SMALL_KERNEL
> +     /* adjust the flags of the possible multipath chain */
> +     if (!dupedkey_tt)
> +             dupedkey_tt = tt;
> +     if (rn_mpath_capable(head))
> +             rn_mpath_adj_mpflag(dupedkey_tt, prio);
> +#endif
> +     return (0);
> +}
> +
> +/*
> + * Insert tt after x or in place of x if before is true.
> + */
> +void
> +rn_link_dupedkey(struct radix_node *tt, struct radix_node *x, int before)
> +{
> +     if (before) {
> +             if (x->rn_p->rn_b > 0) {
> +                     /* link in at head of list */
> +                     tt->rn_dupedkey = x;
> +                     tt->rn_flags = x->rn_flags;
> +                     tt->rn_p = x->rn_p;
> +                     x->rn_p = tt;
> +                     if (tt->rn_p->rn_l == x)
> +                             tt->rn_p->rn_l = tt;
> +                     else
> +                             tt->rn_p->rn_r = tt;
> +             } else {
> +                     tt->rn_dupedkey = x;
> +                     x->rn_p->rn_dupedkey = tt;
> +                     tt->rn_p = x->rn_p;
> +                     x->rn_p = tt;
> +             }
> +     } else {
> +             tt->rn_dupedkey = x->rn_dupedkey;
> +             x->rn_dupedkey = tt;
> +             tt->rn_p = x;
> +             if (tt->rn_dupedkey)
> +                     tt->rn_dupedkey->rn_p = tt;
> +     }
> +}
> +
> +/*
> + * This function ensures that routes are properly promoted upwards.
> + * It adjusts the rn_mklist of the parent node to make sure overlapping
> + * routes can be found.
> + *
> + * There are two cases:
> + * - leaf nodes with possible rn_dupedkey list
> + * - internal nodes with maybe their own mklist
> + * If the mask of the route is bigger than the current branch bit then
> + * a rn_mklist entrie needs to be made.
> + */
> +void
> +rn_fixup_nodes(struct radix_node *tt)
> +{
> +     struct radix_node *tp, *x;
> +     struct radix_mask *m, **mp;
> +     int b_leaf;
> +
> +     tp = tt->rn_p;
> +     if (tp->rn_r == tt)
> +             x = tp->rn_l;
> +     else
> +             x = tp->rn_r;
> +
> +     b_leaf = -1 - tp->rn_b;
> +     if (x->rn_b < 0) {      /* x is a leaf node */
> +             struct  radix_node *xx = NULL;
> +
> +             for (mp = &tp->rn_mklist; x; xx = x, x = x->rn_dupedkey) {
> +                     if (xx && xx->rn_mklist && xx->rn_mask == x->rn_mask &&
> +                         x->rn_mklist == 0) {
> +                             /* multipath route */
> +                             x->rn_mklist = xx->rn_mklist;
> +                             x->rn_mklist->rm_refs++;
> +                     }
> +                     if (x->rn_mask && (x->rn_b >= b_leaf) &&
> +                         x->rn_mklist == 0) {
> +                             *mp = m = rn_new_radix_mask(x, 0);
> +                             if (m)
> +                                     mp = &m->rm_mklist;
> +                     }
> +             }
> +     } else if (x->rn_mklist) {      /* x is an internal node */
> +             /*
> +              * Skip over masks whose index is > that of new node
> +              */
> +             for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
> +                     if (m->rm_b >= b_leaf)
> +                             break;
> +             tp->rn_mklist = m;
> +             *mp = 0;
> +     }
>  }
>  
>  struct radix_node *
> -rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head,
> -    struct radix_node *rn)
> +rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
> +    struct radix_node treenodes[2], u_int8_t prio)
>  {
>       caddr_t v = v_arg;
> -     caddr_t netmask = netmask_arg;
>       struct radix_node *top = head->rnh_treetop;
> -     struct radix_node *t, *p, *tm, *x, *tt;
> -     struct radix_mask *m, *saved_m, **mp;
> -     struct radix_node *dupedkey, *saved_tt;
> -     int off = top->rn_off;
> -     int b, vlen;
> +     struct radix_node *tt, *saved_tt, *tm = NULL;
> +     int keyduplicated;
>  
> -     vlen =  *(u_char *)v;
> -     tt = rn_search(v, top);
> -     saved_tt = tt;
> -     if (memcmp(v + off, tt->rn_key + off, vlen - off))
> -             return (0);
>       /*
> -      * Delete our route from mask lists.
> +      * In dealing with non-contiguous masks, there may be
> +      * many different routes which have the same mask.
> +      * We will find it useful to have a unique pointer to
> +      * the mask to speed avoiding duplicate references at
> +      * nodes and possibly save time in calculating indices.
>        */
> -     if (netmask) {
> -             if ((tm = rn_addmask(netmask, 1, off)) == NULL)
> +     if (n_arg)  {
> +             if ((tm = rn_addmask(n_arg, 0, top->rn_off)) == 0)
>                       return (0);
> -             netmask = tm->rn_key;
> -             while (tt->rn_mask != netmask)
> -                     if ((tt = tt->rn_dupedkey) == NULL)
> -                             return (0);
>       }
> -#ifndef SMALL_KERNEL
> -     if (rn) {
> -             while (tt != rn)
> -                     if ((tt = tt->rn_dupedkey) == NULL)
> -                             return (0);
> +
> +     tt = rn_insert(v, head, &keyduplicated, treenodes);
> +
> +     if (keyduplicated) {
> +             saved_tt = tt;
> +             tt = treenodes;
> +
> +             tt->rn_key = v_arg;
> +             tt->rn_b = -1;
> +             tt->rn_flags = RNF_ACTIVE;
>       }
> +
> +     /* Put mask into the node. */
> +     if (tm) {
> +             tt->rn_mask = tm->rn_key;
> +             tt->rn_b = tm->rn_b;
> +             tt->rn_flags |= tm->rn_flags & RNF_NORMAL;
> +     }
> +
> +     /* Either insert into dupedkey list or as a leaf node.  */
> +     if (keyduplicated) {
> +             if (rn_add_dupedkey(saved_tt, head, tt, prio))
> +                     return (NULL);
> +     } else {
> +             rn_fixup_nodes(tt);
> +#ifndef SMALL_KERNEL
> +             if (rn_mpath_capable(head))
> +                     rn_mpath_adj_mpflag(tt, prio);
>  #endif
> -     if (tt->rn_mask == NULL || (saved_m = m = tt->rn_mklist) == NULL)
> -             goto on1;
> +     }
> +
> +     /* finally insert a radix_mask element if needed */
> +     rn_add_radix_mask(tt, keyduplicated);
> +     return (tt);
> +}
> +
> +/*
> + * Cleanup mask list, tt points to route that needs to be cleaned
> + */
> +int
> +rn_del_radix_mask(struct radix_node *tt)
> +{
> +     struct radix_node *x;
> +     struct radix_mask *m, *saved_m, **mp;
> +
> +     /*
> +      * Cleanup mask list from possible references to this route.
> +      */
> +     saved_m = m = tt->rn_mklist;
> +     if (tt->rn_mask == NULL || m == NULL)
> +             return (0);
> +
>       if (tt->rn_flags & RNF_NORMAL) {
>               if (m->rm_leaf != tt && m->rm_refs == 0) {
>                       log(LOG_ERR, "rn_delete: inconsistent normal "
>                           "annotation\n");
> -                     return (0);
> +                     return (-1);
>               }
>               if (m->rm_leaf != tt) {
>                       if (--m->rm_refs >= 0)
> -                             goto on1;
> +                             return (0);
> +                     else 
> +                             log(LOG_ERR, "rn_delete: "
> +                                 "inconsistent mklist refcount\n");
>               }
> -             /* tt is currently the head of the possible multipath chain */
> +             /*
> +              * If we end up here tt should be m->rm_leaf and therefor
> +              * tt should be the head of a multipath chain.
> +              * If this is not the case the table is no longer consistent.
> +              */
>               if (m->rm_refs > 0) {
>                       if (tt->rn_dupedkey == NULL ||
>                           tt->rn_dupedkey->rn_mklist != m) {
>                               log(LOG_ERR, "rn_delete: inconsistent "
>                                   "dupedkey list\n");
> -                             return (0);
> +                             return (-1);
>                       }
>                       m->rm_leaf = tt->rn_dupedkey;
>                       --m->rm_refs;
> -                     goto on1;
> +                     return (0);
>               }
>               /* else tt is last and only route */
>       } else {
>               if (m->rm_mask != tt->rn_mask) {
>                       log(LOG_ERR, "rn_delete: inconsistent annotation\n");
> -                     goto on1;
> +                     return (0);
>               }
>               if (--m->rm_refs >= 0)
> -                     goto on1;
> +                     return (0);
>       }
> -     b = -1 - tt->rn_b;
> -     t = saved_tt->rn_p;
> -     if (b > t->rn_b)
> -             goto on1; /* Wasn't lifted at all */
> -     do {
> -             x = t;
> -             t = t->rn_p;
> -     } while (b <= t->rn_b && x != top);
> +
> +     /*
> +      * No other references hold to the radix_mask remove it from
> +      * the tree.
> +      */
> +     x = rn_lift_node(tt);
> +     if (x == NULL)
> +             return (0);     /* Wasn't lifted at all */
> +
> +     /* Finally eliminate the radix_mask from the tree */
>       for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist)
>               if (m == saved_m) {
>                       *mp = m->rm_mklist;
>                       pool_put(&rtmask_pool, m);
>                       break;
>               }
> +
>       if (m == NULL) {
>               log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
>               if (tt->rn_flags & RNF_NORMAL)
> -                     return (0); /* Dangling ref to us */
> +                     return (-1); /* Dangling ref to us */
>       }
> -on1:
> +
> +     return (0);
> +}
> +
> +/* swap two internal nodes and fixup the parent and child pointers */
> +static inline void
> +rn_swap_nodes(struct radix_node *from, struct radix_node *to)
> +{
> +     *to = *from;
> +     if (from->rn_p->rn_l == from)
> +             from->rn_p->rn_l = to;
> +     else
> +             from->rn_p->rn_r = to;
> +
> +     to->rn_l->rn_p = to;
> +     to->rn_r->rn_p = to;
> +}
> +
> +struct radix_node *
> +rn_delete(void *v_arg, void *n_arg, struct radix_node_head *head,
> +    struct radix_node *rn)
> +{
> +     caddr_t v = v_arg;
> +     caddr_t netmask = n_arg;
> +     struct radix_node *top = head->rnh_treetop;
> +     struct radix_node *tt, *tp, *pp, *x;
> +     struct radix_node *dupedkey_tt, *saved_tt;
> +     int off = top->rn_off;
> +     int vlen;
> +
> +     vlen =  *(u_char *)v;
> +
>       /*
> -      * Eliminate us from tree
> +      * Implement a lookup similar to rn_lookup but we need to save
> +      * the radix leaf node (where th rn_dupedkey list starts) so
> +      * it is not possible to use rn_lookup.
>        */
> -     if (tt->rn_flags & RNF_ROOT)
> +     tt = rn_search(v, top);
> +     /* make sure the key is a perfect match */
> +     if (memcmp(v + off, tt->rn_key + off, vlen - off))
>               return (0);
> -     t = tt->rn_p;
> -     dupedkey = saved_tt->rn_dupedkey;
> -     if (dupedkey) {
> -             /*
> -              * Here, tt is the deletion target, and
> -              * saved_tt is the head of the dupedkey chain.
> -              */
> +
> +     /*
> +      * Here, tt is the deletion target, and
> +      * saved_tt is the head of the dupedkey chain.
> +      * dupedkey_tt will point to the start of the multipath chain.
> +      */
> +     saved_tt = tt;
> +
> +     /*
> +      * make tt point to the start of the rn_dupedkey list of multipath
> +      * routes.
> +      */
> +     if (netmask) {
> +             struct radix_node *tm;
> +
> +             if ((tm = rn_addmask(netmask, 1, off)) == NULL)
> +                     return (0);
> +             netmask = tm->rn_key;
> +             while (tt->rn_mask != netmask)
> +                     if ((tt = tt->rn_dupedkey) == NULL)
> +                             return (0);
> +     }
> +
> +     KASSERT((tt->rn_flags & RNF_ROOT) == 0);
> +
> +     /* save start of multi path chain for later use */
> +     dupedkey_tt = tt;
> +
> +#ifndef SMALL_KERNEL
> +     /* if we got a hint use the hint from now on */
> +     if (rn) 
> +             tt = rn;
> +#endif
> +
> +     /* remove possible radix_mask */
> +     if (rn_del_radix_mask(tt))
> +             return (0);
> +
> +     /*
> +      * Finally eliminate us from tree
> +      */
> +     tp = tt->rn_p;
> +     if (saved_tt->rn_dupedkey) {
>               if (tt == saved_tt) {
> -                     x = dupedkey;
> -                     x->rn_p = t;
> -                     if (t->rn_l == tt)
> -                             t->rn_l = x;
> +                     x = saved_tt->rn_dupedkey;
> +                     x->rn_p = tp;
> +                     if (tp->rn_l == tt)
> +                             tp->rn_l = x;
>                       else
> -                             t->rn_r = x;
> +                             tp->rn_r = x;
>               } else {
>                       x = saved_tt;
> -                     t->rn_dupedkey = tt->rn_dupedkey;
> +                     tp->rn_dupedkey = tt->rn_dupedkey;
>                       if (tt->rn_dupedkey)
> -                             tt->rn_dupedkey->rn_p = t;
> -             }
> -             t = tt + 1;
> -             if  (t->rn_flags & RNF_ACTIVE) {
> -                     *++x = *t;
> -                     p = t->rn_p;
> -                     if (p->rn_l == t)
> -                             p->rn_l = x;
> -                     else
> -                             p->rn_r = x;
> -                     x->rn_l->rn_p = x;
> -                     x->rn_r->rn_p = x;
> +                             tt->rn_dupedkey->rn_p = tp;
>               }
> +
> +             /*
> +              * We may be holding an active internal node in the tree.
> +              */
> +             if  (tt[1].rn_flags & RNF_ACTIVE)
> +                     rn_swap_nodes(&tt[1], &x[1]);
> +
> +#ifndef SMALL_KERNEL
> +             /* adjust the flags of the multipath chain */
> +             if (rn_mpath_capable(head))
> +                     rn_mpath_adj_mpflag(dupedkey_tt,
> +                         ((struct rtentry *)tt)->rt_priority);
> +#endif
> +             /* over and out */
>               goto out;
>       }
> -     if (t->rn_l == tt)
> -             x = t->rn_r;
> +
> +     /* non-rn_dupedkey case, remove tt and tp node from the tree */
> +     if (tp->rn_l == tt)
> +             x = tp->rn_r;
>       else
> -             x = t->rn_l;
> -     p = t->rn_p;
> -     if (p->rn_r == t)
> -             p->rn_r = x;
> +             x = tp->rn_l;
> +     pp = tp->rn_p;
> +     if (pp->rn_r == tp)
> +             pp->rn_r = x;
>       else
> -             p->rn_l = x;
> -     x->rn_p = p;
> +             pp->rn_l = x;
> +     x->rn_p = pp;
> +
>       /*
> -      * Demote routes attached to us.
> +      * Demote routes attached to us (actually on the internal parent node).
>        */
> -     if (t->rn_mklist) {
> +     if (tp->rn_mklist) {
> +             struct radix_mask *m, **mp;
>               if (x->rn_b >= 0) {
>                       for (mp = &x->rn_mklist; (m = *mp);)
>                               mp = &m->rm_mklist;
> -                     *mp = t->rn_mklist;
> +                     *mp = tp->rn_mklist;
>               } else {
>                       /* If there are any key,mask pairs in a sibling
>                          duped-key chain, some subset will appear sorted
>                          in the same order attached to our mklist */
> -                     for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
> +                     for (m = tp->rn_mklist; m && x; x = x->rn_dupedkey)
>                               if (m == x->rn_mklist) {
>                                       struct radix_mask *mm = m->rm_mklist;
>                                       x->rn_mklist = 0;
> @@ -896,22 +1077,18 @@ on1:
>                                   "rn_delete: Orphaned Mask", m, x);
>               }
>       }
> +
>       /*
>        * We may be holding an active internal node in the tree.
> +      * If so swap our internal node (t) with the parent node (tp)
> +      * since that one was just removed from the tree.
>        */
> -     x = tt + 1;
> -     if (t != x) {
> -             *t = *x;
> -             t->rn_l->rn_p = t;
> -             t->rn_r->rn_p = t;
> -             p = x->rn_p;
> -             if (p->rn_l == x)
> -                     p->rn_l = t;
> -             else
> -                     p->rn_r = t;
> -     }
> +     if (tp != &tt[1])
> +             rn_swap_nodes(&tt[1], tp);
> +
> +     /* no rn_dupedkey list so no need to fixup multipath chains */
>  out:
> -     tt->rn_flags &= ~RNF_ACTIVE;
> +     tt[0].rn_flags &= ~RNF_ACTIVE;
>       tt[1].rn_flags &= ~RNF_ACTIVE;
>       return (tt);
>  }
> Index: net/radix.h
> ===================================================================
> RCS file: /cvs/src/sys/net/radix.h,v
> retrieving revision 1.24
> diff -u -p -r1.24 radix.h
> --- net/radix.h       22 Jan 2014 10:17:59 -0000      1.24
> +++ net/radix.h       26 Jan 2014 14:23:06 -0000
> @@ -119,6 +119,8 @@ void      rn_init(void);
>  int  rn_inithead(void **, int);
>  int  rn_inithead0(struct radix_node_head *, int);
>  int  rn_refines(void *, void *);
> +void rn_link_dupedkey(struct radix_node *, struct radix_node *, int);
> +
>  int  rn_walktree(struct radix_node_head *,
>           int (*)(struct radix_node *, void *, u_int), void *);
>  
> Index: net/radix_mpath.c
> ===================================================================
> RCS file: /cvs/src/sys/net/radix_mpath.c,v
> retrieving revision 1.22
> diff -u -p -r1.22 radix_mpath.c
> --- net/radix_mpath.c 22 Jan 2014 10:17:59 -0000      1.22
> +++ net/radix_mpath.c 18 Apr 2014 19:04:04 -0000
> @@ -38,7 +38,6 @@
>  #include <sys/systm.h>
>  #include <sys/malloc.h>
>  #include <sys/socket.h>
> -#define      M_DONTWAIT M_NOWAIT
>  #include <sys/domain.h>
>  #include <sys/syslog.h>
>  #include <net/radix.h>
> @@ -68,156 +67,99 @@ rn_mpath_capable(struct radix_node_head 
>  }
>  
>  struct radix_node *
> -rn_mpath_next(struct radix_node *rn, int all)
> +rn_mpath_next(struct radix_node *rn, int mode)
>  {
>       struct radix_node       *next;
> -     struct rtentry          *rt = (struct rtentry *)rn;
> +     struct rtentry          *nt, *rt = (struct rtentry *)rn;
>  
>       if (!rn->rn_dupedkey)
>               return NULL;
>       next = rn->rn_dupedkey;
> -     if (rn->rn_mask == next->rn_mask && (all ||
> -         rt->rt_priority == ((struct rtentry *)next)->rt_priority))
> -             return next;
> -     else
> +
> +     if (rn->rn_mask != next->rn_mask)
>               return NULL;
> +
> +     switch (mode) {
> +     case RMP_MODE_BYPRIO:
> +             /* scan all of the list until we find a match */
> +             while (next) {
> +                     if (rn->rn_mask != next->rn_mask)
> +                             return NULL;
> +                     nt = (struct rtentry *)next;
> +                     if ((rt->rt_priority & RTP_MASK) ==
> +                         (nt->rt_priority & RTP_MASK))
> +                             return next;
> +                     rn = next;
> +                     next = next->rn_dupedkey;
> +             }
> +             return NULL;
> +     case RMP_MODE_ACTIVE:
> +             nt = (struct rtentry *)next;
> +             if (rt->rt_priority != nt->rt_priority)
> +                     return NULL;
> +             break;
> +     case RMP_MODE_ALL:
> +             break;
> +     }
> +     return next;
> +}
> +
> +struct rtentry *
> +rt_mpath_next(struct rtentry *rt)
> +{
> +     struct radix_node *rn = (struct radix_node *)rt;
> +
> +     return ((struct rtentry *)rn_mpath_next(rn, RMP_MODE_ACTIVE));
>  }
>  
> +/*
> + * return first route matching prio or the node just before.
> + */
>  struct radix_node *
>  rn_mpath_prio(struct radix_node *rn, u_int8_t prio)
>  {
> -     struct radix_node       *prev = rn;
> +     struct radix_node       *hit = rn;
>       struct rtentry          *rt;
>  
>       if (prio == RTP_ANY)
> -             return (rn);
> -
> -     while (rn) {
> -             /* different netmask -> different route */
> -             if (rn->rn_mask != prev->rn_mask)
> -                     return (prev);
> +             return (hit);
>  
> +     do {
>               rt = (struct rtentry *)rn;
>               if (rt->rt_priority == prio)
> +                     /* perfect match */
>                       return (rn);
> -             if (rt->rt_priority > prio)
> -                     /* list is sorted return last more prefered entry */
> -                     return (prev);
> -             prev = rn;
> -             rn = rn->rn_dupedkey;
> -     }
> -     return (prev);
> +
> +             /* list is sorted remember last more prefered (smaller) entry */
> +             if (rt->rt_priority < prio)
> +                     hit = rn;
> +     } while ((rn = rn_mpath_next(rn, RMP_MODE_ALL)) != NULL);
> +
> +     return (hit);
>  }
>  
> +/*
> + * Adjust the RTF_MPATH flag for the part of the rn_dupedkey chain
> + * that matches the prio.
> + */
>  void
> -rn_mpath_reprio(struct radix_node *rn, int newprio)
> +rn_mpath_adj_mpflag(struct radix_node *rn, u_int8_t prio)
>  {
> -     struct radix_node       *prev = rn->rn_p;
> -     struct radix_node       *next = rn->rn_dupedkey;
> -     struct radix_node       *t, *tt, *saved_tt, *head;
> -     struct rtentry          *rt = (struct rtentry *)rn;
> -     int                      mid, oldprio, prioinv = 0;
> +     struct rtentry *rt = (struct rtentry *)rn;
>  
> -     oldprio = rt->rt_priority;
> -     rt->rt_priority = newprio;
> -
> -     /* same prio, no change needed */
> -     if (oldprio == newprio)
> +     prio &= RTP_MASK;
> +     rt = rt_mpath_matchgate(rt, NULL, prio);
> +     rn = (struct radix_node *)rt;
> +     
> +     if (!rn)
>               return;
> -     if (rn_mpath_next(rn, 1) == NULL) {
> -             /* no need to move node, route is alone */
> -             if (prev->rn_mask != rn->rn_mask)
> -                     return;
> -             /* ... or route is last and prio gets bigger */
> -             if (oldprio < newprio)
> -                     return;
> -     }
> -
> -     /* remove node from dupedkey list and reinsert at correct place */
> -     if (prev->rn_dupedkey == rn) {
> -             prev->rn_dupedkey = next;
> -             if (next)
> -                     next->rn_p = prev;
> -             else
> -                     next = prev;
> -     } else {
> -             if (next == NULL)
> -                     panic("next == NULL");
> -             next->rn_p = prev;
> -             if (prev->rn_l == rn)
> -                     prev->rn_l = next;
> -             else
> -                     prev->rn_r = next;
> -     }
> -
> -     /* re-insert rn at the right spot, so first rewind to the head */
> -     for (tt = next; tt->rn_p->rn_dupedkey == tt; tt = tt->rn_p)
> -             ;
> -     saved_tt = tt;
> -
> -     /*
> -      * Stolen from radix.c rn_addroute().
> -      * This is nasty code with a certain amount of magic and dragons.
> -      * t is the element where the re-priorized rn is inserted -- before
> -      * or after depending on prioinv. saved_tt points to the head of the
> -      * dupedkey chain and tt is a bit of a helper
> -      *
> -      * First we skip with tt to the start of the mpath group then we
> -      * search the right spot to enter our node.
> -      */
> -     for (; tt; tt = tt->rn_dupedkey)
> -             if (rn->rn_mask == tt->rn_mask)
> -                     break;
> -     head = tt; /* store current head entry for rn_mklist check */
> -
> -     tt = rn_mpath_prio(tt, newprio);
> -     if (((struct rtentry *)tt)->rt_priority != newprio) {
> -             if (((struct rtentry *)tt)->rt_priority > newprio)
> -                     prioinv = 1;
> -             t = tt;
> -     } else {
> -             mid = rn_mpath_active_count(tt) / 2;
> -             do {
> -                     t = tt;
> -                     tt = rn_mpath_next(tt, 0);
> -             } while (tt && --mid > 0);
> -     }
> -
> -     /* insert rn before or after t depending on prioinv, tt and saved_tt */
> -     if (tt == saved_tt && prioinv) {
> -             /* link in at head of list */
> -             rn->rn_dupedkey = tt;
> -             rn->rn_p = tt->rn_p;
> -             tt->rn_p = rn;
> -             if (rn->rn_p->rn_l == tt)
> -                     rn->rn_p->rn_l = rn;
> -             else
> -                     rn->rn_p->rn_r = rn;
> -     } else if (prioinv == 1) {
> -             rn->rn_dupedkey = t;
> -             t->rn_p->rn_dupedkey = rn;
> -             rn->rn_p = t->rn_p;
> -             t->rn_p = rn;
> -     } else {
> -             rn->rn_dupedkey = t->rn_dupedkey;
> -             t->rn_dupedkey = rn;
> -             rn->rn_p = t;
> -             if (rn->rn_dupedkey)
> -                     rn->rn_dupedkey->rn_p = rn;
> -     }
> -
> -     if (rn->rn_mklist && rn->rn_flags & RNF_NORMAL) {
> -             /* the rn_mklist needs to be fixed if the best route changed */
> -             if (rn->rn_mklist->rm_leaf != rn) {
> -                     if (rn->rn_mklist->rm_leaf->rn_p == rn)
> -                             /* changed route is now best */
> -                             rn->rn_mklist->rm_leaf = rn;
> -             } else {
> -                     if (rn->rn_dupedkey != head)
> -                             /* rn moved behind head, so head is new head */
> -                             rn->rn_mklist->rm_leaf = head;
> +     if (rn_mpath_next(rn, RMP_MODE_BYPRIO)) {
> +             while (rn != NULL) {
> +                     ((struct rtentry *)rn)->rt_flags |= RTF_MPATH;
> +                     rn = rn_mpath_next(rn, RMP_MODE_BYPRIO);
>               }
> -     }
> +     } else
> +             rt->rt_flags &= ~RTF_MPATH;
>  }
>  
>  int
> @@ -226,11 +168,16 @@ rn_mpath_active_count(struct radix_node 
>       int i;
>  
>       i = 1;
> -     while ((rn = rn_mpath_next(rn, 0)) != NULL)
> +     while ((rn = rn_mpath_next(rn, RMP_MODE_ACTIVE)) != NULL)
>               i++;
>       return i;
>  }
>  
> +/*
> + * return best matching route based on gateway and prio. If both are
> + * specified it acts as a lookup function to get the actual rt.
> + * If gate is NULL the first node matching the prio will be returned.
> + */
>  struct rtentry *
>  rt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate, u_int8_t prio)
>  {
> @@ -243,20 +190,13 @@ rt_mpath_matchgate(struct rtentry *rt, s
>               if (prio != RTP_ANY &&
>                   (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
>                       continue;
> -             /*
> -              * if gate is set it must be compared, if not set the route
> -              * must be a non-multipath one.
> -              */
> -             if (!gate && !rn_mpath_next(rn, 0))
> -                     return rt;
> +             /* if no gate is set we found a match */
>               if (!gate)
> -                     return NULL;
> -             if (!rt->rt_gateway)
> -                     continue;
> +                     return rt;
>               if (rt->rt_gateway->sa_len == gate->sa_len &&
>                   !memcmp(rt->rt_gateway, gate, gate->sa_len))
>                       break;
> -     } while ((rn = rn_mpath_next(rn, 1)) != NULL);
> +     } while ((rn = rn_mpath_next(rn, RMP_MODE_ALL)) != NULL);
>  
>       return (struct rtentry *)rn;
>  }
> @@ -341,28 +281,105 @@ rt_mpath_conflict(struct radix_node_head
>       if (!mpathok && rt1->rt_priority == rt->rt_priority)
>               return EEXIST;
>  
> -     rn1 = rn_mpath_prio((struct radix_node *)rt1, rt->rt_priority);
>       /* key/mask were the same.  compare gateway for all multipaths */
> -     do {
> -             rt1 = (struct rtentry *)rn1;
> -
> -             /* sanity: no use in comparing the same thing */
> -             if (rn1 == rn)
> -                     continue;
> -
> -             if (rt1->rt_gateway->sa_len != rt->rt_gateway->sa_len ||
> -                 bcmp(rt1->rt_gateway, rt->rt_gateway,
> -                 rt1->rt_gateway->sa_len))
> -                     continue;
> -
> +     if (rt_mpath_matchgate(rt1, rt->rt_gateway, rt->rt_priority))
>               /* all key/mask/gateway are the same.  conflicting entry. */
>               return EEXIST;
> -     } while ((rn1 = rn_mpath_next(rn1, 0)) != NULL);
>  
>   different:
>       return 0;
>  }
>  
> +void
> +rn_mpath_reprio(struct radix_node *rn, int newprio)
> +{
> +     struct radix_node       *prev = rn->rn_p;
> +     struct radix_node       *next = rn->rn_dupedkey;
> +     struct radix_node       *t, *tt, *saved_tt, *head;
> +     struct rtentry          *rt = (struct rtentry *)rn;
> +     int                      mid, oldprio, prioinv = 0;
> +
> +     oldprio = rt->rt_priority;
> +     rt->rt_priority = newprio;
> +
> +     /* same prio, no change needed */
> +     if (oldprio == newprio)
> +             return;
> +     if (rn_mpath_next(rn, RMP_MODE_ALL) == NULL) {
> +             /* no need to move node, route is alone */
> +             if (prev->rn_mask != rn->rn_mask)
> +                     return;
> +             /* ... or route is last and prio gets bigger */
> +             if (oldprio < newprio)
> +                     return;
> +     }
> +
> +     /* remove node from dupedkey list and reinsert at correct place */
> +     if (prev->rn_dupedkey == rn) {
> +             prev->rn_dupedkey = next;
> +             if (next)
> +                     next->rn_p = prev;
> +             else
> +                     next = prev;
> +     } else {
> +             if (next == NULL)
> +                     panic("next == NULL");
> +             next->rn_p = prev;
> +             if (prev->rn_l == rn)
> +                     prev->rn_l = next;
> +             else
> +                     prev->rn_r = next;
> +     }
> +
> +     /* re-insert rn at the right spot, so first rewind to the head */
> +     for (tt = next; tt->rn_p->rn_dupedkey == tt; tt = tt->rn_p)
> +             ;
> +     saved_tt = tt;
> +
> +     /*
> +      * Stolen from radix.c rn_addroute().
> +      * This is nasty code with a certain amount of magic and dragons.
> +      * t is the element where the re-priorized rn is inserted -- before
> +      * or after depending on prioinv. saved_tt points to the head of the
> +      * dupedkey chain and tt is a bit of a helper
> +      *
> +      * First we skip with tt to the start of the mpath group then we
> +      * search the right spot to enter our node.
> +      */
> +     for (; tt; tt = tt->rn_dupedkey)
> +             if (rn->rn_mask == tt->rn_mask)
> +                     break;
> +     head = tt; /* store current head entry for rn_mklist check */
> +
> +     tt = rn_mpath_prio(tt, newprio);
> +     if (((struct rtentry *)tt)->rt_priority != newprio) {
> +             if (((struct rtentry *)tt)->rt_priority > newprio)
> +                     prioinv = 1;
> +             t = tt;
> +     } else {
> +             mid = rn_mpath_active_count(tt) / 2;
> +             do {
> +                     t = tt;
> +                     tt = rn_mpath_next(tt, RMP_MODE_ACTIVE);
> +             } while (tt && --mid > 0);
> +     }
> +
> +     /* insert rn before or after t depending on prioinv */
> +     rn_link_dupedkey(rn, t, prioinv);
> +
> +     /* the rn_mklist needs to be fixed if the best route changed */
> +     if (rn->rn_mklist && rn->rn_flags & RNF_NORMAL) {
> +             if (rn->rn_mklist->rm_leaf != rn) {
> +                     if (rn->rn_mklist->rm_leaf->rn_p == rn)
> +                             /* changed route is now best */
> +                             rn->rn_mklist->rm_leaf = rn;
> +             } else if (rn->rn_dupedkey != head) {
> +                     /* rn moved behind head, so head is new head */
> +                     rn->rn_mklist->rm_leaf = head;
> +             }
> +     }
> +}
> +
>  /*
>   * allocate a route, potentially using multipath to select the peer.
>   */
> @@ -405,13 +422,9 @@ rtalloc_mpath(struct route *ro, u_int32_
>       threshold = 1 + (0xffff / npaths);
>       while (hash > threshold && rn) {
>               /* stay within the multipath routes */
> -             if (rn_mpath_next(rn, 0) == NULL)
> -                     break;
> -             rn = rn->rn_dupedkey;
> +             rn = rn_mpath_next(rn, RMP_MODE_ACTIVE);
>               hash -= threshold;
>       }
> -
> -     /* XXX try filling rt_gwroute and avoid unreachable gw  */
>  
>       /* if gw selection fails, use the first match (default) */
>       if (!rn)
> Index: net/radix_mpath.h
> ===================================================================
> RCS file: /cvs/src/sys/net/radix_mpath.h,v
> retrieving revision 1.11
> diff -u -p -r1.11 radix_mpath.h
> --- net/radix_mpath.h 24 Oct 2013 18:50:16 -0000      1.11
> +++ net/radix_mpath.h 17 Apr 2014 19:30:10 -0000
> @@ -45,9 +45,13 @@ struct route;
>  struct rtentry;
>  struct sockaddr;
>  int  rn_mpath_capable(struct radix_node_head *);
> +#define RMP_MODE_ACTIVE      0
> +#define RMP_MODE_ALL 1
> +#define RMP_MODE_BYPRIO      2
>  struct radix_node *rn_mpath_next(struct radix_node *, int);
>  struct radix_node *rn_mpath_prio(struct radix_node *, u_int8_t);
>  void rn_mpath_reprio(struct radix_node *, int);
> +void rn_mpath_adj_mpflag(struct radix_node *, u_int8_t);
>  int  rn_mpath_active_count(struct radix_node *);
>  struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr *,
>           u_int8_t);
> Index: net/route.c
> ===================================================================
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.165
> diff -u -p -r1.165 route.c
> --- net/route.c       29 Apr 2014 11:58:29 -0000      1.165
> +++ net/route.c       5 May 2014 19:17:22 -0000
> @@ -312,7 +312,7 @@ void
>  rtalloc_noclone(struct route *ro)
>  {
>       if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
> -             return;                          /* XXX */
> +             return;         /* cached route is still valid */
>       ro->ro_rt = rtalloc1(&ro->ro_dst, RT_REPORT | RT_NOCLONING,
>           ro->ro_tableid);
>  }
> @@ -321,7 +321,7 @@ void
>  rtalloc(struct route *ro)
>  {
>       if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
> -             return;                          /* XXX */
> +             return;         /* cached route is still valid */
>       ro->ro_rt = rtalloc1(&ro->ro_dst, RT_REPORT, ro->ro_tableid);
>  }
>  
> @@ -773,7 +773,9 @@ rtrequest1(int req, struct rt_addrinfo *
>                       rt = rt_mpath_matchgate(rt,
>                           info->rti_info[RTAX_GATEWAY], prio);
>                       rn = (struct radix_node *)rt;
> -                     if (!rt)
> +                     if (!rt ||
> +                         (!info->rti_info[RTAX_GATEWAY] &&
> +                         rt->rt_flags & RTF_MPATH))
>                               senderr(ESRCH);
>               }
>  #endif
> @@ -799,15 +801,6 @@ rtrequest1(int req, struct rt_addrinfo *
>                       rt->rt_parent = NULL;
>               }
>  
> -#ifndef SMALL_KERNEL
> -             if (rn_mpath_capable(rnh)) {
> -                     if ((rn = rnh->rnh_lookup(info->rti_info[RTAX_DST],
> -                         info->rti_info[RTAX_NETMASK], rnh)) != NULL &&
> -                         rt_mpath_next((struct rtentry *)rn) == NULL)
> -                             ((struct rtentry *)rn)->rt_flags &= ~RTF_MPATH;
> -             }
> -#endif
> -
>               rt->rt_flags &= ~RTF_UP;
>               if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
>                       ifa->ifa_rtrequest(RTM_DELETE, rt);
> @@ -995,18 +988,6 @@ rtrequest1(int req, struct rt_addrinfo *
>                       senderr(EEXIST);
>               }
>  
> -#ifndef SMALL_KERNEL
> -             if (rn_mpath_capable(rnh) &&
> -                 (rn = rnh->rnh_lookup(info->rti_info[RTAX_DST],
> -                 info->rti_info[RTAX_NETMASK], rnh)) != NULL &&
> -                 (rn = rn_mpath_prio(rn, prio)) != NULL) {
> -                     if (rt_mpath_next((struct rtentry *)rn) == NULL)
> -                             ((struct rtentry *)rn)->rt_flags &= ~RTF_MPATH;
> -                     else
> -                             ((struct rtentry *)rn)->rt_flags |= RTF_MPATH;
> -             }
> -#endif
> -
>               if (ifa->ifa_rtrequest)
>                       ifa->ifa_rtrequest(req, rt);
>               if (ret_nrt) {
> @@ -1618,13 +1599,5 @@ rt_if_linkstate_change(struct radix_node
>       }
>  
>       return (0);
> -}
> -
> -struct rtentry *
> -rt_mpath_next(struct rtentry *rt)
> -{
> -     struct radix_node *rn = (struct radix_node *)rt;
> -
> -     return ((struct rtentry *)rn_mpath_next(rn, 0));
>  }
>  #endif
> Index: net/rtsock.c
> ===================================================================
> RCS file: /cvs/src/sys/net/rtsock.c,v
> retrieving revision 1.143
> diff -u -p -r1.143 rtsock.c
> --- net/rtsock.c      25 Apr 2014 10:41:09 -0000      1.143
> +++ net/rtsock.c      26 Apr 2014 11:47:34 -0000
> @@ -617,45 +617,23 @@ route_output(struct mbuf *m, ...)
>                *
>                * for RTM_GET, info.rti_info[RTAX_GATEWAY] is optional
>                * even with multipath.
> -              * if it is NULL the first match is returned (no need to
> -              * call rt_mpath_matchgate).
>                */
>               if (rn_mpath_capable(rnh)) {
> -                     /* first find correct priority bucket */
> -                     rn = rn_mpath_prio(rn, prio);
> -                     rt = (struct rtentry *)rn;
> -                     if (prio != RTP_ANY &&
> -                         (rt->rt_priority & RTP_MASK) != prio) {
> +                     rt = rt_mpath_matchgate(rt,
> +                         info.rti_info[RTAX_GATEWAY], prio);
> +                     if (!rt) {
>                               error = ESRCH;
> -                             rt->rt_refcnt++;
>                               goto flush;
>                       }
>  
> -                     /* if multipath routes */
> -                     if (rt_mpath_next(rt)) { /* XXX ignores down routes */
> -                             if (info.rti_info[RTAX_GATEWAY] != NULL) {
> -                                     rt = rt_mpath_matchgate(rt,
> -                                         info.rti_info[RTAX_GATEWAY], prio);
> -                             } else if (rtm->rtm_type != RTM_GET) {
> -                                     /*
> -                                      * only RTM_GET may use an empty
> -                                      * gateway  on multipath ...
> -                                      */
> -                                     rt = NULL;
> -                             }
> -                     } else if ((info.rti_info[RTAX_GATEWAY] != NULL) &&
> -                         (rtm->rtm_type == RTM_GET ||
> -                          rtm->rtm_type == RTM_LOCK)) {
> -                             /*
> -                              * ... but if a gateway is specified RTM_GET
> -                              * and RTM_LOCK must match the gateway no matter
> -                              * what.
> -                              */
> -                             rt = rt_mpath_matchgate(rt,
> -                                 info.rti_info[RTAX_GATEWAY], prio);
> -                     }
> -
> -                     if (!rt) {
> +                     /*
> +                      * only RTM_GET may use an empty gateway
> +                      * on multipath routes
> +                      */
> +                     if (!info.rti_info[RTAX_GATEWAY] &&
> +                         rt->rt_flags & RTF_MPATH &&
> +                         rtm->rtm_type != RTM_GET) {
> +                             rt = NULL;
>                               error = ESRCH;
>                               goto flush;
>                       }
> 

Reply via email to