I really like this. It will allow us to move forward. While I was for a
long time holding the opinion that static routes should not disapear from
the routing table because the interface address changes I came to the
conclusion that this is causing more harm and so it is better to remove
all routes that point to dead ifas. The only thing to consider is that we
may end up with a veritable RT message storm because of all the removes.
While this may happen I think it is not common and so I would not spend
too much energy into this and just accept that in the worst case all
userland tools will get a RTM_MISS message and will then resync the tree.
Without the ART bits this is OK claudio
On Sat, Sep 03, 2016 at 01:30:57PM +0200, Martin Pieuchot wrote:
> The next MP problem when using route entries are stale ifas.
>
> Stale ifas exist because we do not purge corresponding route entries
> when we remove an ifa. Instead we try to 'fixup' such routes when
> we found them later on. This happens in two places when cloning
> a route, inside rtalloc(9), or when adding an address, inside the
> ioctl() path.
>
> Because a CPU can change the 'rt->rt_ifa' pointer of a route it is not
> MP safe to dereference it inside rtisavalid(9).
>
> Since stale ifas are known the create other problems I don't think
> trying to keep this feature is worth it. So I'd like to purge route
> entries attached to an ifa when such ifa is removed from the system.
>
> After that if people still believe we should decouple the lifetime of
> route entries and addresses, then I'd be more than happy to offer help
> to make route not depend on ifas.
>
> For the moment I want to make progress on the MP side and enforcing pointer
> immutability is the easiest way to go.
>
> Diff attached get rid of stale ifa. Well almost. We're still calling
> ifafree() inside rtfree(9). I'd move it to rtrequest(9) and remove the
> refcounting later when we know this step is ok.
>
> I don't plan to commit the bits inside "#ifdef ART" below, unless you prefer
> to have them. I'm using db_show_trashedrt() from ddb(4) to see
> which routes are currently beeing cached but are invalid.
>
> I'm thinking about crafting a proper code for that since we will need
> something like that when route entries will be cached in pf states.
> Index: net/route.c
> ===================================================================
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.321
> diff -u -p -r1.321 route.c
> --- net/route.c 1 Sep 2016 15:40:38 -0000 1.321
> +++ net/route.c 3 Sep 2016 11:09:43 -0000
> @@ -148,6 +148,14 @@ extern unsigned int rtmap_limit;
>
> struct rtstat rtstat;
> int rttrash; /* routes not in table but not freed */
> +int ifatrash; /* ifas not in ifp list but not free */
> +
> +#ifdef ART
> +/* Keep track of referenced route entries that are no longer in the table. */
> +void noop(void *null, void *xrt){}
> +SRPL_HEAD(, rtentry) trashedrt;
> +struct srpl_rc trashedrt_rc = SRPL_RC_INITIALIZER(noop, noop,
> NULL);
> +#endif
>
> struct pool rtentry_pool; /* pool for rtentry structures */
> struct pool rttimer_pool; /* pool for rttimer structures */
> @@ -155,10 +163,9 @@ struct pool rttimer_pool; /* pool for r
> void rt_timer_init(void);
> int rt_setgwroute(struct rtentry *, u_int);
> void rt_putgwroute(struct rtentry *);
> -int rt_fixgwroute(struct rtentry *, void *, unsigned int);
> int rtflushclone1(struct rtentry *, void *, u_int);
> void rtflushclone(unsigned int, struct rtentry *);
> -int rt_if_remove_rtdelete(struct rtentry *, void *, u_int);
> +int rt_ifa_purge_walker(struct rtentry *, void *, unsigned int);
> struct rtentry *rt_match(struct sockaddr *, uint32_t *, int, unsigned int);
> struct sockaddr *rt_plentosa(sa_family_t, int, struct sockaddr_in6 *);
>
> @@ -210,10 +217,6 @@ rtisvalid(struct rtentry *rt)
> if (!ISSET(rt->rt_flags, RTF_UP))
> return (0);
>
> - /* Routes attached to stale ifas should be freed. */
> - if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
> - return (0);
> -
> if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
> KASSERT(rt->rt_gwroute != NULL);
> KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY));
> @@ -446,41 +449,6 @@ rt_putgwroute(struct rtentry *rt)
> rt->rt_gwroute = NULL;
> }
>
> -/*
> - * Refresh cached entries of RTF_GATEWAY routes for a given interface.
> - *
> - * This clever logic is necessary to try to fix routes linked to stale
> - * ifas.
> - */
> -int
> -rt_fixgwroute(struct rtentry *rt, void *arg, unsigned int id)
> -{
> - struct ifnet *ifp = arg;
> -
> - KERNEL_ASSERT_LOCKED();
> -
> - if (rt->rt_ifidx != ifp->if_index || !ISSET(rt->rt_flags, RTF_GATEWAY))
> - return (0);
> -
> - /*
> - * If the gateway route is not stale, its associated cached
> - * is also not stale.
> - */
> - if (rt->rt_ifa->ifa_ifp != NULL)
> - return (0);
> -
> - /* If we can fix the cached next hop entry, we can fix the ifa. */
> - if (rt_setgate(rt, rt->rt_gateway, ifp->if_rdomain) == 0) {
> - struct ifaddr *ifa = rt->rt_gwroute->rt_ifa;
> -
> - ifafree(rt->rt_ifa);
> - ifa->ifa_refcnt++;
> - rt->rt_ifa = ifa;
> - }
> -
> - return (0);
> -}
> -
> void
> rtref(struct rtentry *rt)
> {
> @@ -500,6 +468,10 @@ rtfree(struct rtentry *rt)
> KASSERT(!ISSET(rt->rt_flags, RTF_UP));
> KASSERT(!RT_ROOT(rt));
> atomic_dec_int(&rttrash);
> +#ifdef ART
> + SRPL_REMOVE_LOCKED(&trashedrt_rc, &trashedrt, rt, rtentry,
> + rt_next);
> +#endif
> if (refcnt < 0) {
> printf("rtfree: %p not freed (neg refs)\n", rt);
> return;
> @@ -551,9 +523,10 @@ ifafree(struct ifaddr *ifa)
> {
> if (ifa == NULL)
> panic("ifafree");
> - if (ifa->ifa_refcnt == 0)
> + if (ifa->ifa_refcnt == 0) {
> + ifatrash--;
> free(ifa, M_IFADDR, 0);
> - else
> + } else
> ifa->ifa_refcnt--;
> }
>
> @@ -728,7 +701,7 @@ rtflushclone1(struct rtentry *rt, void *
> /*
> * This happens when an interface with a RTF_CLONING route is
> * being detached. In this case it's safe to bail because all
> - * the routes are being purged by rt_if_remove().
> + * the routes are being purged by rt_ifa_purge().
> */
> if (ifp == NULL)
> return 0;
> @@ -939,6 +912,9 @@ rtrequest_delete(struct rt_addrinfo *inf
> }
>
> atomic_inc_int(&rttrash);
> +#ifdef ART
> + SRPL_INSERT_HEAD_LOCKED(&trashedrt_rc, &trashedrt, rt, rt_next);
> +#endif
>
> if (ret_nrt != NULL)
> *ret_nrt = rt;
> @@ -981,22 +957,8 @@ rtrequest(int req, struct rt_addrinfo *i
> return (EINVAL);
> if ((rt->rt_flags & RTF_CLONING) == 0)
> return (EINVAL);
> - if (rt->rt_ifa->ifa_ifp) {
> - info->rti_ifa = rt->rt_ifa;
> - } else {
> - /*
> - * The address of the cloning route is not longer
> - * configured on an interface, but its descriptor
> - * is still there because of reference counting.
> - *
> - * Try to find a similar active address and use
> - * it for the cloned route. The cloning route
> - * will get the new address and interface later.
> - */
> - info->rti_ifa = NULL;
> - info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
> - }
> -
> + KASSERT(rt->rt_ifa->ifa_ifp != NULL);
> + info->rti_ifa = rt->rt_ifa;
> info->rti_flags = rt->rt_flags | (RTF_CLONED|RTF_HOST);
> info->rti_flags &= ~(RTF_CLONING|RTF_CONNECTED|RTF_STATIC);
> info->rti_info[RTAX_GATEWAY] = sdltosa(&sa_dl);
> @@ -1089,29 +1051,6 @@ rtrequest(int req, struct rt_addrinfo *i
> rt->rt_ifidx = ifp->if_index;
> if (rt->rt_flags & RTF_CLONED) {
> /*
> - * If the ifa of the cloning route was stale, a
> - * successful lookup for an ifa with the same address
> - * has been made. Use this ifa also for the cloning
> - * route.
> - */
> - if ((*ret_nrt)->rt_ifa->ifa_ifp == NULL) {
> - struct ifnet *ifp0;
> -
> - printf("%s RTM_RESOLVE: wrong ifa (%p) was (%p)"
> - "\n", __func__, ifa, (*ret_nrt)->rt_ifa);
> -
> - ifp0 = if_get((*ret_nrt)->rt_ifidx);
> - KASSERT(ifp0 != NULL);
> - ifp0->if_rtrequest(ifp0, RTM_DELETE, *ret_nrt);
> - ifafree((*ret_nrt)->rt_ifa);
> - if_put(ifp0);
> -
> - ifa->ifa_refcnt++;
> - (*ret_nrt)->rt_ifa = ifa;
> - (*ret_nrt)->rt_ifidx = ifp->if_index;
> - ifp->if_rtrequest(ifp, RTM_ADD, *ret_nrt);
> - }
> - /*
> * Copy both metrics and a back pointer to the cloned
> * route's parent.
> */
> @@ -1280,8 +1219,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
>
> error = rtrequest(RTM_ADD, &info, prio, &rt, rtableid);
> if (error == 0) {
> - unsigned int i;
> -
> /*
> * A local route is created for every address configured
> * on an interface, so use this information to notify
> @@ -1291,18 +1228,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
> rt_sendaddrmsg(rt, RTM_NEWADDR, ifa);
> rt_sendmsg(rt, RTM_ADD, rtableid);
> rtfree(rt);
> -
> - /*
> - * Userland inserted routes stay in the table even
> - * if their corresponding ``ifa'' is no longer valid.
> - *
> - * Try to fix the stale RTF_GATEWAY entries in case
> - * their gateway match the newly inserted route.
> - */
> - for (i = 0; i <= RT_TABLEID_MAX; i++) {
> - rtable_walk(i, ifa->ifa_addr->sa_family,
> - rt_fixgwroute, ifp);
> - }
> }
> return (error);
> }
> @@ -1459,6 +1384,46 @@ rt_ifa_dellocal(struct ifaddr *ifa)
> }
>
> /*
> + * Remove all addresses attached to ``ifa''.
> + */
> +void
> +rt_ifa_purge(struct ifaddr *ifa)
> +{
> + struct ifnet *ifp = ifa->ifa_ifp;
> + unsigned int rtableid;
> + int i;
> +
> + KASSERT(ifp != NULL);
> +
> + for (rtableid = 0; rtableid < rtmap_limit; rtableid++) {
> + /* skip rtables that are not in the rdomain of the ifp */
> + if (rtable_l2(rtableid) != ifp->if_rdomain)
> + continue;
> + for (i = 1; i <= AF_MAX; i++) {
> + rtable_walk(rtableid, i, rt_ifa_purge_walker, ifa);
> + }
> + }
> +}
> +
> +int
> +rt_ifa_purge_walker(struct rtentry *rt, void *vifa, unsigned int rtableid)
> +{
> + struct ifaddr *ifa = vifa;
> + struct ifnet *ifp = ifa->ifa_ifp;
> + int error;
> +
> + if (rt->rt_ifa != ifa)
> + return (0);
> +
> + if ((error = rtdeletemsg(rt, ifp, rtableid))) {
> + return (error);
> + }
> +
> + return (EAGAIN);
> +
> +}
> +
> +/*
> * Route timer routines. These routes allow functions to be called
> * for various routes at any time. This is useful in supporting
> * path MTU discovery and redirect route deletion.
> @@ -1741,38 +1706,6 @@ rtlabel_unref(u_int16_t id)
> }
> }
>
> -void
> -rt_if_remove(struct ifnet *ifp)
> -{
> - int i;
> - u_int tid;
> -
> - for (tid = 0; tid < rtmap_limit; tid++) {
> - /* skip rtables that are not in the rdomain of the ifp */
> - if (rtable_l2(tid) != ifp->if_rdomain)
> - continue;
> - for (i = 1; i <= AF_MAX; i++) {
> - rtable_walk(tid, i, rt_if_remove_rtdelete, ifp);
> - }
> - }
> -}
> -
> -int
> -rt_if_remove_rtdelete(struct rtentry *rt, void *vifp, u_int id)
> -{
> - struct ifnet *ifp = vifp;
> - int error;
> -
> - if (rt->rt_ifidx != ifp->if_index)
> - return (0);
> -
> - if ((error = rtdeletemsg(rt, ifp, id)))
> - return (error);
> -
> - return (EAGAIN);
> -
> -}
> -
> #ifndef SMALL_KERNEL
> void
> rt_if_track(struct ifnet *ifp)
> @@ -1965,5 +1898,16 @@ db_show_arptab(void)
> db_printf("Route tree for AF_INET\n");
> rtable_walk(0, AF_INET, db_show_rtentry, NULL);
> return (0);
> +}
> +
> +void
> +db_show_trashedrt(void)
> +{
> +#ifdef ART
> + struct rtentry *rt;
> +
> + SRPL_FOREACH_LOCKED(rt, &trashedrt, rt_next)
> + db_show_rtentry(rt, NULL, 0);
> +#endif
> }
> #endif /* DDB */
> Index: net/route.h
> ===================================================================
> RCS file: /cvs/src/sys/net/route.h,v
> retrieving revision 1.144
> diff -u -p -r1.144 route.h
> --- net/route.h 31 Aug 2016 21:32:06 -0000 1.144
> +++ net/route.h 3 Sep 2016 10:14:17 -0000
> @@ -396,13 +396,13 @@ void rtfree(struct rtentry *);
> int rt_getifa(struct rt_addrinfo *, u_int);
> int rt_ifa_add(struct ifaddr *, int, struct sockaddr *);
> int rt_ifa_del(struct ifaddr *, int, struct sockaddr *);
> +void rt_ifa_purge(struct ifaddr *);
> int rt_ifa_addlocal(struct ifaddr *);
> int rt_ifa_dellocal(struct ifaddr *);
> int rtioctl(u_long, caddr_t, struct proc *);
> void rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *,
> struct rtentry **, unsigned int);
> int rtrequest(int, struct rt_addrinfo *, u_int8_t, struct rtentry **,
> u_int);
> -void rt_if_remove(struct ifnet *);
> #ifndef SMALL_KERNEL
> void rt_if_track(struct ifnet *);
> int rt_if_linkstate_change(struct rtentry *, void *, u_int);
> Index: net/if.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.440
> diff -u -p -r1.440 if.c
> --- net/if.c 3 Sep 2016 10:05:19 -0000 1.440
> +++ net/if.c 3 Sep 2016 10:12:44 -0000
> @@ -954,7 +954,6 @@ if_detach(struct ifnet *ifp)
> #ifdef INET6
> in6_ifdetach(ifp);
> #endif
> - rt_if_remove(ifp);
> #if NPF > 0
> pfi_detach_ifnet(ifp);
> #endif
> @@ -1951,7 +1950,6 @@ ifioctl(struct socket *so, u_long cmd, c
> #ifdef INET6
> in6_ifdetach(ifp);
> #endif
> - rt_if_remove(ifp);
> splx(s);
> }
>
> Index: netinet/in.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/in.c,v
> retrieving revision 1.128
> diff -u -p -r1.128 in.c
> --- netinet/in.c 13 Jun 2016 10:34:40 -0000 1.128
> +++ netinet/in.c 3 Sep 2016 10:38:05 -0000
> @@ -699,12 +699,14 @@ in_purgeaddr(struct ifaddr *ifa)
> {
> struct ifnet *ifp = ifa->ifa_ifp;
> struct in_ifaddr *ia = ifatoia(ifa);
> + extern int ifatrash;
>
> splsoftassert(IPL_SOFTNET);
>
> in_ifscrub(ifp, ia);
>
> rt_ifa_dellocal(&ia->ia_ifa);
> + rt_ifa_purge(&ia->ia_ifa);
> ifa_del(ifp, &ia->ia_ifa);
>
> if (ia->ia_allhosts != NULL) {
> @@ -712,6 +714,7 @@ in_purgeaddr(struct ifaddr *ifa)
> ia->ia_allhosts = NULL;
> }
>
> + ifatrash++;
> ia->ia_ifp = NULL;
> ifafree(&ia->ia_ifa);
> }
> Index: netinet6/in6.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/in6.c,v
> retrieving revision 1.191
> diff -u -p -r1.191 in6.c
> --- netinet6/in6.c 22 Aug 2016 10:33:22 -0000 1.191
> +++ netinet6/in6.c 3 Sep 2016 10:42:24 -0000
> @@ -895,12 +895,11 @@ void
> in6_unlink_ifa(struct in6_ifaddr *ia6, struct ifnet *ifp)
> {
> struct ifaddr *ifa = &ia6->ia_ifa;
> + extern int ifatrash;
> int plen;
>
> splsoftassert(IPL_SOFTNET);
>
> - ifa_del(ifp, ifa);
> -
> TAILQ_REMOVE(&in6_ifaddr, ia6, ia_list);
>
> /* Release the reference to the base prefix. */
> @@ -918,10 +917,11 @@ in6_unlink_ifa(struct in6_ifaddr *ia6, s
> ia6->ia6_ndpr = NULL;
> }
>
> - /*
> - * release another refcnt for the link from in6_ifaddr.
> - * Note that we should decrement the refcnt at least once for all *BSD.
> - */
> + rt_ifa_purge(ifa);
> + ifa_del(ifp, ifa);
> +
> + ifatrash++;
> + ia6->ia_ifp = NULL;
> ifafree(&ia6->ia_ifa);
> }
>
--
:wq Claudio