The next MP problem when using route entries are stale ifas.
Stale ifas exist because we do not purge corresponding route entries
when we remove an ifa. Instead we try to 'fixup' such routes when
we found them later on. This happens in two places when cloning
a route, inside rtalloc(9), or when adding an address, inside the
ioctl() path.
Because a CPU can change the 'rt->rt_ifa' pointer of a route it is not
MP safe to dereference it inside rtisavalid(9).
Since stale ifas are known the create other problems I don't think
trying to keep this feature is worth it. So I'd like to purge route
entries attached to an ifa when such ifa is removed from the system.
After that if people still believe we should decouple the lifetime of
route entries and addresses, then I'd be more than happy to offer help
to make route not depend on ifas.
For the moment I want to make progress on the MP side and enforcing
pointer immutability is the easiest way to go.
Diff attached get rid of stale ifa. Well almost. We're still calling
ifafree() inside rtfree(9). I'd move it to rtrequest(9) and remove the
refcounting later when we know this step is ok.
I don't plan to commit the bits inside "#ifdef ART" below, unless you
prefer to have them. I'm using db_show_trashedrt() from ddb(4) to see
which routes are currently beeing cached but are invalid.
I'm thinking about crafting a proper code for that since we will need
something like that when route entries will be cached in pf states.
Index: net/route.c
===================================================================
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.321
diff -u -p -r1.321 route.c
--- net/route.c 1 Sep 2016 15:40:38 -0000 1.321
+++ net/route.c 3 Sep 2016 11:09:43 -0000
@@ -148,6 +148,14 @@ extern unsigned int rtmap_limit;
struct rtstat rtstat;
int rttrash; /* routes not in table but not freed */
+int ifatrash; /* ifas not in ifp list but not free */
+
+#ifdef ART
+/* Keep track of referenced route entries that are no longer in the table. */
+void noop(void *null, void *xrt){}
+SRPL_HEAD(, rtentry) trashedrt;
+struct srpl_rc trashedrt_rc = SRPL_RC_INITIALIZER(noop, noop, NULL);
+#endif
struct pool rtentry_pool; /* pool for rtentry structures */
struct pool rttimer_pool; /* pool for rttimer structures */
@@ -155,10 +163,9 @@ struct pool rttimer_pool; /* pool for r
void rt_timer_init(void);
int rt_setgwroute(struct rtentry *, u_int);
void rt_putgwroute(struct rtentry *);
-int rt_fixgwroute(struct rtentry *, void *, unsigned int);
int rtflushclone1(struct rtentry *, void *, u_int);
void rtflushclone(unsigned int, struct rtentry *);
-int rt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+int rt_ifa_purge_walker(struct rtentry *, void *, unsigned int);
struct rtentry *rt_match(struct sockaddr *, uint32_t *, int, unsigned int);
struct sockaddr *rt_plentosa(sa_family_t, int, struct sockaddr_in6 *);
@@ -210,10 +217,6 @@ rtisvalid(struct rtentry *rt)
if (!ISSET(rt->rt_flags, RTF_UP))
return (0);
- /* Routes attached to stale ifas should be freed. */
- if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
- return (0);
-
if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
KASSERT(rt->rt_gwroute != NULL);
KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY));
@@ -446,41 +449,6 @@ rt_putgwroute(struct rtentry *rt)
rt->rt_gwroute = NULL;
}
-/*
- * Refresh cached entries of RTF_GATEWAY routes for a given interface.
- *
- * This clever logic is necessary to try to fix routes linked to stale
- * ifas.
- */
-int
-rt_fixgwroute(struct rtentry *rt, void *arg, unsigned int id)
-{
- struct ifnet *ifp = arg;
-
- KERNEL_ASSERT_LOCKED();
-
- if (rt->rt_ifidx != ifp->if_index || !ISSET(rt->rt_flags, RTF_GATEWAY))
- return (0);
-
- /*
- * If the gateway route is not stale, its associated cached
- * is also not stale.
- */
- if (rt->rt_ifa->ifa_ifp != NULL)
- return (0);
-
- /* If we can fix the cached next hop entry, we can fix the ifa. */
- if (rt_setgate(rt, rt->rt_gateway, ifp->if_rdomain) == 0) {
- struct ifaddr *ifa = rt->rt_gwroute->rt_ifa;
-
- ifafree(rt->rt_ifa);
- ifa->ifa_refcnt++;
- rt->rt_ifa = ifa;
- }
-
- return (0);
-}
-
void
rtref(struct rtentry *rt)
{
@@ -500,6 +468,10 @@ rtfree(struct rtentry *rt)
KASSERT(!ISSET(rt->rt_flags, RTF_UP));
KASSERT(!RT_ROOT(rt));
atomic_dec_int(&rttrash);
+#ifdef ART
+ SRPL_REMOVE_LOCKED(&trashedrt_rc, &trashedrt, rt, rtentry,
+ rt_next);
+#endif
if (refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
return;
@@ -551,9 +523,10 @@ ifafree(struct ifaddr *ifa)
{
if (ifa == NULL)
panic("ifafree");
- if (ifa->ifa_refcnt == 0)
+ if (ifa->ifa_refcnt == 0) {
+ ifatrash--;
free(ifa, M_IFADDR, 0);
- else
+ } else
ifa->ifa_refcnt--;
}
@@ -728,7 +701,7 @@ rtflushclone1(struct rtentry *rt, void *
/*
* This happens when an interface with a RTF_CLONING route is
* being detached. In this case it's safe to bail because all
- * the routes are being purged by rt_if_remove().
+ * the routes are being purged by rt_ifa_purge().
*/
if (ifp == NULL)
return 0;
@@ -939,6 +912,9 @@ rtrequest_delete(struct rt_addrinfo *inf
}
atomic_inc_int(&rttrash);
+#ifdef ART
+ SRPL_INSERT_HEAD_LOCKED(&trashedrt_rc, &trashedrt, rt, rt_next);
+#endif
if (ret_nrt != NULL)
*ret_nrt = rt;
@@ -981,22 +957,8 @@ rtrequest(int req, struct rt_addrinfo *i
return (EINVAL);
if ((rt->rt_flags & RTF_CLONING) == 0)
return (EINVAL);
- if (rt->rt_ifa->ifa_ifp) {
- info->rti_ifa = rt->rt_ifa;
- } else {
- /*
- * The address of the cloning route is not longer
- * configured on an interface, but its descriptor
- * is still there because of reference counting.
- *
- * Try to find a similar active address and use
- * it for the cloned route. The cloning route
- * will get the new address and interface later.
- */
- info->rti_ifa = NULL;
- info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
- }
-
+ KASSERT(rt->rt_ifa->ifa_ifp != NULL);
+ info->rti_ifa = rt->rt_ifa;
info->rti_flags = rt->rt_flags | (RTF_CLONED|RTF_HOST);
info->rti_flags &= ~(RTF_CLONING|RTF_CONNECTED|RTF_STATIC);
info->rti_info[RTAX_GATEWAY] = sdltosa(&sa_dl);
@@ -1089,29 +1051,6 @@ rtrequest(int req, struct rt_addrinfo *i
rt->rt_ifidx = ifp->if_index;
if (rt->rt_flags & RTF_CLONED) {
/*
- * If the ifa of the cloning route was stale, a
- * successful lookup for an ifa with the same address
- * has been made. Use this ifa also for the cloning
- * route.
- */
- if ((*ret_nrt)->rt_ifa->ifa_ifp == NULL) {
- struct ifnet *ifp0;
-
- printf("%s RTM_RESOLVE: wrong ifa (%p) was (%p)"
- "\n", __func__, ifa, (*ret_nrt)->rt_ifa);
-
- ifp0 = if_get((*ret_nrt)->rt_ifidx);
- KASSERT(ifp0 != NULL);
- ifp0->if_rtrequest(ifp0, RTM_DELETE, *ret_nrt);
- ifafree((*ret_nrt)->rt_ifa);
- if_put(ifp0);
-
- ifa->ifa_refcnt++;
- (*ret_nrt)->rt_ifa = ifa;
- (*ret_nrt)->rt_ifidx = ifp->if_index;
- ifp->if_rtrequest(ifp, RTM_ADD, *ret_nrt);
- }
- /*
* Copy both metrics and a back pointer to the cloned
* route's parent.
*/
@@ -1280,8 +1219,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
error = rtrequest(RTM_ADD, &info, prio, &rt, rtableid);
if (error == 0) {
- unsigned int i;
-
/*
* A local route is created for every address configured
* on an interface, so use this information to notify
@@ -1291,18 +1228,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
rt_sendaddrmsg(rt, RTM_NEWADDR, ifa);
rt_sendmsg(rt, RTM_ADD, rtableid);
rtfree(rt);
-
- /*
- * Userland inserted routes stay in the table even
- * if their corresponding ``ifa'' is no longer valid.
- *
- * Try to fix the stale RTF_GATEWAY entries in case
- * their gateway match the newly inserted route.
- */
- for (i = 0; i <= RT_TABLEID_MAX; i++) {
- rtable_walk(i, ifa->ifa_addr->sa_family,
- rt_fixgwroute, ifp);
- }
}
return (error);
}
@@ -1459,6 +1384,46 @@ rt_ifa_dellocal(struct ifaddr *ifa)
}
/*
+ * Remove all addresses attached to ``ifa''.
+ */
+void
+rt_ifa_purge(struct ifaddr *ifa)
+{
+ struct ifnet *ifp = ifa->ifa_ifp;
+ unsigned int rtableid;
+ int i;
+
+ KASSERT(ifp != NULL);
+
+ for (rtableid = 0; rtableid < rtmap_limit; rtableid++) {
+ /* skip rtables that are not in the rdomain of the ifp */
+ if (rtable_l2(rtableid) != ifp->if_rdomain)
+ continue;
+ for (i = 1; i <= AF_MAX; i++) {
+ rtable_walk(rtableid, i, rt_ifa_purge_walker, ifa);
+ }
+ }
+}
+
+int
+rt_ifa_purge_walker(struct rtentry *rt, void *vifa, unsigned int rtableid)
+{
+ struct ifaddr *ifa = vifa;
+ struct ifnet *ifp = ifa->ifa_ifp;
+ int error;
+
+ if (rt->rt_ifa != ifa)
+ return (0);
+
+ if ((error = rtdeletemsg(rt, ifp, rtableid))) {
+ return (error);
+ }
+
+ return (EAGAIN);
+
+}
+
+/*
* Route timer routines. These routes allow functions to be called
* for various routes at any time. This is useful in supporting
* path MTU discovery and redirect route deletion.
@@ -1741,38 +1706,6 @@ rtlabel_unref(u_int16_t id)
}
}
-void
-rt_if_remove(struct ifnet *ifp)
-{
- int i;
- u_int tid;
-
- for (tid = 0; tid < rtmap_limit; tid++) {
- /* skip rtables that are not in the rdomain of the ifp */
- if (rtable_l2(tid) != ifp->if_rdomain)
- continue;
- for (i = 1; i <= AF_MAX; i++) {
- rtable_walk(tid, i, rt_if_remove_rtdelete, ifp);
- }
- }
-}
-
-int
-rt_if_remove_rtdelete(struct rtentry *rt, void *vifp, u_int id)
-{
- struct ifnet *ifp = vifp;
- int error;
-
- if (rt->rt_ifidx != ifp->if_index)
- return (0);
-
- if ((error = rtdeletemsg(rt, ifp, id)))
- return (error);
-
- return (EAGAIN);
-
-}
-
#ifndef SMALL_KERNEL
void
rt_if_track(struct ifnet *ifp)
@@ -1965,5 +1898,16 @@ db_show_arptab(void)
db_printf("Route tree for AF_INET\n");
rtable_walk(0, AF_INET, db_show_rtentry, NULL);
return (0);
+}
+
+void
+db_show_trashedrt(void)
+{
+#ifdef ART
+ struct rtentry *rt;
+
+ SRPL_FOREACH_LOCKED(rt, &trashedrt, rt_next)
+ db_show_rtentry(rt, NULL, 0);
+#endif
}
#endif /* DDB */
Index: net/route.h
===================================================================
RCS file: /cvs/src/sys/net/route.h,v
retrieving revision 1.144
diff -u -p -r1.144 route.h
--- net/route.h 31 Aug 2016 21:32:06 -0000 1.144
+++ net/route.h 3 Sep 2016 10:14:17 -0000
@@ -396,13 +396,13 @@ void rtfree(struct rtentry *);
int rt_getifa(struct rt_addrinfo *, u_int);
int rt_ifa_add(struct ifaddr *, int, struct sockaddr *);
int rt_ifa_del(struct ifaddr *, int, struct sockaddr *);
+void rt_ifa_purge(struct ifaddr *);
int rt_ifa_addlocal(struct ifaddr *);
int rt_ifa_dellocal(struct ifaddr *);
int rtioctl(u_long, caddr_t, struct proc *);
void rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, struct rtentry **, unsigned int);
int rtrequest(int, struct rt_addrinfo *, u_int8_t, struct rtentry **,
u_int);
-void rt_if_remove(struct ifnet *);
#ifndef SMALL_KERNEL
void rt_if_track(struct ifnet *);
int rt_if_linkstate_change(struct rtentry *, void *, u_int);
Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.440
diff -u -p -r1.440 if.c
--- net/if.c 3 Sep 2016 10:05:19 -0000 1.440
+++ net/if.c 3 Sep 2016 10:12:44 -0000
@@ -954,7 +954,6 @@ if_detach(struct ifnet *ifp)
#ifdef INET6
in6_ifdetach(ifp);
#endif
- rt_if_remove(ifp);
#if NPF > 0
pfi_detach_ifnet(ifp);
#endif
@@ -1951,7 +1950,6 @@ ifioctl(struct socket *so, u_long cmd, c
#ifdef INET6
in6_ifdetach(ifp);
#endif
- rt_if_remove(ifp);
splx(s);
}
Index: netinet/in.c
===================================================================
RCS file: /cvs/src/sys/netinet/in.c,v
retrieving revision 1.128
diff -u -p -r1.128 in.c
--- netinet/in.c 13 Jun 2016 10:34:40 -0000 1.128
+++ netinet/in.c 3 Sep 2016 10:38:05 -0000
@@ -699,12 +699,14 @@ in_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in_ifaddr *ia = ifatoia(ifa);
+ extern int ifatrash;
splsoftassert(IPL_SOFTNET);
in_ifscrub(ifp, ia);
rt_ifa_dellocal(&ia->ia_ifa);
+ rt_ifa_purge(&ia->ia_ifa);
ifa_del(ifp, &ia->ia_ifa);
if (ia->ia_allhosts != NULL) {
@@ -712,6 +714,7 @@ in_purgeaddr(struct ifaddr *ifa)
ia->ia_allhosts = NULL;
}
+ ifatrash++;
ia->ia_ifp = NULL;
ifafree(&ia->ia_ifa);
}
Index: netinet6/in6.c
===================================================================
RCS file: /cvs/src/sys/netinet6/in6.c,v
retrieving revision 1.191
diff -u -p -r1.191 in6.c
--- netinet6/in6.c 22 Aug 2016 10:33:22 -0000 1.191
+++ netinet6/in6.c 3 Sep 2016 10:42:24 -0000
@@ -895,12 +895,11 @@ void
in6_unlink_ifa(struct in6_ifaddr *ia6, struct ifnet *ifp)
{
struct ifaddr *ifa = &ia6->ia_ifa;
+ extern int ifatrash;
int plen;
splsoftassert(IPL_SOFTNET);
- ifa_del(ifp, ifa);
-
TAILQ_REMOVE(&in6_ifaddr, ia6, ia_list);
/* Release the reference to the base prefix. */
@@ -918,10 +917,11 @@ in6_unlink_ifa(struct in6_ifaddr *ia6, s
ia6->ia6_ndpr = NULL;
}
- /*
- * release another refcnt for the link from in6_ifaddr.
- * Note that we should decrement the refcnt at least once for all *BSD.
- */
+ rt_ifa_purge(ifa);
+ ifa_del(ifp, ifa);
+
+ ifatrash++;
+ ia6->ia_ifp = NULL;
ifafree(&ia6->ia_ifa);
}