The next MP problem when using route entries are stale ifas.

Stale ifas exist because we do not purge corresponding route entries
when we remove an ifa. Instead we try to 'fixup' such routes when
we found them later on.  This happens in two places when cloning
a route, inside rtalloc(9), or when adding an address, inside the
ioctl() path.

Because a CPU can change the 'rt->rt_ifa' pointer of a route it is not
MP safe to dereference it inside rtisavalid(9).

Since stale ifas are known the create other problems I don't think
trying to keep this feature is worth it.  So I'd like to purge route
entries attached to an ifa when such ifa is removed from the system.

After that if people still believe we should decouple the lifetime of
route entries and addresses, then I'd be more than happy to offer help
to make route not depend on ifas.

For the moment I want to make progress on the MP side and enforcing pointer immutability is the easiest way to go.

Diff attached get rid of stale ifa.  Well almost.  We're still calling
ifafree() inside rtfree(9). I'd move it to rtrequest(9) and remove the refcounting later when we know this step is ok.

I don't plan to commit the bits inside "#ifdef ART" below, unless you prefer to have them. I'm using db_show_trashedrt() from ddb(4) to see
which routes are currently beeing cached but are invalid.

I'm thinking about crafting a proper code for that since we will need
something like that when route entries will be cached in pf states.
Index: net/route.c
===================================================================
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.321
diff -u -p -r1.321 route.c
--- net/route.c	1 Sep 2016 15:40:38 -0000	1.321
+++ net/route.c	3 Sep 2016 11:09:43 -0000
@@ -148,6 +148,14 @@ extern unsigned int	rtmap_limit;
 
 struct rtstat		rtstat;
 int			rttrash;	/* routes not in table but not freed */
+int			ifatrash;	/* ifas not in ifp list but not free */
+
+#ifdef ART
+/* Keep track of referenced route entries that are no longer in the table. */
+void noop(void *null, void *xrt){}
+SRPL_HEAD(, rtentry)	trashedrt;
+struct srpl_rc		trashedrt_rc = SRPL_RC_INITIALIZER(noop, noop, NULL);
+#endif
 
 struct pool		rtentry_pool;	/* pool for rtentry structures */
 struct pool		rttimer_pool;	/* pool for rttimer structures */
@@ -155,10 +163,9 @@ struct pool		rttimer_pool;	/* pool for r
 void	rt_timer_init(void);
 int	rt_setgwroute(struct rtentry *, u_int);
 void	rt_putgwroute(struct rtentry *);
-int	rt_fixgwroute(struct rtentry *, void *, unsigned int);
 int	rtflushclone1(struct rtentry *, void *, u_int);
 void	rtflushclone(unsigned int, struct rtentry *);
-int	rt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+int	rt_ifa_purge_walker(struct rtentry *, void *, unsigned int);
 struct rtentry *rt_match(struct sockaddr *, uint32_t *, int, unsigned int);
 struct sockaddr *rt_plentosa(sa_family_t, int, struct sockaddr_in6 *);
 
@@ -210,10 +217,6 @@ rtisvalid(struct rtentry *rt)
 	if (!ISSET(rt->rt_flags, RTF_UP))
 		return (0);
 
-	/* Routes attached to stale ifas should be freed. */
-	if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
-		return (0);
-
 	if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
 		KASSERT(rt->rt_gwroute != NULL);
 	    	KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY));
@@ -446,41 +449,6 @@ rt_putgwroute(struct rtentry *rt)
 	rt->rt_gwroute = NULL;
 }
 
-/*
- * Refresh cached entries of RTF_GATEWAY routes for a given interface.
- *
- * This clever logic is necessary to try to fix routes linked to stale
- * ifas.
- */
-int
-rt_fixgwroute(struct rtentry *rt, void *arg, unsigned int id)
-{
-	struct ifnet *ifp = arg;
-
-	KERNEL_ASSERT_LOCKED();
-
-	if (rt->rt_ifidx != ifp->if_index || !ISSET(rt->rt_flags, RTF_GATEWAY))
-		return (0);
-
-	/*
-	 * If the gateway route is not stale, its associated cached
-	 * is also not stale.
-	 */
-	if (rt->rt_ifa->ifa_ifp != NULL)
-		return (0);
-
-	/* If we can fix the cached next hop entry, we can fix the ifa. */
-	if (rt_setgate(rt, rt->rt_gateway, ifp->if_rdomain) == 0) {
-		struct ifaddr *ifa = rt->rt_gwroute->rt_ifa;
-
-		ifafree(rt->rt_ifa);
-		ifa->ifa_refcnt++;
-		rt->rt_ifa = ifa;
-	}
-
-	return (0);
-}
-
 void
 rtref(struct rtentry *rt)
 {
@@ -500,6 +468,10 @@ rtfree(struct rtentry *rt)
 		KASSERT(!ISSET(rt->rt_flags, RTF_UP));
 		KASSERT(!RT_ROOT(rt));
 		atomic_dec_int(&rttrash);
+#ifdef ART
+		SRPL_REMOVE_LOCKED(&trashedrt_rc, &trashedrt, rt, rtentry,
+		    rt_next);
+#endif
 		if (refcnt < 0) {
 			printf("rtfree: %p not freed (neg refs)\n", rt);
 			return;
@@ -551,9 +523,10 @@ ifafree(struct ifaddr *ifa)
 {
 	if (ifa == NULL)
 		panic("ifafree");
-	if (ifa->ifa_refcnt == 0)
+	if (ifa->ifa_refcnt == 0) {
+		ifatrash--;
 		free(ifa, M_IFADDR, 0);
-	else
+	} else
 		ifa->ifa_refcnt--;
 }
 
@@ -728,7 +701,7 @@ rtflushclone1(struct rtentry *rt, void *
 	/*
 	 * This happens when an interface with a RTF_CLONING route is
 	 * being detached.  In this case it's safe to bail because all
-	 * the routes are being purged by rt_if_remove().
+	 * the routes are being purged by rt_ifa_purge().
 	 */
 	if (ifp == NULL)
 	        return 0;
@@ -939,6 +912,9 @@ rtrequest_delete(struct rt_addrinfo *inf
 	}
 
 	atomic_inc_int(&rttrash);
+#ifdef ART
+	SRPL_INSERT_HEAD_LOCKED(&trashedrt_rc, &trashedrt, rt, rt_next);
+#endif
 
 	if (ret_nrt != NULL)
 		*ret_nrt = rt;
@@ -981,22 +957,8 @@ rtrequest(int req, struct rt_addrinfo *i
 			return (EINVAL);
 		if ((rt->rt_flags & RTF_CLONING) == 0)
 			return (EINVAL);
-		if (rt->rt_ifa->ifa_ifp) {
-			info->rti_ifa = rt->rt_ifa;
-		} else {
-			/*
-			 * The address of the cloning route is not longer
-			 * configured on an interface, but its descriptor
-			 * is still there because of reference counting.
-			 *
-			 * Try to find a similar active address and use
-			 * it for the cloned route.  The cloning route
-			 * will get the new address and interface later.
-			 */
-			info->rti_ifa = NULL;
-			info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
-		}
-
+		KASSERT(rt->rt_ifa->ifa_ifp != NULL);
+		info->rti_ifa = rt->rt_ifa;
 		info->rti_flags = rt->rt_flags | (RTF_CLONED|RTF_HOST);
 		info->rti_flags &= ~(RTF_CLONING|RTF_CONNECTED|RTF_STATIC);
 		info->rti_info[RTAX_GATEWAY] = sdltosa(&sa_dl);
@@ -1089,29 +1051,6 @@ rtrequest(int req, struct rt_addrinfo *i
 		rt->rt_ifidx = ifp->if_index;
 		if (rt->rt_flags & RTF_CLONED) {
 			/*
-			 * If the ifa of the cloning route was stale, a
-			 * successful lookup for an ifa with the same address
-			 * has been made.  Use this ifa also for the cloning
-			 * route.
-			 */
-			if ((*ret_nrt)->rt_ifa->ifa_ifp == NULL) {
-				struct ifnet *ifp0;
-
-				printf("%s RTM_RESOLVE: wrong ifa (%p) was (%p)"
-				    "\n", __func__, ifa, (*ret_nrt)->rt_ifa);
-
-				ifp0 = if_get((*ret_nrt)->rt_ifidx);
-				KASSERT(ifp0 != NULL);
-				ifp0->if_rtrequest(ifp0, RTM_DELETE, *ret_nrt);
-				ifafree((*ret_nrt)->rt_ifa);
-				if_put(ifp0);
-
-				ifa->ifa_refcnt++;
-				(*ret_nrt)->rt_ifa = ifa;
-				(*ret_nrt)->rt_ifidx = ifp->if_index;
-				ifp->if_rtrequest(ifp, RTM_ADD, *ret_nrt);
-			}
-			/*
 			 * Copy both metrics and a back pointer to the cloned
 			 * route's parent.
 			 */
@@ -1280,8 +1219,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
 
 	error = rtrequest(RTM_ADD, &info, prio, &rt, rtableid);
 	if (error == 0) {
-		unsigned int i;
-
 		/*
 		 * A local route is created for every address configured
 		 * on an interface, so use this information to notify
@@ -1291,18 +1228,6 @@ rt_ifa_add(struct ifaddr *ifa, int flags
 			rt_sendaddrmsg(rt, RTM_NEWADDR, ifa);
 		rt_sendmsg(rt, RTM_ADD, rtableid);
 		rtfree(rt);
-
-		/*
-		 * Userland inserted routes stay in the table even
-		 * if their corresponding ``ifa'' is no longer valid.
-		 *
-		 * Try to fix the stale RTF_GATEWAY entries in case
-		 * their gateway match the newly inserted route.
-		 */
-		for (i = 0; i <= RT_TABLEID_MAX; i++) {
-			rtable_walk(i, ifa->ifa_addr->sa_family,
-			    rt_fixgwroute, ifp);
-		}
 	}
 	return (error);
 }
@@ -1459,6 +1384,46 @@ rt_ifa_dellocal(struct ifaddr *ifa)
 }
 
 /*
+ * Remove all addresses attached to ``ifa''.
+ */
+void
+rt_ifa_purge(struct ifaddr *ifa)
+{
+	struct ifnet		*ifp = ifa->ifa_ifp;
+	unsigned int		 rtableid;
+	int			 i;
+
+	KASSERT(ifp != NULL);
+
+	for (rtableid = 0; rtableid < rtmap_limit; rtableid++) {
+		/* skip rtables that are not in the rdomain of the ifp */
+		if (rtable_l2(rtableid) != ifp->if_rdomain)
+			continue;
+		for (i = 1; i <= AF_MAX; i++) {
+			rtable_walk(rtableid, i, rt_ifa_purge_walker, ifa);
+		}
+	}
+}
+
+int
+rt_ifa_purge_walker(struct rtentry *rt, void *vifa, unsigned int rtableid)
+{
+	struct ifaddr		*ifa = vifa;
+	struct ifnet		*ifp = ifa->ifa_ifp;
+	int			 error;
+
+	if (rt->rt_ifa != ifa)
+		return (0);
+
+	if ((error = rtdeletemsg(rt, ifp, rtableid))) {
+		return (error);
+	}
+
+	return (EAGAIN);
+
+}
+
+/*
  * Route timer routines.  These routes allow functions to be called
  * for various routes at any time.  This is useful in supporting
  * path MTU discovery and redirect route deletion.
@@ -1741,38 +1706,6 @@ rtlabel_unref(u_int16_t id)
 	}
 }
 
-void
-rt_if_remove(struct ifnet *ifp)
-{
-	int			 i;
-	u_int			 tid;
-
-	for (tid = 0; tid < rtmap_limit; tid++) {
-		/* skip rtables that are not in the rdomain of the ifp */
-		if (rtable_l2(tid) != ifp->if_rdomain)
-			continue;
-		for (i = 1; i <= AF_MAX; i++) {
-			rtable_walk(tid, i, rt_if_remove_rtdelete, ifp);
-		}
-	}
-}
-
-int
-rt_if_remove_rtdelete(struct rtentry *rt, void *vifp, u_int id)
-{
-	struct ifnet	*ifp = vifp;
-	int		 error;
-
-	if (rt->rt_ifidx != ifp->if_index)
-		return (0);
-
-	if ((error = rtdeletemsg(rt, ifp, id)))
-		return (error);
-
-	return (EAGAIN);
-
-}
-
 #ifndef SMALL_KERNEL
 void
 rt_if_track(struct ifnet *ifp)
@@ -1965,5 +1898,16 @@ db_show_arptab(void)
 	db_printf("Route tree for AF_INET\n");
 	rtable_walk(0, AF_INET, db_show_rtentry, NULL);
 	return (0);
+}
+
+void
+db_show_trashedrt(void)
+{
+#ifdef ART
+	struct rtentry *rt;
+
+	SRPL_FOREACH_LOCKED(rt, &trashedrt, rt_next)
+		db_show_rtentry(rt, NULL, 0);
+#endif
 }
 #endif /* DDB */
Index: net/route.h
===================================================================
RCS file: /cvs/src/sys/net/route.h,v
retrieving revision 1.144
diff -u -p -r1.144 route.h
--- net/route.h	31 Aug 2016 21:32:06 -0000	1.144
+++ net/route.h	3 Sep 2016 10:14:17 -0000
@@ -396,13 +396,13 @@ void	 rtfree(struct rtentry *);
 int	 rt_getifa(struct rt_addrinfo *, u_int);
 int	 rt_ifa_add(struct ifaddr *, int, struct sockaddr *);
 int	 rt_ifa_del(struct ifaddr *, int, struct sockaddr *);
+void	 rt_ifa_purge(struct ifaddr *);
 int	 rt_ifa_addlocal(struct ifaddr *);
 int	 rt_ifa_dellocal(struct ifaddr *);
 int	 rtioctl(u_long, caddr_t, struct proc *);
 void	 rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, struct rtentry **, unsigned int);
 int	 rtrequest(int, struct rt_addrinfo *, u_int8_t, struct rtentry **,
 	     u_int);
-void	 rt_if_remove(struct ifnet *);
 #ifndef SMALL_KERNEL
 void	 rt_if_track(struct ifnet *);
 int	 rt_if_linkstate_change(struct rtentry *, void *, u_int);
Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.440
diff -u -p -r1.440 if.c
--- net/if.c	3 Sep 2016 10:05:19 -0000	1.440
+++ net/if.c	3 Sep 2016 10:12:44 -0000
@@ -954,7 +954,6 @@ if_detach(struct ifnet *ifp)
 #ifdef INET6
 	in6_ifdetach(ifp);
 #endif
-	rt_if_remove(ifp);
 #if NPF > 0
 	pfi_detach_ifnet(ifp);
 #endif
@@ -1951,7 +1950,6 @@ ifioctl(struct socket *so, u_long cmd, c
 #ifdef INET6
 			in6_ifdetach(ifp);
 #endif
-			rt_if_remove(ifp);
 			splx(s);
 		}
 
Index: netinet/in.c
===================================================================
RCS file: /cvs/src/sys/netinet/in.c,v
retrieving revision 1.128
diff -u -p -r1.128 in.c
--- netinet/in.c	13 Jun 2016 10:34:40 -0000	1.128
+++ netinet/in.c	3 Sep 2016 10:38:05 -0000
@@ -699,12 +699,14 @@ in_purgeaddr(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct in_ifaddr *ia = ifatoia(ifa);
+	extern int ifatrash;
 
 	splsoftassert(IPL_SOFTNET);
 
 	in_ifscrub(ifp, ia);
 
 	rt_ifa_dellocal(&ia->ia_ifa);
+	rt_ifa_purge(&ia->ia_ifa);
 	ifa_del(ifp, &ia->ia_ifa);
 
 	if (ia->ia_allhosts != NULL) {
@@ -712,6 +714,7 @@ in_purgeaddr(struct ifaddr *ifa)
 		ia->ia_allhosts = NULL;
 	}
 
+	ifatrash++;
 	ia->ia_ifp = NULL;
 	ifafree(&ia->ia_ifa);
 }
Index: netinet6/in6.c
===================================================================
RCS file: /cvs/src/sys/netinet6/in6.c,v
retrieving revision 1.191
diff -u -p -r1.191 in6.c
--- netinet6/in6.c	22 Aug 2016 10:33:22 -0000	1.191
+++ netinet6/in6.c	3 Sep 2016 10:42:24 -0000
@@ -895,12 +895,11 @@ void
 in6_unlink_ifa(struct in6_ifaddr *ia6, struct ifnet *ifp)
 {
 	struct ifaddr *ifa = &ia6->ia_ifa;
+	extern int ifatrash;
 	int plen;
 
 	splsoftassert(IPL_SOFTNET);
 
-	ifa_del(ifp, ifa);
-
 	TAILQ_REMOVE(&in6_ifaddr, ia6, ia_list);
 
 	/* Release the reference to the base prefix. */
@@ -918,10 +917,11 @@ in6_unlink_ifa(struct in6_ifaddr *ia6, s
 		ia6->ia6_ndpr = NULL;
 	}
 
-	/*
-	 * release another refcnt for the link from in6_ifaddr.
-	 * Note that we should decrement the refcnt at least once for all *BSD.
-	 */
+	rt_ifa_purge(ifa);
+	ifa_del(ifp, ifa);
+
+	ifatrash++;
+	ia6->ia_ifp = NULL;
 	ifafree(&ia6->ia_ifa);
 }
 

Reply via email to