Re: Call for Testing: rtalloc(9) change

2015-11-09 Thread Alexander Bluhm
On Wed, Nov 04, 2015 at 12:33:23PM +0100, Martin Pieuchot wrote:
> On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> > I'm currently working on the routing table interface to make is safe
> > to use by multiple CPUs at the same time.  The diff below is a big
> > step in this direction and I'd really appreciate if people could test
> > it with their usual network setup and report back.
> 
> Below is an updated version of the diff now that bluhm@ fixed the
> potential loop with RTF_GATEWAY routes.
> 
> Note that regression tests will fail because we no longer call
> rtalloc(9) inside rt_setgate().  In other words we do not cache
> a next-hop route before using it.
> 
> Ok?

OK bluhm@

> 
> Index: net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.268
> diff -u -p -r1.268 route.c
> --- net/route.c   4 Nov 2015 10:13:55 -   1.268
> +++ net/route.c   4 Nov 2015 11:15:00 -
> @@ -151,6 +151,7 @@ void  rt_timer_init(void);
>  int  rtflushclone1(struct rtentry *, void *, u_int);
>  void rtflushclone(unsigned int, struct rtentry *);
>  int  rt_if_remove_rtdelete(struct rtentry *, void *, u_int);
> +struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
>  
>  struct   ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr 
> *,
>   u_int);
> @@ -194,6 +195,12 @@ rtisvalid(struct rtentry *rt)
>   if (rt == NULL)
>   return (0);
>  
> +#ifdef DIAGNOSTIC
> + if (ISSET(rt->rt_flags, RTF_GATEWAY) && (rt->rt_gwroute != NULL) &&
> + ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY))
> + panic("next hop must be directly reachable");
> +#endif
> +
>   if ((rt->rt_flags & RTF_UP) == 0)
>   return (0);
>  
> @@ -201,11 +208,27 @@ rtisvalid(struct rtentry *rt)
>   if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
>   return (0);
>  
> + if (ISSET(rt->rt_flags, RTF_GATEWAY) && !rtisvalid(rt->rt_gwroute))
> + return (0);
> +
>   return (1);
>  }
>  
> +/*
> + * Do the actual lookup for rtalloc(9), do not use directly!
> + *
> + * Return the best matching entry for the destination ``dst''.
> + *
> + * "RT_RESOLVE" means that a corresponding L2 entry should
> + *   be added to the routing table and resolved (via ARP or
> + *   NDP), if it does not exist.
> + *
> + * "RT_REPORT" indicates that a message should be sent to
> + *   userland if no matching route has been found or if an
> + *   error occured while adding a L2 entry.
> + */
>  struct rtentry *
> -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid)
> +rt_match(struct sockaddr *dst, int flags, unsigned int tableid)
>  {
>   struct rtentry  *rt0, *rt = NULL;
>   struct rt_addrinfo   info;
> @@ -336,6 +359,76 @@ rtalloc_mpath(struct sockaddr *dst, uint
>  }
>  #endif /* SMALL_KERNEL */
>  
> +/*
> + * Look in the routing table for the best matching entry for
> + * ``dst''.
> + *
> + * If a route with a gateway is found and its next hop is no
> + * longer valid, try to cache it.
> + */
> +struct rtentry *
> +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid)
> +{
> + struct rtentry *rt, *nhrt;
> +
> + rt = rt_match(dst, flags, rtableid);
> +
> + /* No match or route to host?  We're done. */
> + if (rt == NULL || !ISSET(rt->rt_flags, RTF_GATEWAY))
> + return (rt);
> +
> + /* Nothing to do if the next hop is valid. */
> + if (rtisvalid(rt->rt_gwroute))
> + return (rt);
> +
> + rtfree(rt->rt_gwroute);
> + rt->rt_gwroute = NULL;
> +
> + /*
> +  * If we cannot find a valid next hop, return the route
> +  * with a gateway.
> +  *
> +  * XXX Some dragons hiding in the tree certainly depends on
> +  * this behavior.  But it is safe since rt_checkgate() wont
> +  * allow us to us this route later on.
> +  */
> + nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid);
> + if (nhrt == NULL)
> + return (rt);
> +
> + /*
> +  * Next hop must be reachable, this also prevents rtentry
> +  * loops for example when rt->rt_gwroute points to rt.
> +  */
> + if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) {
> + rtfree(nhrt);
> + return (rt);
> + }
> +
> + /*
> +  * Next hop entry must be UP and on the same interface.
> +  */
> + if (!ISSET(nhrt->rt_flags, RTF_UP) || nhrt->rt_ifidx != rt->rt_ifidx) {
> + rtfree(nhrt);
> + return (rt);
> + }
> +
> + /*
> +  * If the MTU of next hop is 0, this will reset the MTU of the
> +  * route to run PMTUD again from scratch.
> +  */
> + if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu))
> + rt->rt_mtu = nhrt->rt_mtu;
> +
> + /*
> +  * Do not return the cached next-hop route, rt_checkgate() will
> +  * do the ma

Re: Call for Testing: rtalloc(9) change

2015-11-04 Thread Martin Pieuchot
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> I'm currently working on the routing table interface to make is safe
> to use by multiple CPUs at the same time.  The diff below is a big
> step in this direction and I'd really appreciate if people could test
> it with their usual network setup and report back.

Below is an updated version of the diff now that bluhm@ fixed the
potential loop with RTF_GATEWAY routes.

Note that regression tests will fail because we no longer call
rtalloc(9) inside rt_setgate().  In other words we do not cache
a next-hop route before using it.

Ok?

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.268
diff -u -p -r1.268 route.c
--- net/route.c 4 Nov 2015 10:13:55 -   1.268
+++ net/route.c 4 Nov 2015 11:15:00 -
@@ -151,6 +151,7 @@ voidrt_timer_init(void);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
u_int);
@@ -194,6 +195,12 @@ rtisvalid(struct rtentry *rt)
if (rt == NULL)
return (0);
 
+#ifdef DIAGNOSTIC
+   if (ISSET(rt->rt_flags, RTF_GATEWAY) && (rt->rt_gwroute != NULL) &&
+   ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY))
+   panic("next hop must be directly reachable");
+#endif
+
if ((rt->rt_flags & RTF_UP) == 0)
return (0);
 
@@ -201,11 +208,27 @@ rtisvalid(struct rtentry *rt)
if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
return (0);
 
+   if (ISSET(rt->rt_flags, RTF_GATEWAY) && !rtisvalid(rt->rt_gwroute))
+   return (0);
+
return (1);
 }
 
+/*
+ * Do the actual lookup for rtalloc(9), do not use directly!
+ *
+ * Return the best matching entry for the destination ``dst''.
+ *
+ * "RT_RESOLVE" means that a corresponding L2 entry should
+ *   be added to the routing table and resolved (via ARP or
+ *   NDP), if it does not exist.
+ *
+ * "RT_REPORT" indicates that a message should be sent to
+ *   userland if no matching route has been found or if an
+ *   error occured while adding a L2 entry.
+ */
 struct rtentry *
-rtalloc(struct sockaddr *dst, int flags, unsigned int tableid)
+rt_match(struct sockaddr *dst, int flags, unsigned int tableid)
 {
struct rtentry  *rt0, *rt = NULL;
struct rt_addrinfo   info;
@@ -336,6 +359,76 @@ rtalloc_mpath(struct sockaddr *dst, uint
 }
 #endif /* SMALL_KERNEL */
 
+/*
+ * Look in the routing table for the best matching entry for
+ * ``dst''.
+ *
+ * If a route with a gateway is found and its next hop is no
+ * longer valid, try to cache it.
+ */
+struct rtentry *
+rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid)
+{
+   struct rtentry *rt, *nhrt;
+
+   rt = rt_match(dst, flags, rtableid);
+
+   /* No match or route to host?  We're done. */
+   if (rt == NULL || !ISSET(rt->rt_flags, RTF_GATEWAY))
+   return (rt);
+
+   /* Nothing to do if the next hop is valid. */
+   if (rtisvalid(rt->rt_gwroute))
+   return (rt);
+
+   rtfree(rt->rt_gwroute);
+   rt->rt_gwroute = NULL;
+
+   /*
+* If we cannot find a valid next hop, return the route
+* with a gateway.
+*
+* XXX Some dragons hiding in the tree certainly depends on
+* this behavior.  But it is safe since rt_checkgate() wont
+* allow us to us this route later on.
+*/
+   nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid);
+   if (nhrt == NULL)
+   return (rt);
+
+   /*
+* Next hop must be reachable, this also prevents rtentry
+* loops for example when rt->rt_gwroute points to rt.
+*/
+   if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) {
+   rtfree(nhrt);
+   return (rt);
+   }
+
+   /*
+* Next hop entry must be UP and on the same interface.
+*/
+   if (!ISSET(nhrt->rt_flags, RTF_UP) || nhrt->rt_ifidx != rt->rt_ifidx) {
+   rtfree(nhrt);
+   return (rt);
+   }
+
+   /*
+* If the MTU of next hop is 0, this will reset the MTU of the
+* route to run PMTUD again from scratch.
+*/
+   if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu))
+   rt->rt_mtu = nhrt->rt_mtu;
+
+   /*
+* Do not return the cached next-hop route, rt_checkgate() will
+* do the magic for us.
+*/
+   rt->rt_gwroute = nhrt;
+
+   return (rt);
+}
+
 void
 rtref(struct rtentry *rt)
 {
@@ -499,7 +592,7 @@ create:
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
  

Re: Call for Testing: rtalloc(9) change

2015-09-02 Thread Alexander Bluhm
On Tue, Sep 01, 2015 at 04:47:31PM +0200, Martin Pieuchot wrote:
> Smaller diff now that rtisvalid(9) is in!

> +rt_isvalid(struct rtentry *rt)

> +/*
> + * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise.
> + */
> +int
> +rtisvalid(struct rtentry *rt)
> +{
> + if (!rt_isvalid(rt))
> + return (0);
> +
> + /* Next hop must also be valid. */
> + if ((rt->rt_flags & RTF_GATEWAY) && !rt_isvalid(rt->rt_gwroute))
> + return (0);
> +
> + return (1);
> +}

Please do not call two functions rt_isvalid() and rtisvalid(), that
is confusing.  Maybe rt_isvalid() could be called rtisvalidnotgw().

Is it possible that a gateway route has another gateway route as
gateway?  rt_checkgate() checks and fixes that.  What do you think
about this?  Maybe the panic() is too aggressive and should be replaced
with a return (0) to avoid recursion.

bluhm

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.228
diff -u -p -r1.228 route.c
--- net/route.c 1 Sep 2015 12:50:03 -   1.228
+++ net/route.c 2 Sep 2015 14:54:54 -
@@ -316,6 +316,15 @@ rtisvalid(struct rtentry *rt)
if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
return (0);
 
+   /* Next hop must also be valid. */
+   if (rt->rt_flags & RTF_GATEWAY) {
+   if (rt->rt_gwroute && (rt->rt_gwroute->rt_flags & RTF_GATEWAY))
+   panic("%s: route %p has gateway route %p with gateway",
+   __func__, rt, rt->rt_gwroute);
+   if (!rtisvalid(rt->rt_gwroute))
+   return (0);
+   }
+
return (1);
 }
 



Re: Call for Testing: rtalloc(9) change

2015-09-01 Thread Martin Pieuchot
On 31/08/15(Mon) 11:52, Martin Pieuchot wrote:
> On 25/08/15(Tue) 12:27, Martin Pieuchot wrote:
> > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> > > I'm currently working on the routing table interface to make is safe
> > > to use by multiple CPUs at the same time.  The diff below is a big
> > > step in this direction and I'd really appreciate if people could test
> > > it with their usual network setup and report back.
> > 
> > Updated version to match recent changes.  I'm still looking for test
> > reports and reviews.
> 
> Thanks to all the testers.  Here's a third version that should fix a
> regression reported by phessler@.  The idea is to use rtvalid(9) to
> check early if the gateway route is still valid or not.  If it isn't
> then call rtalloc(9) again to perform the ARP/NDP resolution.

Smaller diff now that rtisvalid(9) is in!

> Tests & reviews are still welcome!

Note that this diff plugs a rt leak for free.

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.228
diff -u -p -r1.228 route.c
--- net/route.c 1 Sep 2015 12:50:03 -   1.228
+++ net/route.c 1 Sep 2015 14:41:36 -
@@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
u_int);
@@ -300,11 +301,8 @@ rtable_exists(u_int id)/* verify table 
return (1);
 }
 
-/*
- * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise.
- */
-int
-rtisvalid(struct rtentry *rt)
+static inline int
+rt_isvalid(struct rtentry *rt)
 {
if (rt == NULL)
return (0);
@@ -319,19 +317,48 @@ rtisvalid(struct rtentry *rt)
return (1);
 }
 
+/*
+ * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise.
+ */
+int
+rtisvalid(struct rtentry *rt)
+{
+   if (!rt_isvalid(rt))
+   return (0);
+
+   /* Next hop must also be valid. */
+   if ((rt->rt_flags & RTF_GATEWAY) && !rt_isvalid(rt->rt_gwroute))
+   return (0);
+
+   return (1);
+}
+
+/*
+ * Do the actual lookup for rtalloc(9), do not use directly!
+ *
+ * Return the best matching entry for the destination ``dst''.
+ *
+ * "RT_RESOLVE" means that a corresponding L2 entry should
+ *   be added to the routing table and resolved (via ARP or
+ *   NDP), if it does not exist.
+ *
+ * "RT_REPORT" indicates that a message should be sent to
+ *   userland if no matching route has been found or if an
+ *   error occured while adding a L2 entry.
+ */
 struct rtentry *
-rtalloc(struct sockaddr *dst, int flags, unsigned int tableid)
+rt_match(struct sockaddr *dst, int flags, unsigned int tableid)
 {
struct rtentry  *rt;
struct rtentry  *newrt = NULL;
struct rt_addrinfo   info;
-   int  s, error = 0, msgtype = RTM_MISS;
+   int  s, error = 0;
 
-   s = splsoftnet();
 
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
 
+   s = splsoftnet();
rt = rtable_match(tableid, dst);
if (rt != NULL) {
newrt = rt;
@@ -344,10 +371,6 @@ rtalloc(struct sockaddr *dst, int flags,
goto miss;
}
rt = newrt;
-   if (rt->rt_flags & RTF_XRESOLVE) {
-   msgtype = RTM_RESOLVE;
-   goto miss;
-   }
/* Inform listeners of the new route */
rt_sendmsg(rt, RTM_ADD, tableid);
} else
@@ -355,11 +378,8 @@ rtalloc(struct sockaddr *dst, int flags,
} else {
rtstat.rts_unreach++;
 miss:
-   if (ISSET(flags, RT_REPORT)) {
-   bzero((caddr_t)&info, sizeof(info));
-   info.rti_info[RTAX_DST] = dst;
-   rt_missmsg(msgtype, &info, 0, NULL, error, tableid);
-   }
+   if (ISSET(flags, RT_REPORT))
+   rt_missmsg(RTM_MISS, &info, 0, NULL, error, tableid);
}
splx(s);
return (newrt);
@@ -393,6 +413,75 @@ rtalloc_mpath(struct sockaddr *dst, uint
 }
 #endif /* SMALL_KERNEL */
 
+/*
+ * Look in the routing table for the best matching entry for
+ * ``dst''.
+ *
+ * If a route with a gateway is found and its next hop is no
+ * longer valid, try to cache it.
+ */
+struct rtentry *
+rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid)
+{
+   struct rtentry *rt, *nhrt;
+
+   rt = rt_match(dst, flags, rtableid);
+
+   /* No match or route to host?  We're done. */
+   if (r

Re: Call for Testing: rtalloc(9) change

2015-08-31 Thread Martin Pieuchot
On 25/08/15(Tue) 12:27, Martin Pieuchot wrote:
> On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> > I'm currently working on the routing table interface to make is safe
> > to use by multiple CPUs at the same time.  The diff below is a big
> > step in this direction and I'd really appreciate if people could test
> > it with their usual network setup and report back.
> 
> Updated version to match recent changes.  I'm still looking for test
> reports and reviews.

Thanks to all the testers.  Here's a third version that should fix a
regression reported by phessler@.  The idea is to use rtvalid(9) to
check early if the gateway route is still valid or not.  If it isn't
then call rtalloc(9) again to perform the ARP/NDP resolution.

This diff includes rtvalid(9) and makes use of it in ip{,6}_output(),
more conversions will be done afterward, but these should hopefully
be all the necessary bits to fix the regression.

Tests & reviews are still welcome!

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.226
diff -u -p -r1.226 route.c
--- net/route.c 30 Aug 2015 10:39:16 -  1.226
+++ net/route.c 31 Aug 2015 09:33:27 -
@@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
u_int);
@@ -300,19 +301,68 @@ rtable_exists(u_int id)   /* verify table 
return (1);
 }
 
+
+/*
+ * Do the actual checks for rtvalid(9), do not use directly!
+ */
+static inline int
+rt_is_valid(struct rtentry *rt)
+{
+   if (rt == NULL)
+   return (0);
+
+   if ((rt->rt_flags & RTF_UP) == 0)
+   return (0);
+
+   /* Routes attached to stall ifas should be freed. */
+   if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL)
+   return (0);
+
+   return (1);
+}
+
+/*
+ * Returns 1 if the cached ``rt'' entry is still valid, 0 otherwise.
+ */
+int
+rtvalid(struct rtentry *rt)
+{
+   if (rt_is_valid(rt) == 0)
+   return (0);
+
+   /* Next hop must also be valid. */
+   if ((rt->rt_flags & RTF_GATEWAY) && rt_is_valid(rt->rt_gwroute) == 0)
+   return (0);
+
+   return (1);
+}
+
+/*
+ * Do the actual lookup for rtalloc(9), do not use directly!
+ *
+ * Return the best matching entry for the destination ``dst''.
+ *
+ * "RT_RESOLVE" means that a corresponding L2 entry should
+ *   be added to the routing table and resolved (via ARP or
+ *   NDP), if it does not exist.
+ *
+ * "RT_REPORT" indicates that a message should be sent to
+ *   userland if no matching route has been found or if an
+ *   error occured while adding a L2 entry.
+ */
 struct rtentry *
-rtalloc(struct sockaddr *dst, int flags, unsigned int tableid)
+rt_match(struct sockaddr *dst, int flags, unsigned int tableid)
 {
struct rtentry  *rt;
struct rtentry  *newrt = NULL;
struct rt_addrinfo   info;
-   int  s, error = 0, msgtype = RTM_MISS;
+   int  s, error = 0;
 
-   s = splsoftnet();
 
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
 
+   s = splsoftnet();
rt = rtable_match(tableid, dst);
if (rt != NULL) {
newrt = rt;
@@ -325,10 +375,6 @@ rtalloc(struct sockaddr *dst, int flags,
goto miss;
}
rt = newrt;
-   if (rt->rt_flags & RTF_XRESOLVE) {
-   msgtype = RTM_RESOLVE;
-   goto miss;
-   }
/* Inform listeners of the new route */
rt_sendmsg(rt, RTM_ADD, tableid);
} else
@@ -336,11 +382,8 @@ rtalloc(struct sockaddr *dst, int flags,
} else {
rtstat.rts_unreach++;
 miss:
-   if (ISSET(flags, RT_REPORT)) {
-   bzero((caddr_t)&info, sizeof(info));
-   info.rti_info[RTAX_DST] = dst;
-   rt_missmsg(msgtype, &info, 0, NULL, error, tableid);
-   }
+   if (ISSET(flags, RT_REPORT))
+   rt_missmsg(RTM_MISS, &info, 0, NULL, error, tableid);
}
splx(s);
return (newrt);
@@ -374,6 +417,75 @@ rtalloc_mpath(struct sockaddr *dst, uint
 }
 #endif /* SMALL_KERNEL */
 
+/*
+ * Look in the routing table for the best matching entry for
+ * ``dst''.
+ *
+ * If a route with a gateway is found and its next hop is no
+ * longer valid, try to cache it.
+ */
+struct rtentry *
+rtalloc(struct sockaddr *dst, int flags, unsigned int rta

Re: Call for Testing: rtalloc(9) change

2015-08-27 Thread Norman Golisz
Hello Martin,

On Tue Aug 25 2015 12:27, Martin Pieuchot wrote:
> On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> > I'm currently working on the routing table interface to make is safe
> > to use by multiple CPUs at the same time.  The diff below is a big
> > step in this direction and I'd really appreciate if people could test
> > it with their usual network setup and report back.
> 
> Updated version to match recent changes.  I'm still looking for test
> reports and reviews.

I'm running a kernel with this patch on my business laptop since Tuesday
and had no problems. My daily working environment involves OpenVPN (tun)
and DHCP.


OpenBSD 5.8-current (GENERIC.MP) #2: Tue Aug 25 22:55:46 CEST 2015
nor...@theos.my.domain:/usr/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 4166717440 (3973MB)
avail mem = 4036542464 (3849MB)
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xe0010 (80 entries)
bios0: vendor LENOVO version "7UET94WW (3.24 )" date 10/17/2012
bios0: LENOVO 6475BE3
acpi0 at bios0: rev 2
acpi0: sleep states S0 S3 S4 S5
acpi0: tables DSDT FACP SSDT ECDT APIC MCFG HPET SLIC BOOT ASF! SSDT TCPA DMAR 
SSDT SSDT SSDT
acpi0: wakeup devices LID_(S3) SLPB(S3) UART(S3) IGBE(S4) EXP0(S4) EXP1(S4) 
EXP2(S4) EXP3(S4) EXP4(S4) PCI1(S4) USB0(S3) USB3(S3) USB5(S3) EHC0(S3) 
EHC1(S3) HDEF(S4)
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpiec0 at acpi0
acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Intel(R) Core(TM)2 Duo CPU P8400 @ 2.26GHz, 2261.34 MHz
cpu0: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,SSE4.1,NXE,LONG,LAHF,PERF,SENSOR
cpu0: 3MB 64b/line 8-way L2 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 7 var ranges, 88 fixed ranges
cpu0: apic clock running at 266MHz
cpu0: mwait min=64, max=64, C-substates=0.2.2.2.2.1.3, IBE
cpu1 at mainbus0: apid 1 (application processor)
cpu1: Intel(R) Core(TM)2 Duo CPU P8400 @ 2.26GHz, 2261.00 MHz
cpu1: 
FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,SSE4.1,NXE,LONG,LAHF,PERF,SENSOR
cpu1: 3MB 64b/line 8-way L2 cache
cpu1: smt 0, core 1, package 0
ioapic0 at mainbus0: apid 1 pa 0xfec0, version 20, 24 pins
ioapic0: misconfigured as apic 2, remapped to apid 1
acpimcfg0 at acpi0 addr 0xe000, bus 0-63
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpiprt1 at acpi0: bus -1 (AGP_)
acpiprt2 at acpi0: bus 2 (EXP0)
acpiprt3 at acpi0: bus 3 (EXP1)
acpiprt4 at acpi0: bus -1 (EXP2)
acpiprt5 at acpi0: bus 5 (EXP3)
acpiprt6 at acpi0: bus 13 (EXP4)
acpiprt7 at acpi0: bus 21 (PCI1)
acpicpu0 at acpi0: !C3(250@17 mwait.3@0x20), !C2(500@1 mwait.1@0x10), C1(1000@1 
mwait.1), PSS
acpicpu1 at acpi0: !C3(250@17 mwait.3@0x20), !C2(500@1 mwait.1@0x10), C1(1000@1 
mwait.1), PSS
acpipwrres0 at acpi0: PUBS, resource for USB0, USB3, USB5, EHC0, EHC1
acpitz0 at acpi0: critical temperature is 127 degC
acpitz1 at acpi0: critical temperature is 100 degC
acpibtn0 at acpi0: LID_
acpibtn1 at acpi0: SLPB
acpibat0 at acpi0: BAT0 model "42T5264" serial  3499 type LION oem "Panasonic"
acpibat1 at acpi0: BAT1 not present
acpiac0 at acpi0: AC unit online
acpithinkpad0 at acpi0
acpidock0 at acpi0: GDCK docked (15)
cpu0: Enhanced SpeedStep 2261 MHz: speeds: 2267, 2266, 1600, 800 MHz
pci0 at mainbus0 bus 0
pchb0 at pci0 dev 0 function 0 "Intel GM45 Host" rev 0x07
vga1 at pci0 dev 2 function 0 "Intel GM45 Video" rev 0x07
intagp0 at vga1
agp0 at intagp0: aperture at 0xd000, size 0x1000
inteldrm0 at vga1
drm0 at inteldrm0
inteldrm0: 1440x900
wsdisplay0 at vga1 mux 1: console (std, vt100 emulation)
wsdisplay0: screen 1-5 added (std, vt100 emulation)
"Intel GM45 Video" rev 0x07 at pci0 dev 2 function 1 not configured
"Intel GM45 HECI" rev 0x07 at pci0 dev 3 function 0 not configured
em0 at pci0 dev 25 function 0 "Intel ICH9 IGP M AMT" rev 0x03: msi, address 
00:1c:25:95:39:e7
uhci0 at pci0 dev 26 function 0 "Intel 82801I USB" rev 0x03: apic 1 int 20
uhci1 at pci0 dev 26 function 1 "Intel 82801I USB" rev 0x03: apic 1 int 21
uhci2 at pci0 dev 26 function 2 "Intel 82801I USB" rev 0x03: apic 1 int 22
ehci0 at pci0 dev 26 function 7 "Intel 82801I USB" rev 0x03: apic 1 int 23
usb0 at ehci0: USB revision 2.0
uhub0 at usb0 "Intel EHCI root hub" rev 2.00/1.00 addr 1
azalia0 at pci0 dev 27 function 0 "Intel 82801I HD Audio" rev 0x03: msi
azalia0: codecs: Conexant CX20561
audio0 at azalia0
ppb0 at pci0 dev 28 function 0 "Intel 82801I PCIE" rev 0x03: msi
pci1 at ppb0 bus 2
ppb1 at pci0 dev 28 function 1 "Intel 82801I PCIE" rev 0x03: msi
pci2 at ppb1 bus 3
iwn0 at pci2 dev 0 function 0 "Intel WiFi Link 5300" rev 0x00: msi, MIMO 3T3R, 
MoW, address 00:16:ea:b3:62:e8
ppb2 at pci0 dev 28 function 3 "Intel 82801

Re: Call for Testing: rtalloc(9) change

2015-08-25 Thread Martin Pieuchot
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> I'm currently working on the routing table interface to make is safe
> to use by multiple CPUs at the same time.  The diff below is a big
> step in this direction and I'd really appreciate if people could test
> it with their usual network setup and report back.

Updated version to match recent changes.  I'm still looking for test
reports and reviews.

> The goal of this diff is to "cache" the route corresponding to "your"
> next hop as early as possible.  Let's assume you're using a common
> dhcp-based network:
> 
> mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)"
> DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
> default192.168.0.1UGS5  508 - 8 em0
> 
> Here my default route points to a gateway (G) whose address is
> 192.168.0.1.  In such setup your computer generally sends most of the
> packets to the internet through this gateway.  But to do that it needs
> more informations:
> 
> mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)"
> DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
> 192.168.0.1bc:05:43:bd:3e:29  UHLc   1  149 - 8 em0
> 
> Yes this is another route.  This one contains link-layer informations
> (L) and has been cloned (c).  This route is what I described before as
> "your" next hop.  In this case, "your" is a shortcut for "the next hop
> of your default route" but all of this is valid for any route pointing
> to a gateway (G).
> 
> In order to send packets via my default route, the kernel needs to know
> the link-layer address corresponding to the IP address of the gateway.
> This is called "Address Resolution" in network jargon.  In OpenBSD,
> resolved addresses appear in the routing table with a link-layer address
> in the "Gateway" field, as shown previously. 
> 
> This resolution is done in the kernel by calling rtalloc(9) with the
> RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1.
> Once the resolution is complete, a corresponding entry appears in the
> routing table and there's no need to redo it for a certain period of
> time.  That is what I meant with "cache".
> 
> Currently this resolution is done "late" in the journey of a packet and
> that's fine since it is not done often.  Late means that it is done when
> the packet reaches a L2 output function, nd6_output() or arpresolve(). 
> 
> The problem is that having a proper reference count on route entries in
> these functions is really complicated because you can end up using 3
> different routes.  So this diff starts the resolution early: as soon as
> a gateway route is returned by rtalloc(9).
> 
> It also makes sense to do the resolution as soon as possible since we
> need the link-layer address to send the packet.
> 
> One important point: gateway routes (rt_gwroute) are only returned to
> the stack in L2 functions and when that happens, their reference
> counter is not incremented.  That's why the reference count for such
> routes is almost always 1.  They are the simplest example of working
> route caching in our kernel*.  That means that when you purge your
> cloned route, rt_gwroute will still be valid but marked as RTP_DOWN
> until a new resolution is started.
> 
> This diff changes rt_checkgate() to only do sanity checks (finally!).
> 
> Do not hesitate to ask questions if something is not clear, I believe
> it's important that more people understand this.
> 
> Note that this diff includes other bits to be committed separately:
> 
>   - Deprecate the use of RTF_XRESOLVE in rtalloc(9)
>   - Remove PF_KEY-specific code & comments now that SPD lookups no
> longer use rtalloc(9).
>   - Make rtfree(9) accept NULL
> 
> 
> * That's why I'm slowly killing "struct route" & friends to use the
>   simplest route caching mechanism everywhere.

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.225
diff -u -p -r1.225 route.c
--- net/route.c 24 Aug 2015 22:11:33 -  1.225
+++ net/route.c 25 Aug 2015 10:23:02 -
@@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
u_int);
@@ -297,19 +298,32 @@ rtable_exists(u_int id)   /* verify table 
return (1);
 }
 
+/*
+ * Do the actual lookup for rtalloc(9), do not use directly!
+ *
+ * Return the best matching entry for the destination ``dst''.
+ *
+ * "RT_RESOLVE" means that a corresponding L2 entry should
+ *   be added to the routing table and resolved (via ARP or
+ *   NDP), if it does not exist.
+ *
+ * "RT_REPORT" indicates that a message sh

Re: Call for Testing: rtalloc(9) change

2015-08-13 Thread Martin Pieuchot
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote:
> I'm currently working on the routing table interface to make is safe
> to use by multiple CPUs at the same time.  The diff below is a big
> step in this direction and I'd really appreciate if people could test
> it with their usual network setup and report back.

Thanks to a nice report from Hrvoje Popovski, I found a bug in my diff.
Please find a corrected version below.  The changes with the previous
diff are:

-   nhrt = rt_match(rt->rt_gateway, flags, rtableid);
+   nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid);
if (nhrt == NULL)
return (rt);
[...]

-   if ((nhrt->rt_flags & (RTF_UP|RTF_GATEWAY)) != RTF_UP) {
+   if ((nhrt->rt_flags & (RTF_UP|RTF_CLONING|RTF_GATEWAY)) != RTF_UP) {
rtfree(nhrt);
return (rt);
}

> The goal of this diff is to "cache" the route corresponding to "your"
> next hop as early as possible.  Let's assume you're using a common
> dhcp-based network:
> 
> mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)"
> DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
> default192.168.0.1UGS5  508 - 8 em0
> 
> Here my default route points to a gateway (G) whose address is
> 192.168.0.1.  In such setup your computer generally sends most of the
> packets to the internet through this gateway.  But to do that it needs
> more informations:
> 
> mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)"
> DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
> 192.168.0.1bc:05:43:bd:3e:29  UHLc   1  149 - 8 em0
> 
> Yes this is another route.  This one contains link-layer informations
> (L) and has been cloned (c).  This route is what I described before as
> "your" next hop.  In this case, "your" is a shortcut for "the next hop
> of your default route" but all of this is valid for any route pointing
> to a gateway (G).
> 
> In order to send packets via my default route, the kernel needs to know
> the link-layer address corresponding to the IP address of the gateway.
> This is called "Address Resolution" in network jargon.  In OpenBSD,
> resolved addresses appear in the routing table with a link-layer address
> in the "Gateway" field, as shown previously. 
> 
> This resolution is done in the kernel by calling rtalloc(9) with the
> RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1.
> Once the resolution is complete, a corresponding entry appears in the
> routing table and there's no need to redo it for a certain period of
> time.  That is what I meant with "cache".
> 
> Currently this resolution is done "late" in the journey of a packet and
> that's fine since it is not done often.  Late means that it is done when
> the packet reaches a L2 output function, nd6_output() or arpresolve(). 
> 
> The problem is that having a proper reference count on route entries in
> these functions is really complicated because you can end up using 3
> different routes.  So this diff starts the resolution early: as soon as
> a gateway route is returned by rtalloc(9).
> 
> It also makes sense to do the resolution as soon as possible since we
> need the link-layer address to send the packet.
> 
> One important point: gateway routes (rt_gwroute) are only returned to
> the stack in L2 functions and when that happens, their reference
> counter is not incremented.  That's why the reference count for such
> routes is almost always 1.  They are the simplest example of working
> route caching in our kernel*.  That means that when you purge your
> cloned route, rt_gwroute will still be valid but marked as RTP_DOWN
> until a new resolution is started.
> 
> This diff changes rt_checkgate() to only do sanity checks (finally!).
> 
> Do not hesitate to ask questions if something is not clear, I believe
> it's important that more people understand this.
> 
> Note that this diff includes other bits to be committed separately:
> 
>   - Deprecate the use of RTF_XRESOLVE in rtalloc(9)
>   - Remove PF_KEY-specific code & comments now that SPD lookups no
> longer use rtalloc(9).
>   - Make rtfree(9) accept NULL
> 
> 
> * That's why I'm slowly killing "struct route" & friends to use the
>   simplest route caching mechanism everywhere.

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.217
diff -u -p -r1.217 route.c
--- net/route.c 18 Jul 2015 15:51:16 -  1.217
+++ net/route.c 13 Aug 2015 12:09:58 -
@@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct socka

Call for Testing: rtalloc(9) change

2015-08-12 Thread Martin Pieuchot
I'm currently working on the routing table interface to make is safe
to use by multiple CPUs at the same time.  The diff below is a big
step in this direction and I'd really appreciate if people could test
it with their usual network setup and report back.


The goal of this diff is to "cache" the route corresponding to "your"
next hop as early as possible.  Let's assume you're using a common
dhcp-based network:

mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)"
DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
default192.168.0.1UGS5  508 - 8 em0

Here my default route points to a gateway (G) whose address is
192.168.0.1.  In such setup your computer generally sends most of the
packets to the internet through this gateway.  But to do that it needs
more informations:

mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)"
DestinationGatewayFlags   Refs  Use   Mtu  Prio Iface
192.168.0.1bc:05:43:bd:3e:29  UHLc   1  149 - 8 em0

Yes this is another route.  This one contains link-layer informations
(L) and has been cloned (c).  This route is what I described before as
"your" next hop.  In this case, "your" is a shortcut for "the next hop
of your default route" but all of this is valid for any route pointing
to a gateway (G).

In order to send packets via my default route, the kernel needs to know
the link-layer address corresponding to the IP address of the gateway.
This is called "Address Resolution" in network jargon.  In OpenBSD,
resolved addresses appear in the routing table with a link-layer address
in the "Gateway" field, as shown previously. 

This resolution is done in the kernel by calling rtalloc(9) with the
RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1.
Once the resolution is complete, a corresponding entry appears in the
routing table and there's no need to redo it for a certain period of
time.  That is what I meant with "cache".

Currently this resolution is done "late" in the journey of a packet and
that's fine since it is not done often.  Late means that it is done when
the packet reaches a L2 output function, nd6_output() or arpresolve(). 

The problem is that having a proper reference count on route entries in
these functions is really complicated because you can end up using 3
different routes.  So this diff starts the resolution early: as soon as
a gateway route is returned by rtalloc(9).

It also makes sense to do the resolution as soon as possible since we
need the link-layer address to send the packet.

One important point: gateway routes (rt_gwroute) are only returned to
the stack in L2 functions and when that happens, their reference
counter is not incremented.  That's why the reference count for such
routes is almost always 1.  They are the simplest example of working
route caching in our kernel*.  That means that when you purge your
cloned route, rt_gwroute will still be valid but marked as RTP_DOWN
until a new resolution is started.

This diff changes rt_checkgate() to only do sanity checks (finally!).

Do not hesitate to ask questions if something is not clear, I believe
it's important that more people understand this.

Note that this diff includes other bits to be committed separately:

  - Deprecate the use of RTF_XRESOLVE in rtalloc(9)
  - Remove PF_KEY-specific code & comments now that SPD lookups no
longer use rtalloc(9).
  - Make rtfree(9) accept NULL


* That's why I'm slowly killing "struct route" & friends to use the
  simplest route caching mechanism everywhere.

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.217
diff -u -p -r1.217 route.c
--- net/route.c 18 Jul 2015 15:51:16 -  1.217
+++ net/route.c 12 Aug 2015 13:54:56 -
@@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int);
 intrtflushclone1(struct rtentry *, void *, u_int);
 void   rtflushclone(unsigned int, struct rtentry *);
 intrt_if_remove_rtdelete(struct rtentry *, void *, u_int);
+struct rtentry *rt_match(struct sockaddr *, int, unsigned int);
 
 struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
u_int);
@@ -297,17 +298,31 @@ rtable_exists(u_int id)   /* verify table 
return (1);
 }
 
+/*
+ * Do the actual lookup for rtalloc(9), do not use directly!
+ *
+ * Return the best matching entry for the destination ``dst''.
+ *
+ * "RT_RESOLVE" means that a corresponding L2 entry should
+ *   be added to the routing table and resolved (via ARP or
+ *   NDP), if it does not exist.
+ *
+ * "RT_REPORT" indicates that a message should be sent to
+ *   userland if no matching route has been found or if an
+ *   error occured while adding a L2 entry.
+ */
 struct rtentry *
-rtalloc(struct sockaddr *dst, int flags, unsigned int tableid)
+rt_match(struct sockaddr *dst, int flags, unsigned int tableid)
 {
s