Re: Call for Testing: rtalloc(9) change
On Wed, Nov 04, 2015 at 12:33:23PM +0100, Martin Pieuchot wrote: > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > > I'm currently working on the routing table interface to make is safe > > to use by multiple CPUs at the same time. The diff below is a big > > step in this direction and I'd really appreciate if people could test > > it with their usual network setup and report back. > > Below is an updated version of the diff now that bluhm@ fixed the > potential loop with RTF_GATEWAY routes. > > Note that regression tests will fail because we no longer call > rtalloc(9) inside rt_setgate(). In other words we do not cache > a next-hop route before using it. > > Ok? OK bluhm@ > > Index: net/route.c > === > RCS file: /cvs/src/sys/net/route.c,v > retrieving revision 1.268 > diff -u -p -r1.268 route.c > --- net/route.c 4 Nov 2015 10:13:55 - 1.268 > +++ net/route.c 4 Nov 2015 11:15:00 - > @@ -151,6 +151,7 @@ void rt_timer_init(void); > int rtflushclone1(struct rtentry *, void *, u_int); > void rtflushclone(unsigned int, struct rtentry *); > int rt_if_remove_rtdelete(struct rtentry *, void *, u_int); > +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); > > struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr > *, > u_int); > @@ -194,6 +195,12 @@ rtisvalid(struct rtentry *rt) > if (rt == NULL) > return (0); > > +#ifdef DIAGNOSTIC > + if (ISSET(rt->rt_flags, RTF_GATEWAY) && (rt->rt_gwroute != NULL) && > + ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY)) > + panic("next hop must be directly reachable"); > +#endif > + > if ((rt->rt_flags & RTF_UP) == 0) > return (0); > > @@ -201,11 +208,27 @@ rtisvalid(struct rtentry *rt) > if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) > return (0); > > + if (ISSET(rt->rt_flags, RTF_GATEWAY) && !rtisvalid(rt->rt_gwroute)) > + return (0); > + > return (1); > } > > +/* > + * Do the actual lookup for rtalloc(9), do not use directly! > + * > + * Return the best matching entry for the destination ``dst''. > + * > + * "RT_RESOLVE" means that a corresponding L2 entry should > + * be added to the routing table and resolved (via ARP or > + * NDP), if it does not exist. > + * > + * "RT_REPORT" indicates that a message should be sent to > + * userland if no matching route has been found or if an > + * error occured while adding a L2 entry. > + */ > struct rtentry * > -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) > +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) > { > struct rtentry *rt0, *rt = NULL; > struct rt_addrinfo info; > @@ -336,6 +359,76 @@ rtalloc_mpath(struct sockaddr *dst, uint > } > #endif /* SMALL_KERNEL */ > > +/* > + * Look in the routing table for the best matching entry for > + * ``dst''. > + * > + * If a route with a gateway is found and its next hop is no > + * longer valid, try to cache it. > + */ > +struct rtentry * > +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) > +{ > + struct rtentry *rt, *nhrt; > + > + rt = rt_match(dst, flags, rtableid); > + > + /* No match or route to host? We're done. */ > + if (rt == NULL || !ISSET(rt->rt_flags, RTF_GATEWAY)) > + return (rt); > + > + /* Nothing to do if the next hop is valid. */ > + if (rtisvalid(rt->rt_gwroute)) > + return (rt); > + > + rtfree(rt->rt_gwroute); > + rt->rt_gwroute = NULL; > + > + /* > + * If we cannot find a valid next hop, return the route > + * with a gateway. > + * > + * XXX Some dragons hiding in the tree certainly depends on > + * this behavior. But it is safe since rt_checkgate() wont > + * allow us to us this route later on. > + */ > + nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid); > + if (nhrt == NULL) > + return (rt); > + > + /* > + * Next hop must be reachable, this also prevents rtentry > + * loops for example when rt->rt_gwroute points to rt. > + */ > + if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) { > + rtfree(nhrt); > + return (rt); > + } > + > + /* > + * Next hop entry must be UP and on the same interface. > + */ > + if (!ISSET(nhrt->rt_flags, RTF_UP) || nhrt->rt_ifidx != rt->rt_ifidx) { > + rtfree(nhrt); > + return (rt); > + } > + > + /* > + * If the MTU of next hop is 0, this will reset the MTU of the > + * route to run PMTUD again from scratch. > + */ > + if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu)) > + rt->rt_mtu = nhrt->rt_mtu; > + > + /* > + * Do not return the cached next-hop route, rt_checkgate() will > + * do the ma
Re: Call for Testing: rtalloc(9) change
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > I'm currently working on the routing table interface to make is safe > to use by multiple CPUs at the same time. The diff below is a big > step in this direction and I'd really appreciate if people could test > it with their usual network setup and report back. Below is an updated version of the diff now that bluhm@ fixed the potential loop with RTF_GATEWAY routes. Note that regression tests will fail because we no longer call rtalloc(9) inside rt_setgate(). In other words we do not cache a next-hop route before using it. Ok? Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.268 diff -u -p -r1.268 route.c --- net/route.c 4 Nov 2015 10:13:55 - 1.268 +++ net/route.c 4 Nov 2015 11:15:00 - @@ -151,6 +151,7 @@ voidrt_timer_init(void); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -194,6 +195,12 @@ rtisvalid(struct rtentry *rt) if (rt == NULL) return (0); +#ifdef DIAGNOSTIC + if (ISSET(rt->rt_flags, RTF_GATEWAY) && (rt->rt_gwroute != NULL) && + ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY)) + panic("next hop must be directly reachable"); +#endif + if ((rt->rt_flags & RTF_UP) == 0) return (0); @@ -201,11 +208,27 @@ rtisvalid(struct rtentry *rt) if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) return (0); + if (ISSET(rt->rt_flags, RTF_GATEWAY) && !rtisvalid(rt->rt_gwroute)) + return (0); + return (1); } +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message should be sent to + * userland if no matching route has been found or if an + * error occured while adding a L2 entry. + */ struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) { struct rtentry *rt0, *rt = NULL; struct rt_addrinfo info; @@ -336,6 +359,76 @@ rtalloc_mpath(struct sockaddr *dst, uint } #endif /* SMALL_KERNEL */ +/* + * Look in the routing table for the best matching entry for + * ``dst''. + * + * If a route with a gateway is found and its next hop is no + * longer valid, try to cache it. + */ +struct rtentry * +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) +{ + struct rtentry *rt, *nhrt; + + rt = rt_match(dst, flags, rtableid); + + /* No match or route to host? We're done. */ + if (rt == NULL || !ISSET(rt->rt_flags, RTF_GATEWAY)) + return (rt); + + /* Nothing to do if the next hop is valid. */ + if (rtisvalid(rt->rt_gwroute)) + return (rt); + + rtfree(rt->rt_gwroute); + rt->rt_gwroute = NULL; + + /* +* If we cannot find a valid next hop, return the route +* with a gateway. +* +* XXX Some dragons hiding in the tree certainly depends on +* this behavior. But it is safe since rt_checkgate() wont +* allow us to us this route later on. +*/ + nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid); + if (nhrt == NULL) + return (rt); + + /* +* Next hop must be reachable, this also prevents rtentry +* loops for example when rt->rt_gwroute points to rt. +*/ + if (ISSET(nhrt->rt_flags, RTF_CLONING|RTF_GATEWAY)) { + rtfree(nhrt); + return (rt); + } + + /* +* Next hop entry must be UP and on the same interface. +*/ + if (!ISSET(nhrt->rt_flags, RTF_UP) || nhrt->rt_ifidx != rt->rt_ifidx) { + rtfree(nhrt); + return (rt); + } + + /* +* If the MTU of next hop is 0, this will reset the MTU of the +* route to run PMTUD again from scratch. +*/ + if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu)) + rt->rt_mtu = nhrt->rt_mtu; + + /* +* Do not return the cached next-hop route, rt_checkgate() will +* do the magic for us. +*/ + rt->rt_gwroute = nhrt; + + return (rt); +} + void rtref(struct rtentry *rt) { @@ -499,7 +592,7 @@ create: rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED;
Re: Call for Testing: rtalloc(9) change
On Tue, Sep 01, 2015 at 04:47:31PM +0200, Martin Pieuchot wrote: > Smaller diff now that rtisvalid(9) is in! > +rt_isvalid(struct rtentry *rt) > +/* > + * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise. > + */ > +int > +rtisvalid(struct rtentry *rt) > +{ > + if (!rt_isvalid(rt)) > + return (0); > + > + /* Next hop must also be valid. */ > + if ((rt->rt_flags & RTF_GATEWAY) && !rt_isvalid(rt->rt_gwroute)) > + return (0); > + > + return (1); > +} Please do not call two functions rt_isvalid() and rtisvalid(), that is confusing. Maybe rt_isvalid() could be called rtisvalidnotgw(). Is it possible that a gateway route has another gateway route as gateway? rt_checkgate() checks and fixes that. What do you think about this? Maybe the panic() is too aggressive and should be replaced with a return (0) to avoid recursion. bluhm Index: net/route.c === RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v retrieving revision 1.228 diff -u -p -r1.228 route.c --- net/route.c 1 Sep 2015 12:50:03 - 1.228 +++ net/route.c 2 Sep 2015 14:54:54 - @@ -316,6 +316,15 @@ rtisvalid(struct rtentry *rt) if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) return (0); + /* Next hop must also be valid. */ + if (rt->rt_flags & RTF_GATEWAY) { + if (rt->rt_gwroute && (rt->rt_gwroute->rt_flags & RTF_GATEWAY)) + panic("%s: route %p has gateway route %p with gateway", + __func__, rt, rt->rt_gwroute); + if (!rtisvalid(rt->rt_gwroute)) + return (0); + } + return (1); }
Re: Call for Testing: rtalloc(9) change
On 31/08/15(Mon) 11:52, Martin Pieuchot wrote: > On 25/08/15(Tue) 12:27, Martin Pieuchot wrote: > > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > > > I'm currently working on the routing table interface to make is safe > > > to use by multiple CPUs at the same time. The diff below is a big > > > step in this direction and I'd really appreciate if people could test > > > it with their usual network setup and report back. > > > > Updated version to match recent changes. I'm still looking for test > > reports and reviews. > > Thanks to all the testers. Here's a third version that should fix a > regression reported by phessler@. The idea is to use rtvalid(9) to > check early if the gateway route is still valid or not. If it isn't > then call rtalloc(9) again to perform the ARP/NDP resolution. Smaller diff now that rtisvalid(9) is in! > Tests & reviews are still welcome! Note that this diff plugs a rt leak for free. Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.228 diff -u -p -r1.228 route.c --- net/route.c 1 Sep 2015 12:50:03 - 1.228 +++ net/route.c 1 Sep 2015 14:41:36 - @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -300,11 +301,8 @@ rtable_exists(u_int id)/* verify table return (1); } -/* - * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise. - */ -int -rtisvalid(struct rtentry *rt) +static inline int +rt_isvalid(struct rtentry *rt) { if (rt == NULL) return (0); @@ -319,19 +317,48 @@ rtisvalid(struct rtentry *rt) return (1); } +/* + * Returns 1 if the (cached) ``rt'' entry is still valid, 0 otherwise. + */ +int +rtisvalid(struct rtentry *rt) +{ + if (!rt_isvalid(rt)) + return (0); + + /* Next hop must also be valid. */ + if ((rt->rt_flags & RTF_GATEWAY) && !rt_isvalid(rt->rt_gwroute)) + return (0); + + return (1); +} + +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message should be sent to + * userland if no matching route has been found or if an + * error occured while adding a L2 entry. + */ struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) { struct rtentry *rt; struct rtentry *newrt = NULL; struct rt_addrinfo info; - int s, error = 0, msgtype = RTM_MISS; + int s, error = 0; - s = splsoftnet(); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; + s = splsoftnet(); rt = rtable_match(tableid, dst); if (rt != NULL) { newrt = rt; @@ -344,10 +371,6 @@ rtalloc(struct sockaddr *dst, int flags, goto miss; } rt = newrt; - if (rt->rt_flags & RTF_XRESOLVE) { - msgtype = RTM_RESOLVE; - goto miss; - } /* Inform listeners of the new route */ rt_sendmsg(rt, RTM_ADD, tableid); } else @@ -355,11 +378,8 @@ rtalloc(struct sockaddr *dst, int flags, } else { rtstat.rts_unreach++; miss: - if (ISSET(flags, RT_REPORT)) { - bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_DST] = dst; - rt_missmsg(msgtype, &info, 0, NULL, error, tableid); - } + if (ISSET(flags, RT_REPORT)) + rt_missmsg(RTM_MISS, &info, 0, NULL, error, tableid); } splx(s); return (newrt); @@ -393,6 +413,75 @@ rtalloc_mpath(struct sockaddr *dst, uint } #endif /* SMALL_KERNEL */ +/* + * Look in the routing table for the best matching entry for + * ``dst''. + * + * If a route with a gateway is found and its next hop is no + * longer valid, try to cache it. + */ +struct rtentry * +rtalloc(struct sockaddr *dst, int flags, unsigned int rtableid) +{ + struct rtentry *rt, *nhrt; + + rt = rt_match(dst, flags, rtableid); + + /* No match or route to host? We're done. */ + if (r
Re: Call for Testing: rtalloc(9) change
On 25/08/15(Tue) 12:27, Martin Pieuchot wrote: > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > > I'm currently working on the routing table interface to make is safe > > to use by multiple CPUs at the same time. The diff below is a big > > step in this direction and I'd really appreciate if people could test > > it with their usual network setup and report back. > > Updated version to match recent changes. I'm still looking for test > reports and reviews. Thanks to all the testers. Here's a third version that should fix a regression reported by phessler@. The idea is to use rtvalid(9) to check early if the gateway route is still valid or not. If it isn't then call rtalloc(9) again to perform the ARP/NDP resolution. This diff includes rtvalid(9) and makes use of it in ip{,6}_output(), more conversions will be done afterward, but these should hopefully be all the necessary bits to fix the regression. Tests & reviews are still welcome! Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.226 diff -u -p -r1.226 route.c --- net/route.c 30 Aug 2015 10:39:16 - 1.226 +++ net/route.c 31 Aug 2015 09:33:27 - @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -300,19 +301,68 @@ rtable_exists(u_int id) /* verify table return (1); } + +/* + * Do the actual checks for rtvalid(9), do not use directly! + */ +static inline int +rt_is_valid(struct rtentry *rt) +{ + if (rt == NULL) + return (0); + + if ((rt->rt_flags & RTF_UP) == 0) + return (0); + + /* Routes attached to stall ifas should be freed. */ + if (rt->rt_ifa == NULL || rt->rt_ifa->ifa_ifp == NULL) + return (0); + + return (1); +} + +/* + * Returns 1 if the cached ``rt'' entry is still valid, 0 otherwise. + */ +int +rtvalid(struct rtentry *rt) +{ + if (rt_is_valid(rt) == 0) + return (0); + + /* Next hop must also be valid. */ + if ((rt->rt_flags & RTF_GATEWAY) && rt_is_valid(rt->rt_gwroute) == 0) + return (0); + + return (1); +} + +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message should be sent to + * userland if no matching route has been found or if an + * error occured while adding a L2 entry. + */ struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) { struct rtentry *rt; struct rtentry *newrt = NULL; struct rt_addrinfo info; - int s, error = 0, msgtype = RTM_MISS; + int s, error = 0; - s = splsoftnet(); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; + s = splsoftnet(); rt = rtable_match(tableid, dst); if (rt != NULL) { newrt = rt; @@ -325,10 +375,6 @@ rtalloc(struct sockaddr *dst, int flags, goto miss; } rt = newrt; - if (rt->rt_flags & RTF_XRESOLVE) { - msgtype = RTM_RESOLVE; - goto miss; - } /* Inform listeners of the new route */ rt_sendmsg(rt, RTM_ADD, tableid); } else @@ -336,11 +382,8 @@ rtalloc(struct sockaddr *dst, int flags, } else { rtstat.rts_unreach++; miss: - if (ISSET(flags, RT_REPORT)) { - bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_DST] = dst; - rt_missmsg(msgtype, &info, 0, NULL, error, tableid); - } + if (ISSET(flags, RT_REPORT)) + rt_missmsg(RTM_MISS, &info, 0, NULL, error, tableid); } splx(s); return (newrt); @@ -374,6 +417,75 @@ rtalloc_mpath(struct sockaddr *dst, uint } #endif /* SMALL_KERNEL */ +/* + * Look in the routing table for the best matching entry for + * ``dst''. + * + * If a route with a gateway is found and its next hop is no + * longer valid, try to cache it. + */ +struct rtentry * +rtalloc(struct sockaddr *dst, int flags, unsigned int rta
Re: Call for Testing: rtalloc(9) change
Hello Martin, On Tue Aug 25 2015 12:27, Martin Pieuchot wrote: > On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > > I'm currently working on the routing table interface to make is safe > > to use by multiple CPUs at the same time. The diff below is a big > > step in this direction and I'd really appreciate if people could test > > it with their usual network setup and report back. > > Updated version to match recent changes. I'm still looking for test > reports and reviews. I'm running a kernel with this patch on my business laptop since Tuesday and had no problems. My daily working environment involves OpenVPN (tun) and DHCP. OpenBSD 5.8-current (GENERIC.MP) #2: Tue Aug 25 22:55:46 CEST 2015 nor...@theos.my.domain:/usr/src/sys/arch/amd64/compile/GENERIC.MP real mem = 4166717440 (3973MB) avail mem = 4036542464 (3849MB) mpath0 at root scsibus0 at mpath0: 256 targets mainbus0 at root bios0 at mainbus0: SMBIOS rev. 2.4 @ 0xe0010 (80 entries) bios0: vendor LENOVO version "7UET94WW (3.24 )" date 10/17/2012 bios0: LENOVO 6475BE3 acpi0 at bios0: rev 2 acpi0: sleep states S0 S3 S4 S5 acpi0: tables DSDT FACP SSDT ECDT APIC MCFG HPET SLIC BOOT ASF! SSDT TCPA DMAR SSDT SSDT SSDT acpi0: wakeup devices LID_(S3) SLPB(S3) UART(S3) IGBE(S4) EXP0(S4) EXP1(S4) EXP2(S4) EXP3(S4) EXP4(S4) PCI1(S4) USB0(S3) USB3(S3) USB5(S3) EHC0(S3) EHC1(S3) HDEF(S4) acpitimer0 at acpi0: 3579545 Hz, 24 bits acpiec0 at acpi0 acpimadt0 at acpi0 addr 0xfee0: PC-AT compat cpu0 at mainbus0: apid 0 (boot processor) cpu0: Intel(R) Core(TM)2 Duo CPU P8400 @ 2.26GHz, 2261.34 MHz cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,SSE4.1,NXE,LONG,LAHF,PERF,SENSOR cpu0: 3MB 64b/line 8-way L2 cache cpu0: smt 0, core 0, package 0 mtrr: Pentium Pro MTRR support, 7 var ranges, 88 fixed ranges cpu0: apic clock running at 266MHz cpu0: mwait min=64, max=64, C-substates=0.2.2.2.2.1.3, IBE cpu1 at mainbus0: apid 1 (application processor) cpu1: Intel(R) Core(TM)2 Duo CPU P8400 @ 2.26GHz, 2261.00 MHz cpu1: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,SSE4.1,NXE,LONG,LAHF,PERF,SENSOR cpu1: 3MB 64b/line 8-way L2 cache cpu1: smt 0, core 1, package 0 ioapic0 at mainbus0: apid 1 pa 0xfec0, version 20, 24 pins ioapic0: misconfigured as apic 2, remapped to apid 1 acpimcfg0 at acpi0 addr 0xe000, bus 0-63 acpihpet0 at acpi0: 14318179 Hz acpiprt0 at acpi0: bus 0 (PCI0) acpiprt1 at acpi0: bus -1 (AGP_) acpiprt2 at acpi0: bus 2 (EXP0) acpiprt3 at acpi0: bus 3 (EXP1) acpiprt4 at acpi0: bus -1 (EXP2) acpiprt5 at acpi0: bus 5 (EXP3) acpiprt6 at acpi0: bus 13 (EXP4) acpiprt7 at acpi0: bus 21 (PCI1) acpicpu0 at acpi0: !C3(250@17 mwait.3@0x20), !C2(500@1 mwait.1@0x10), C1(1000@1 mwait.1), PSS acpicpu1 at acpi0: !C3(250@17 mwait.3@0x20), !C2(500@1 mwait.1@0x10), C1(1000@1 mwait.1), PSS acpipwrres0 at acpi0: PUBS, resource for USB0, USB3, USB5, EHC0, EHC1 acpitz0 at acpi0: critical temperature is 127 degC acpitz1 at acpi0: critical temperature is 100 degC acpibtn0 at acpi0: LID_ acpibtn1 at acpi0: SLPB acpibat0 at acpi0: BAT0 model "42T5264" serial 3499 type LION oem "Panasonic" acpibat1 at acpi0: BAT1 not present acpiac0 at acpi0: AC unit online acpithinkpad0 at acpi0 acpidock0 at acpi0: GDCK docked (15) cpu0: Enhanced SpeedStep 2261 MHz: speeds: 2267, 2266, 1600, 800 MHz pci0 at mainbus0 bus 0 pchb0 at pci0 dev 0 function 0 "Intel GM45 Host" rev 0x07 vga1 at pci0 dev 2 function 0 "Intel GM45 Video" rev 0x07 intagp0 at vga1 agp0 at intagp0: aperture at 0xd000, size 0x1000 inteldrm0 at vga1 drm0 at inteldrm0 inteldrm0: 1440x900 wsdisplay0 at vga1 mux 1: console (std, vt100 emulation) wsdisplay0: screen 1-5 added (std, vt100 emulation) "Intel GM45 Video" rev 0x07 at pci0 dev 2 function 1 not configured "Intel GM45 HECI" rev 0x07 at pci0 dev 3 function 0 not configured em0 at pci0 dev 25 function 0 "Intel ICH9 IGP M AMT" rev 0x03: msi, address 00:1c:25:95:39:e7 uhci0 at pci0 dev 26 function 0 "Intel 82801I USB" rev 0x03: apic 1 int 20 uhci1 at pci0 dev 26 function 1 "Intel 82801I USB" rev 0x03: apic 1 int 21 uhci2 at pci0 dev 26 function 2 "Intel 82801I USB" rev 0x03: apic 1 int 22 ehci0 at pci0 dev 26 function 7 "Intel 82801I USB" rev 0x03: apic 1 int 23 usb0 at ehci0: USB revision 2.0 uhub0 at usb0 "Intel EHCI root hub" rev 2.00/1.00 addr 1 azalia0 at pci0 dev 27 function 0 "Intel 82801I HD Audio" rev 0x03: msi azalia0: codecs: Conexant CX20561 audio0 at azalia0 ppb0 at pci0 dev 28 function 0 "Intel 82801I PCIE" rev 0x03: msi pci1 at ppb0 bus 2 ppb1 at pci0 dev 28 function 1 "Intel 82801I PCIE" rev 0x03: msi pci2 at ppb1 bus 3 iwn0 at pci2 dev 0 function 0 "Intel WiFi Link 5300" rev 0x00: msi, MIMO 3T3R, MoW, address 00:16:ea:b3:62:e8 ppb2 at pci0 dev 28 function 3 "Intel 82801
Re: Call for Testing: rtalloc(9) change
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > I'm currently working on the routing table interface to make is safe > to use by multiple CPUs at the same time. The diff below is a big > step in this direction and I'd really appreciate if people could test > it with their usual network setup and report back. Updated version to match recent changes. I'm still looking for test reports and reviews. > The goal of this diff is to "cache" the route corresponding to "your" > next hop as early as possible. Let's assume you're using a common > dhcp-based network: > > mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)" > DestinationGatewayFlags Refs Use Mtu Prio Iface > default192.168.0.1UGS5 508 - 8 em0 > > Here my default route points to a gateway (G) whose address is > 192.168.0.1. In such setup your computer generally sends most of the > packets to the internet through this gateway. But to do that it needs > more informations: > > mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)" > DestinationGatewayFlags Refs Use Mtu Prio Iface > 192.168.0.1bc:05:43:bd:3e:29 UHLc 1 149 - 8 em0 > > Yes this is another route. This one contains link-layer informations > (L) and has been cloned (c). This route is what I described before as > "your" next hop. In this case, "your" is a shortcut for "the next hop > of your default route" but all of this is valid for any route pointing > to a gateway (G). > > In order to send packets via my default route, the kernel needs to know > the link-layer address corresponding to the IP address of the gateway. > This is called "Address Resolution" in network jargon. In OpenBSD, > resolved addresses appear in the routing table with a link-layer address > in the "Gateway" field, as shown previously. > > This resolution is done in the kernel by calling rtalloc(9) with the > RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1. > Once the resolution is complete, a corresponding entry appears in the > routing table and there's no need to redo it for a certain period of > time. That is what I meant with "cache". > > Currently this resolution is done "late" in the journey of a packet and > that's fine since it is not done often. Late means that it is done when > the packet reaches a L2 output function, nd6_output() or arpresolve(). > > The problem is that having a proper reference count on route entries in > these functions is really complicated because you can end up using 3 > different routes. So this diff starts the resolution early: as soon as > a gateway route is returned by rtalloc(9). > > It also makes sense to do the resolution as soon as possible since we > need the link-layer address to send the packet. > > One important point: gateway routes (rt_gwroute) are only returned to > the stack in L2 functions and when that happens, their reference > counter is not incremented. That's why the reference count for such > routes is almost always 1. They are the simplest example of working > route caching in our kernel*. That means that when you purge your > cloned route, rt_gwroute will still be valid but marked as RTP_DOWN > until a new resolution is started. > > This diff changes rt_checkgate() to only do sanity checks (finally!). > > Do not hesitate to ask questions if something is not clear, I believe > it's important that more people understand this. > > Note that this diff includes other bits to be committed separately: > > - Deprecate the use of RTF_XRESOLVE in rtalloc(9) > - Remove PF_KEY-specific code & comments now that SPD lookups no > longer use rtalloc(9). > - Make rtfree(9) accept NULL > > > * That's why I'm slowly killing "struct route" & friends to use the > simplest route caching mechanism everywhere. Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.225 diff -u -p -r1.225 route.c --- net/route.c 24 Aug 2015 22:11:33 - 1.225 +++ net/route.c 25 Aug 2015 10:23:02 - @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -297,19 +298,32 @@ rtable_exists(u_int id) /* verify table return (1); } +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message sh
Re: Call for Testing: rtalloc(9) change
On 12/08/15(Wed) 17:03, Martin Pieuchot wrote: > I'm currently working on the routing table interface to make is safe > to use by multiple CPUs at the same time. The diff below is a big > step in this direction and I'd really appreciate if people could test > it with their usual network setup and report back. Thanks to a nice report from Hrvoje Popovski, I found a bug in my diff. Please find a corrected version below. The changes with the previous diff are: - nhrt = rt_match(rt->rt_gateway, flags, rtableid); + nhrt = rt_match(rt->rt_gateway, flags | RT_RESOLVE, rtableid); if (nhrt == NULL) return (rt); [...] - if ((nhrt->rt_flags & (RTF_UP|RTF_GATEWAY)) != RTF_UP) { + if ((nhrt->rt_flags & (RTF_UP|RTF_CLONING|RTF_GATEWAY)) != RTF_UP) { rtfree(nhrt); return (rt); } > The goal of this diff is to "cache" the route corresponding to "your" > next hop as early as possible. Let's assume you're using a common > dhcp-based network: > > mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)" > DestinationGatewayFlags Refs Use Mtu Prio Iface > default192.168.0.1UGS5 508 - 8 em0 > > Here my default route points to a gateway (G) whose address is > 192.168.0.1. In such setup your computer generally sends most of the > packets to the internet through this gateway. But to do that it needs > more informations: > > mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)" > DestinationGatewayFlags Refs Use Mtu Prio Iface > 192.168.0.1bc:05:43:bd:3e:29 UHLc 1 149 - 8 em0 > > Yes this is another route. This one contains link-layer informations > (L) and has been cloned (c). This route is what I described before as > "your" next hop. In this case, "your" is a shortcut for "the next hop > of your default route" but all of this is valid for any route pointing > to a gateway (G). > > In order to send packets via my default route, the kernel needs to know > the link-layer address corresponding to the IP address of the gateway. > This is called "Address Resolution" in network jargon. In OpenBSD, > resolved addresses appear in the routing table with a link-layer address > in the "Gateway" field, as shown previously. > > This resolution is done in the kernel by calling rtalloc(9) with the > RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1. > Once the resolution is complete, a corresponding entry appears in the > routing table and there's no need to redo it for a certain period of > time. That is what I meant with "cache". > > Currently this resolution is done "late" in the journey of a packet and > that's fine since it is not done often. Late means that it is done when > the packet reaches a L2 output function, nd6_output() or arpresolve(). > > The problem is that having a proper reference count on route entries in > these functions is really complicated because you can end up using 3 > different routes. So this diff starts the resolution early: as soon as > a gateway route is returned by rtalloc(9). > > It also makes sense to do the resolution as soon as possible since we > need the link-layer address to send the packet. > > One important point: gateway routes (rt_gwroute) are only returned to > the stack in L2 functions and when that happens, their reference > counter is not incremented. That's why the reference count for such > routes is almost always 1. They are the simplest example of working > route caching in our kernel*. That means that when you purge your > cloned route, rt_gwroute will still be valid but marked as RTP_DOWN > until a new resolution is started. > > This diff changes rt_checkgate() to only do sanity checks (finally!). > > Do not hesitate to ask questions if something is not clear, I believe > it's important that more people understand this. > > Note that this diff includes other bits to be committed separately: > > - Deprecate the use of RTF_XRESOLVE in rtalloc(9) > - Remove PF_KEY-specific code & comments now that SPD lookups no > longer use rtalloc(9). > - Make rtfree(9) accept NULL > > > * That's why I'm slowly killing "struct route" & friends to use the > simplest route caching mechanism everywhere. Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.217 diff -u -p -r1.217 route.c --- net/route.c 18 Jul 2015 15:51:16 - 1.217 +++ net/route.c 13 Aug 2015 12:09:58 - @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct socka
Call for Testing: rtalloc(9) change
I'm currently working on the routing table interface to make is safe to use by multiple CPUs at the same time. The diff below is a big step in this direction and I'd really appreciate if people could test it with their usual network setup and report back. The goal of this diff is to "cache" the route corresponding to "your" next hop as early as possible. Let's assume you're using a common dhcp-based network: mpi@goiaba $ netstat -rnf inet|egrep "(default|Dest)" DestinationGatewayFlags Refs Use Mtu Prio Iface default192.168.0.1UGS5 508 - 8 em0 Here my default route points to a gateway (G) whose address is 192.168.0.1. In such setup your computer generally sends most of the packets to the internet through this gateway. But to do that it needs more informations: mpi@goiaba $ netstat -rnf inet|egrep "(192.168.0.1.*L|Dest)" DestinationGatewayFlags Refs Use Mtu Prio Iface 192.168.0.1bc:05:43:bd:3e:29 UHLc 1 149 - 8 em0 Yes this is another route. This one contains link-layer informations (L) and has been cloned (c). This route is what I described before as "your" next hop. In this case, "your" is a shortcut for "the next hop of your default route" but all of this is valid for any route pointing to a gateway (G). In order to send packets via my default route, the kernel needs to know the link-layer address corresponding to the IP address of the gateway. This is called "Address Resolution" in network jargon. In OpenBSD, resolved addresses appear in the routing table with a link-layer address in the "Gateway" field, as shown previously. This resolution is done in the kernel by calling rtalloc(9) with the RT_RESOLVE flag for the wanted destination, in my case 192.168.0.1. Once the resolution is complete, a corresponding entry appears in the routing table and there's no need to redo it for a certain period of time. That is what I meant with "cache". Currently this resolution is done "late" in the journey of a packet and that's fine since it is not done often. Late means that it is done when the packet reaches a L2 output function, nd6_output() or arpresolve(). The problem is that having a proper reference count on route entries in these functions is really complicated because you can end up using 3 different routes. So this diff starts the resolution early: as soon as a gateway route is returned by rtalloc(9). It also makes sense to do the resolution as soon as possible since we need the link-layer address to send the packet. One important point: gateway routes (rt_gwroute) are only returned to the stack in L2 functions and when that happens, their reference counter is not incremented. That's why the reference count for such routes is almost always 1. They are the simplest example of working route caching in our kernel*. That means that when you purge your cloned route, rt_gwroute will still be valid but marked as RTP_DOWN until a new resolution is started. This diff changes rt_checkgate() to only do sanity checks (finally!). Do not hesitate to ask questions if something is not clear, I believe it's important that more people understand this. Note that this diff includes other bits to be committed separately: - Deprecate the use of RTF_XRESOLVE in rtalloc(9) - Remove PF_KEY-specific code & comments now that SPD lookups no longer use rtalloc(9). - Make rtfree(9) accept NULL * That's why I'm slowly killing "struct route" & friends to use the simplest route caching mechanism everywhere. Index: net/route.c === RCS file: /cvs/src/sys/net/route.c,v retrieving revision 1.217 diff -u -p -r1.217 route.c --- net/route.c 18 Jul 2015 15:51:16 - 1.217 +++ net/route.c 12 Aug 2015 13:54:56 - @@ -153,6 +153,7 @@ int rtable_alloc(void ***, u_int); intrtflushclone1(struct rtentry *, void *, u_int); void rtflushclone(unsigned int, struct rtentry *); intrt_if_remove_rtdelete(struct rtentry *, void *, u_int); +struct rtentry *rt_match(struct sockaddr *, int, unsigned int); struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *, u_int); @@ -297,17 +298,31 @@ rtable_exists(u_int id) /* verify table return (1); } +/* + * Do the actual lookup for rtalloc(9), do not use directly! + * + * Return the best matching entry for the destination ``dst''. + * + * "RT_RESOLVE" means that a corresponding L2 entry should + * be added to the routing table and resolved (via ARP or + * NDP), if it does not exist. + * + * "RT_REPORT" indicates that a message should be sent to + * userland if no matching route has been found or if an + * error occured while adding a L2 entry. + */ struct rtentry * -rtalloc(struct sockaddr *dst, int flags, unsigned int tableid) +rt_match(struct sockaddr *dst, int flags, unsigned int tableid) { s