On Tue, Aug 06, 2013 at 09:24:13PM +0200, Christopher Zimmermann wrote: > Hi, > > I'm currently working towards an IP_SENDSRCADDR implementation. > As a first step I moved the calls to in_pcbrehash() from > in_pcb(dis)connect() and in_pcbbind() to the call sites of those > functions. This should save some calls to in_pcbrehash() in the > in_pcbconnect() codepath. > Also I improved code sharing between the IPv4 and IPv6 stacks by > implementing a generic in_pcbsetport used by both stacks. > The next step will be to replace in_pcbconnect() by in_selectsrc() > and in_pcbsetport() in udp_output() like the IPv6 stack does it. > The splsoftnet() lock could then be removed. Also implementing > IP_SENDSRCADDR will become trivial because in_selectsrc() won't > modify the struct inpcb like in_pcbconnect() does at the moment. > > Not much testing yet. Just surfing the web and sending this mail... > > I'd really like some feedback on whether I'm on the right track or > doing something stupid here. I have no experience in the OpenBSD > IP-stack yet. > >
Moving the calls of of in_pcbrehash() to outside of in_pcb.c is wrong. This is an internal function for the specific way fast PCB lookups are done at the moment. Having that all over the place in higher level code where someone does a bind or connect call is not an option IMO. This needs to be done differently, sorry. -- :wq Claudio > Christopher > > > > Index: netinet/in_pcb.c > =================================================================== > RCS file: /cvs/src/sys/netinet/in_pcb.c,v > retrieving revision 1.139 > diff -u -p -r1.139 in_pcb.c > --- netinet/in_pcb.c 1 Jun 2013 13:25:40 -0000 1.139 > +++ netinet/in_pcb.c 6 Aug 2013 18:25:54 -0000 > @@ -218,7 +218,6 @@ in_pcbbind(struct inpcb *inp, struct mbu > { > struct socket *so = inp->inp_socket; > struct inpcbtable *table = inp->inp_table; > - u_int16_t *lastport = &inp->inp_table->inpt_lastport; > struct sockaddr_in *sin; > u_int16_t lport = 0; > int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); > @@ -293,71 +292,121 @@ in_pcbbind(struct inpcb *inp, struct mbu > inp->inp_laddr = sin->sin_addr; > } > if (lport == 0) { > - u_int16_t first, last; > - int count; > + error = in_pcbsetport(inp, p); > + if (error != 0) > + return error; > + } > + else > + inp->inp_lport = lport; > + return (0); > +} > > - if (inp->inp_flags & INP_HIGHPORT) { > - first = ipport_hifirstauto; /* sysctl */ > - last = ipport_hilastauto; > - } else if (inp->inp_flags & INP_LOWPORT) { > - if ((error = suser(p, 0))) > - return (EACCES); > - first = IPPORT_RESERVED-1; /* 1023 */ > - last = 600; /* not IPPORT_RESERVED/2 */ > - } else { > - first = ipport_firstauto; /* sysctl */ > - last = ipport_lastauto; > - } > +int > +in_pcbsetport(struct inpcb *inp, struct proc *p) > +{ > + struct socket *so = inp->inp_socket; > + struct inpcbtable *table = inp->inp_table; > + u_int16_t first, last; > + u_int16_t *lastport = &inp->inp_table->inpt_lastport; > + u_int16_t lport = 0; > + int count; > + int error; > + int wild; > + u_int rtableid; > + void *faddrp, *laddrp; > +#ifdef INET6 > + extern struct in6_addr zeroin6_addr; > > - /* > - * Simple check to ensure all ports are not used up causing > - * a deadlock here. > - * > - * We split the two cases (up and down) so that the direction > - * is not being tested on each round of the loop. > - */ > + if (sotopf(so) == PF_INET6) { > + wild = INPLOOKUP_IPV6; > + rtableid = 0; > + faddrp = &zeroin6_addr; > + laddrp = &inp->inp_laddr6; > + } > + else > +#endif /* INET6 */ > + { > + wild = 0; > + rtableid = inp->inp_rtableid; > + faddrp = &zeroin_addr; > + laddrp = &inp->inp_laddr; > + } > > - if (first > last) { > - /* > - * counting down > - */ > - count = first - last; > - if (count) > - *lastport = first - arc4random_uniform(count); > + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && > + ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || > + (so->so_options & SO_ACCEPTCONN) == 0)) > + wild |= INPLOOKUP_WILDCARD; > > - do { > - if (count-- < 0) /* completely used? */ > - return (EADDRNOTAVAIL); > - --*lastport; > - if (*lastport > first || *lastport < last) > - *lastport = first; > - lport = htons(*lastport); > - } while (in_baddynamic(*lastport, > so->so_proto->pr_protocol) || > - in_pcblookup(table, &zeroin_addr, 0, > - &inp->inp_laddr, lport, wild, inp->inp_rtableid)); > - } else { > - /* > - * counting up > - */ > - count = last - first; > - if (count) > - *lastport = first + arc4random_uniform(count); > + if (inp->inp_flags & INP_HIGHPORT) { > + first = ipport_hifirstauto; /* sysctl */ > + last = ipport_hilastauto; > + } else if (inp->inp_flags & INP_LOWPORT) { > + if ((error = suser(p, 0))) > + return (EACCES); > + first = IPPORT_RESERVED-1; /* 1023 */ > + last = 600; /* not IPPORT_RESERVED/2 */ > + } else { > + first = ipport_firstauto; /* sysctl */ > + last = ipport_lastauto; > + } > > - do { > - if (count-- < 0) /* completely used? */ > - return (EADDRNOTAVAIL); > - ++*lastport; > - if (*lastport < first || *lastport > last) > - *lastport = first; > - lport = htons(*lastport); > - } while (in_baddynamic(*lastport, > so->so_proto->pr_protocol) || > - in_pcblookup(table, &zeroin_addr, 0, > - &inp->inp_laddr, lport, wild, inp->inp_rtableid)); > - } > + /* > + * Simple check to ensure all ports are not used up causing > + * a deadlock here. > + * > + * We split the two cases (up and down) so that the direction > + * is not being tested on each round of the loop. > + * XXX: Does this optimization really have a relevant effect? > + * How many iterations can we expect? > + * What about branch prediction? > + * Shouldn't we forbid the first > last case completely? > + */ > + > + if (first > last) { > + /* > + * counting down > + */ > + count = first - last; > + if (count) > + *lastport = first - arc4random_uniform(count); > + > + do { > + if (count-- < 0) /* completely used? */ > + return (EADDRNOTAVAIL); > + --*lastport; > + if (*lastport > first || *lastport < last) > + *lastport = first; > + lport = htons(*lastport); > + } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || > + in_pcblookup(table, faddrp, 0, > + laddrp, lport, wild, rtableid)); > + } else { > + /* > + * counting up > + */ > + count = last - first; > + if (count) > + *lastport = first + arc4random_uniform(count); > + > + do { > + if (count-- < 0) /* completely used? */ > + return (EADDRNOTAVAIL); > + ++*lastport; > + if (*lastport < first || *lastport > last) > + *lastport = first; > + lport = htons(*lastport); > + } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || > + in_pcblookup(table, faddrp, 0, > + laddrp, lport, wild, rtableid)); > } > + > inp->inp_lport = lport; > - in_pcbrehash(inp); > - return (0); > + > +#if 0 > + inp->inp_flowinfo = 0; /* XXX */ > +#endif > + > + return 0; > } > > /* > @@ -418,13 +467,12 @@ in_pcbconnect(struct inpcb *inp, struct > return (EADDRINUSE); > if (inp->inp_laddr.s_addr == INADDR_ANY) { > if (inp->inp_lport == 0 && > - in_pcbbind(inp, NULL, curproc) == EADDRNOTAVAIL) > - return (EADDRNOTAVAIL); > + in_pcbsetport(inp, curproc) == EADDRNOTAVAIL) > + return EADDRNOTAVAIL; > inp->inp_laddr = ifaddr->sin_addr; > } > inp->inp_faddr = sin->sin_addr; > inp->inp_fport = sin->sin_port; > - in_pcbrehash(inp); > #ifdef IPSEC > { > int error; /* This is just ignored */ > @@ -452,7 +500,6 @@ in_pcbdisconnect(struct inpcb *inp) > } > > inp->inp_fport = 0; > - in_pcbrehash(inp); > if (inp->inp_socket->so_state & SS_NOFDREF) > in_pcbdetach(inp); > } > Index: netinet/in_pcb.h > =================================================================== > RCS file: /cvs/src/sys/netinet/in_pcb.h,v > retrieving revision 1.79 > diff -u -p -r1.79 in_pcb.h > --- netinet/in_pcb.h 31 May 2013 13:15:53 -0000 1.79 > +++ netinet/in_pcb.h 6 Aug 2013 18:25:54 -0000 > @@ -297,6 +297,6 @@ int in6_pcbnotify(struct inpcbtable *, s > u_int, const struct sockaddr_in6 *, u_int, int, void *, > void (*)(struct inpcb *, int)); > int in6_selecthlim(struct inpcb *, struct ifnet *); > -int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *); > +int in_pcbsetport(struct inpcb *, struct proc *); > #endif /* _KERNEL */ > #endif /* _NETINET_IN_PCB_H_ */ > Index: netinet/ip_divert.c > =================================================================== > RCS file: /cvs/src/sys/netinet/ip_divert.c,v > retrieving revision 1.13 > diff -u -p -r1.13 ip_divert.c > --- netinet/ip_divert.c 8 Apr 2013 15:32:23 -0000 1.13 > +++ netinet/ip_divert.c 6 Aug 2013 18:25:55 -0000 > @@ -319,6 +319,8 @@ divert_usrreq(struct socket *so, int req > case PRU_BIND: > s = splsoftnet(); > error = in_pcbbind(inp, addr, p); > + if (error == 0) > + in_pcbrehash(inp); > splx(s); > break; > > Index: netinet/tcp_input.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_input.c,v > retrieving revision 1.265 > diff -u -p -r1.265 tcp_input.c > --- netinet/tcp_input.c 1 Jul 2013 10:53:52 -0000 1.265 > +++ netinet/tcp_input.c 6 Aug 2013 18:26:01 -0000 > @@ -3820,6 +3820,7 @@ syn_cache_get(struct sockaddr *src, stru > break; > #endif > } > + in_pcbrehash(inp); > (void) m_free(am); > > tp = intotcpcb(inp); > Index: netinet/tcp_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v > retrieving revision 1.112 > diff -u -p -r1.112 tcp_usrreq.c > --- netinet/tcp_usrreq.c 17 May 2013 09:04:30 -0000 1.112 > +++ netinet/tcp_usrreq.c 6 Aug 2013 18:26:02 -0000 > @@ -228,6 +228,7 @@ tcp_usrreq(so, req, m, nam, control, p) > error = in_pcbbind(inp, nam, p); > if (error) > break; > + in_pcbrehash(inp); > break; > > /* > @@ -244,8 +245,10 @@ tcp_usrreq(so, req, m, nam, control, p) > } > /* If the in_pcbbind() above is called, the tp->pf > should still be whatever it was before. */ > - if (error == 0) > + if (error == 0) { > + in_pcbrehash(inp); > tp->t_state = TCPS_LISTEN; > + } > break; > > /* > @@ -304,6 +307,8 @@ tcp_usrreq(so, req, m, nam, control, p) > error = ENOBUFS; > break; > } > + > + in_pcbrehash(inp); > > so->so_state |= SS_CONNECTOUT; > > Index: netinet/udp_usrreq.c > =================================================================== > RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v > retrieving revision 1.164 > diff -u -p -r1.164 udp_usrreq.c > --- netinet/udp_usrreq.c 9 Jun 2013 22:03:06 -0000 1.164 > +++ netinet/udp_usrreq.c 6 Aug 2013 18:26:04 -0000 > @@ -1164,6 +1164,8 @@ udp_usrreq(struct socket *so, int req, s > else > #endif > error = in_pcbbind(inp, addr, p); > + if (error == 0) > + in_pcbrehash(inp); > splx(s); > break; > > @@ -1180,7 +1182,6 @@ udp_usrreq(struct socket *so, int req, s > } > s = splsoftnet(); > error = in6_pcbconnect(inp, addr); > - splx(s); > } else > #endif /* INET6 */ > { > @@ -1190,8 +1191,9 @@ udp_usrreq(struct socket *so, int req, s > } > s = splsoftnet(); > error = in_pcbconnect(inp, addr); > - splx(s); > } > + in_pcbrehash(inp); > + splx(s); > > if (error == 0) > soisconnected(so); > @@ -1229,6 +1231,7 @@ udp_usrreq(struct socket *so, int req, s > #endif /* INET6 */ > inp->inp_laddr.s_addr = INADDR_ANY; > in_pcbdisconnect(inp); > + in_pcbrehash(inp); > > splx(s); > so->so_state &= ~SS_ISCONNECTED; /* XXX */ > Index: netinet6/in6_pcb.c > =================================================================== > RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v > retrieving revision 1.56 > diff -u -p -r1.56 in6_pcb.c > --- netinet6/in6_pcb.c 31 May 2013 15:04:24 -0000 1.56 > +++ netinet6/in6_pcb.c 6 Aug 2013 18:26:05 -0000 > @@ -267,8 +267,8 @@ in6_pcbbind(struct inpcb *inp, struct mb > return error; > > t = in_pcblookup(head, > - (struct in_addr *)&zeroin6_addr, 0, > - (struct in_addr *)&sin6->sin6_addr, lport, > + &zeroin6_addr, 0, > + &sin6->sin6_addr, lport, > wild, /* XXX */ 0); > > if (t && (reuseport & t->inp_socket->so_options) == 0) > @@ -278,101 +278,11 @@ in6_pcbbind(struct inpcb *inp, struct mb > } > > if (lport == 0) { > - error = in6_pcbsetport(&inp->inp_laddr6, inp, p); > + error = in_pcbsetport(inp, p); > if (error != 0) > return error; > - } else { > + } else > inp->inp_lport = lport; > - in_pcbrehash(inp); > - } > - > - return 0; > -} > - > -int > -in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p) > -{ > - struct socket *so = inp->inp_socket; > - struct inpcbtable *table = inp->inp_table; > - u_int16_t first, last; > - u_int16_t *lastport = &inp->inp_table->inpt_lastport; > - u_int16_t lport = 0; > - int count; > - int wild = INPLOOKUP_IPV6; > - int error; > - > - /* XXX we no longer support IPv4 mapped address, so no tweaks here */ > - > - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && > - ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || > - (so->so_options & SO_ACCEPTCONN) == 0)) > - wild |= INPLOOKUP_WILDCARD; > - > - if (inp->inp_flags & INP_HIGHPORT) { > - first = ipport_hifirstauto; /* sysctl */ > - last = ipport_hilastauto; > - } else if (inp->inp_flags & INP_LOWPORT) { > - if ((error = suser(p, 0))) > - return (EACCES); > - first = IPPORT_RESERVED-1; /* 1023 */ > - last = 600; /* not IPPORT_RESERVED/2 */ > - } else { > - first = ipport_firstauto; /* sysctl */ > - last = ipport_lastauto; > - } > - > - /* > - * Simple check to ensure all ports are not used up causing > - * a deadlock here. > - * > - * We split the two cases (up and down) so that the direction > - * is not being tested on each round of the loop. > - */ > - > - if (first > last) { > - /* > - * counting down > - */ > - count = first - last; > - if (count) > - *lastport = first - arc4random_uniform(count); > - > - do { > - if (count-- < 0) /* completely used? */ > - return (EADDRNOTAVAIL); > - --*lastport; > - if (*lastport > first || *lastport < last) > - *lastport = first; > - lport = htons(*lastport); > - } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || > - in_pcblookup(table, &zeroin6_addr, 0, > - &inp->inp_laddr6, lport, wild, /* XXX */ 0)); > - } else { > - /* > - * counting up > - */ > - count = last - first; > - if (count) > - *lastport = first + arc4random_uniform(count); > - > - do { > - if (count-- < 0) /* completely used? */ > - return (EADDRNOTAVAIL); > - ++*lastport; > - if (*lastport < first || *lastport > last) > - *lastport = first; > - lport = htons(*lastport); > - } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || > - in_pcblookup(table, &zeroin6_addr, 0, > - &inp->inp_laddr6, lport, wild, /* XXX */ 0)); > - } > - > - inp->inp_lport = lport; > - in_pcbrehash(inp); > - > -#if 0 > - inp->inp_flowinfo = 0; /* XXX */ > -#endif > > return 0; > } > @@ -449,8 +359,9 @@ in6_pcbconnect(struct inpcb *inp, struct > return (EADDRINUSE); > } > if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) { > - if (inp->inp_lport == 0) > - (void)in6_pcbbind(inp, NULL, curproc); > + if (inp->inp_lport == 0 && > + in_pcbsetport(inp, curproc) == EADDRNOTAVAIL) > + return EADDRNOTAVAIL; > inp->inp_laddr6 = *in6a; > } > inp->inp_faddr6 = sin6->sin6_addr; > @@ -459,7 +370,6 @@ in6_pcbconnect(struct inpcb *inp, struct > if (ip6_auto_flowlabel) > inp->inp_flowinfo |= > (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); > - in_pcbrehash(inp); > return (0); > } > > Index: netinet6/ip6_divert.c > =================================================================== > RCS file: /cvs/src/sys/netinet6/ip6_divert.c,v > retrieving revision 1.13 > diff -u -p -r1.13 ip6_divert.c > --- netinet6/ip6_divert.c 26 Jun 2013 09:12:40 -0000 1.13 > +++ netinet6/ip6_divert.c 6 Aug 2013 18:26:05 -0000 > @@ -314,6 +314,8 @@ divert6_usrreq(struct socket *so, int re > case PRU_BIND: > s = splsoftnet(); > error = in6_pcbbind(inp, addr, p); > + if (error == 0) > + in_pcbrehash(inp); > splx(s); > break; > > Index: netinet6/udp6_output.c > =================================================================== > RCS file: /cvs/src/sys/netinet6/udp6_output.c,v > retrieving revision 1.19 > diff -u -p -r1.19 udp6_output.c > --- netinet6/udp6_output.c 28 Mar 2013 16:45:16 -0000 1.19 > +++ netinet6/udp6_output.c 6 Aug 2013 18:26:05 -0000 > @@ -194,9 +194,12 @@ udp6_output(struct in6pcb *in6p, struct > error = EADDRNOTAVAIL; > goto release; > } > - if (in6p->in6p_lport == 0 && > - (error = in6_pcbsetport(laddr, in6p, p)) != 0) > - goto release; > + if (in6p->in6p_lport == 0) { > + error = in_pcbsetport(in6p, p); > + if (error) > + goto release; > + in_pcbrehash(in6p); > + } > } else { > if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { > error = ENOTCONN; > > > -- > http://gmerlin.de > OpenPGP: http://gmerlin.de/christopher.pub > 1917 680A 723C BF3D 2CA3 0E44 7E24 D19F 34B8 2A2A