Author: gallatin
Date: Mon Aug  1 17:02:21 2016
New Revision: 303626
URL: https://svnweb.freebsd.org/changeset/base/303626

Log:
  Rework IPv6 TCP path MTU discovery to match IPv4
  
  - Re-write tcp6_ctlinput() to closely mimic the IPv4 tcp_ctlinput()
  
  - Now that tcp6_ctlinput() updates t_maxseg, we can allow ip6_output()
    to send TCP packets without looking at the tcp host cache for every
    single transmit.
  
  - Make the icmp6 code mimic the IPv4 code & avoid returning
    PRC_HOSTDEAD because it is so expensive.
  
  Without these changes in place, every TCP6 pmtu discovery or host
  unreachable ICMP resulted in a call to in6_pcbnotify() which walks the
  tcbinfo table with the write lock held.  Because the tcbinfo table is
  shared between IPv4 and IPv6, this causes huge scalability issues on
  servers with lots of (~100K) TCP connections, to the point where even
  a small percent of IPv6 traffic had a disproportionate impact on
  overall throughput.
  
  Reviewed by:  bz, rrs, ae (all earlier versions), lstewart (in Netflix's tree)
  Sponsored by:         Netflix
  Differential Revision:        https://reviews.freebsd.org/D7272

Modified:
  head/sys/netinet/tcp_subr.c
  head/sys/netinet6/icmp6.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Mon Aug  1 16:40:42 2016        (r303625)
+++ head/sys/netinet/tcp_subr.c Mon Aug  1 17:02:21 2016        (r303626)
@@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
+#include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
@@ -2040,72 +2041,146 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
-       struct tcphdr th;
+       struct in6_addr *dst;
+       struct tcphdr *th;
        struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
        struct ip6_hdr *ip6;
        struct mbuf *m;
+       struct inpcb *inp;
+       struct tcpcb *tp;
+       struct icmp6_hdr *icmp6;
        struct ip6ctlparam *ip6cp = NULL;
        const struct sockaddr_in6 *sa6_src = NULL;
-       int off;
-       struct tcp_portonly {
-               u_int16_t th_sport;
-               u_int16_t th_dport;
-       } *thp;
+       struct in_conninfo inc;
+       tcp_seq icmp_tcp_seq;
+       unsigned int mtu;
+       unsigned int off;
+
 
        if (sa->sa_family != AF_INET6 ||
            sa->sa_len != sizeof(struct sockaddr_in6))
                return;
 
-       if (cmd == PRC_MSGSIZE)
-               notify = tcp_mtudisc_notify;
-       else if (!PRC_IS_REDIRECT(cmd) &&
-                ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
-               return;
-
        /* if the parameter is from icmp6, decode it. */
        if (d != NULL) {
                ip6cp = (struct ip6ctlparam *)d;
+               icmp6 = ip6cp->ip6c_icmp6;
                m = ip6cp->ip6c_m;
                ip6 = ip6cp->ip6c_ip6;
                off = ip6cp->ip6c_off;
                sa6_src = ip6cp->ip6c_src;
+               dst = ip6cp->ip6c_finaldst;
        } else {
                m = NULL;
                ip6 = NULL;
                off = 0;        /* fool gcc */
                sa6_src = &sa6_any;
+               dst = NULL;
        }
 
-       if (ip6 != NULL) {
-               struct in_conninfo inc;
-               /*
-                * XXX: We assume that when IPV6 is non NULL,
-                * M and OFF are valid.
-                */
+       if (cmd == PRC_MSGSIZE)
+               notify = tcp_mtudisc_notify;
+       else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+               cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
+               ip6 != NULL)
+               notify = tcp_drop_syn_sent;
 
-               /* check if we can safely examine src and dst ports */
-               if (m->m_pkthdr.len < off + sizeof(*thp))
-                       return;
+       /*
+        * Hostdead is ugly because it goes linearly through all PCBs.
+        * XXX: We never get this from ICMP, otherwise it makes an
+        * excellent DoS attack on machines with many connections.
+        */
+       else if (cmd == PRC_HOSTDEAD)
+               ip6 = NULL;
+       else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
+               return;
 
-               bzero(&th, sizeof(th));
-               m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+       if (ip6 == NULL) {
+               in6_pcbnotify(&V_tcbinfo, sa, 0,
+                             (const struct sockaddr *)sa6_src,
+                             0, cmd, NULL, notify);
+               return;
+       }
 
-               in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
-                   (struct sockaddr *)ip6cp->ip6c_src,
-                   th.th_sport, cmd, NULL, notify);
+       /* Check if we can safely get the ports from the tcp hdr */
+       if (m == NULL ||
+           (m->m_pkthdr.len <
+               (int32_t) (off + offsetof(struct tcphdr, th_seq)))) {
+               return;
+       }
 
+       th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off);
+       INP_INFO_RLOCK(&V_tcbinfo);
+       inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport,
+           &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
+       if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
+               /* signal EHOSTDOWN, as it flushes the cached route */
+               inp = (*notify)(inp, EHOSTDOWN);
+               if (inp != NULL)
+                       INP_WUNLOCK(inp);
+       } else if (inp != NULL)  {
+               if (!(inp->inp_flags & INP_TIMEWAIT) &&
+                   !(inp->inp_flags & INP_DROPPED) &&
+                   !(inp->inp_socket == NULL)) {
+                       icmp_tcp_seq = ntohl(th->th_seq);
+                       tp = intotcpcb(inp);
+                       if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+                           SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+                               if (cmd == PRC_MSGSIZE) {
+                                       /*
+                                        * MTU discovery:
+                                        * If we got a needfrag set the MTU
+                                        * in the route to the suggested new
+                                        * value (if given) and then notify.
+                                        */
+                                       mtu = ntohl(icmp6->icmp6_mtu);
+                                       /*
+                                        * If no alternative MTU was
+                                        * proposed, or the proposed
+                                        * MTU was too small, set to
+                                        * the min.
+                                        */
+                                       if (mtu < IPV6_MMTU)
+                                               mtu = IPV6_MMTU - 8;
+
+
+                                       bzero(&inc, sizeof(inc));
+                                       inc.inc_fibnum = M_GETFIB(m);
+                                       inc.inc_flags |= INC_ISIPV6;
+                                       inc.inc6_faddr = *dst;
+                                       if (in6_setscope(&inc.inc6_faddr,
+                                               m->m_pkthdr.rcvif, NULL))
+                                               goto unlock_inp;
+
+                                       /*
+                                        * Only process the offered MTU if it
+                                        * is smaller than the current one.
+                                        */
+                                       if (mtu < tp->t_maxseg +
+                                           (sizeof (*th) + sizeof (*ip6))) {
+                                               tcp_hc_updatemtu(&inc, mtu);
+                                               tcp_mtudisc(inp, mtu);
+                                               ICMP6STAT_INC(icp6s_pmtuchg);
+                                       }
+                               } else
+                                       inp = (*notify)(inp,
+                                           inet6ctlerrmap[cmd]);
+                       }
+               }
+unlock_inp:
+               if (inp != NULL)
+                       INP_WUNLOCK(inp);
+       } else {
                bzero(&inc, sizeof(inc));
-               inc.inc_fport = th.th_dport;
-               inc.inc_lport = th.th_sport;
-               inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
-               inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+               inc.inc_fibnum = M_GETFIB(m);
                inc.inc_flags |= INC_ISIPV6;
-               INP_INFO_RLOCK(&V_tcbinfo);
-               syncache_unreach(&inc, &th);
-               INP_INFO_RUNLOCK(&V_tcbinfo);
-       } else
-               in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
-                             0, cmd, NULL, notify);
+               inc.inc_fport = th->th_dport;
+               inc.inc_lport = th->th_sport;
+               inc.inc6_faddr = *dst;
+               inc.inc6_laddr = ip6->ip6_src;
+               syncache_unreach(&inc, th);
+       }
+       INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 #endif /* INET6 */
 

Modified: head/sys/netinet6/icmp6.c
==============================================================================
--- head/sys/netinet6/icmp6.c   Mon Aug  1 16:40:42 2016        (r303625)
+++ head/sys/netinet6/icmp6.c   Mon Aug  1 17:02:21 2016        (r303626)
@@ -485,15 +485,13 @@ icmp6_input(struct mbuf **mp, int *offp,
                icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
                switch (code) {
                case ICMP6_DST_UNREACH_NOROUTE:
+               case ICMP6_DST_UNREACH_ADDR:    /* PRC_HOSTDEAD is a DOS */
                        code = PRC_UNREACH_NET;
                        break;
                case ICMP6_DST_UNREACH_ADMIN:
                        icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
                        code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
                        break;
-               case ICMP6_DST_UNREACH_ADDR:
-                       code = PRC_HOSTDEAD;
-                       break;
                case ICMP6_DST_UNREACH_BEYONDSCOPE:
                        /* I mean "source address was incorrect." */
                        code = PRC_PARAMPROB;

Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c      Mon Aug  1 16:40:42 2016        (r303625)
+++ head/sys/netinet6/ip6_output.c      Mon Aug  1 17:02:21 2016        (r303626)
@@ -150,9 +150,10 @@ static int ip6_insertfraghdr(struct mbuf
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
-       struct ifnet *, const struct in6_addr *, u_long *, int *, u_int);
+       struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
+       u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
-       u_long *, int *);
+       u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
@@ -718,7 +719,7 @@ again:
 
        /* Determine path MTU. */
        if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
-           &mtu, &alwaysfrag, fibnum)) != 0)
+                   &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
                goto bad;
 
        /*
@@ -1250,7 +1251,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
        ifp = nh6.nh_ifp;
        mtu = nh6.nh_mtu;
 
-       error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL);
+       error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
        fib6_free_nh_ext(fibnum, &nh6);
 
        return (error);
@@ -1269,7 +1270,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
-    int *alwaysfragp, u_int fibnum)
+    int *alwaysfragp, u_int fibnum, u_int proto)
 {
        struct nhop6_basic nh6;
        struct in6_addr kdst;
@@ -1307,7 +1308,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
        if (ro_pmtu->ro_rt)
                mtu = ro_pmtu->ro_rt->rt_mtu;
 
-       return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp));
+       return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
 }
 
 /*
@@ -1319,7 +1320,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
-    u_long *mtup, int *alwaysfragp)
+    u_long *mtup, int *alwaysfragp, u_int proto)
 {
        u_long mtu = 0;
        int alwaysfrag = 0;
@@ -1334,7 +1335,11 @@ ip6_calcmtu(struct ifnet *ifp, const str
                inc.inc6_faddr = *dst;
 
                ifmtu = IN6_LINKMTU(ifp);
-               mtu = tcp_hc_getmtu(&inc);
+
+               /* TCP is known to react to pmtu changes so skip hc */
+               if (proto != IPPROTO_TCP)
+                       mtu = tcp_hc_getmtu(&inc);
+
                if (mtu)
                        mtu = min(mtu, rt_mtu);
                else
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to