The branch main has been updated by tuexen:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=9e644c23000c2f5028b235f6263d17ffb24d3605

commit 9e644c23000c2f5028b235f6263d17ffb24d3605
Author:     Michael Tuexen <[email protected]>
AuthorDate: 2021-04-18 14:08:08 +0000
Commit:     Michael Tuexen <[email protected]>
CommitDate: 2021-04-18 14:16:42 +0000

    tcp: add support for TCP over UDP
    
    Adding support for TCP over UDP allows communication with
    TCP stacks which can be implemented in userspace without
    requiring special privileges or specific support by the OS.
    This is joint work with rrs.
    
    Reviewed by:            rrs
    Sponsored by:           Netflix, Inc.
    MFC after:              1 week
    Differential Revision:  https://reviews.freebsd.org/D29469
---
 share/man/man4/tcp.4          |  15 +-
 sys/netinet/tcp.h             |   1 +
 sys/netinet/tcp_input.c       |  47 ++++-
 sys/netinet/tcp_output.c      |  80 ++++++--
 sys/netinet/tcp_stacks/bbr.c  |  38 +---
 sys/netinet/tcp_stacks/rack.c |  26 +--
 sys/netinet/tcp_subr.c        | 462 ++++++++++++++++++++++++++++++++++++++++--
 sys/netinet/tcp_syncache.c    | 127 +++++++++---
 sys/netinet/tcp_syncache.h    |  12 +-
 sys/netinet/tcp_timewait.c    |  84 ++++++--
 sys/netinet/tcp_usrreq.c      |  30 +++
 sys/netinet/tcp_var.h         |  27 ++-
 sys/netinet/toecore.c         |   4 +-
 sys/netinet6/tcp6_var.h       |   2 +
 sys/sys/mbuf.h                |   1 +
 usr.bin/netstat/inet.c        |   4 +
 usr.bin/sockstat/sockstat.1   |   6 +-
 usr.bin/sockstat/sockstat.c   |  13 +-
 18 files changed, 821 insertions(+), 158 deletions(-)

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index cbb8021226fe..873cfe4b822a 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4        8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd April 17, 2021
+.Dd April 18, 2021
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -329,6 +329,9 @@ currently executing.
 This is typically used after a process or thread inherits a listen
 socket from its parent, and sets its CPU affinity to a particular core.
 .El
+.It Dv TCP_REMOTE_UDP_ENCAPS_PORT
+Set and get the remote UDP encapsulation port.
+It can only be set on a closed TCP socket.
 .El
 .Pp
 The option level for the
@@ -755,6 +758,16 @@ A CSV list of template_spec=percent key-value pairs which 
controls the per
 template sampling rates when
 .Xr stats 3
 sampling is enabled.
+.It Va udp_tunneling_port
+The local UDP encapsulation port.
+A value of 0 indicates that UDP encapsulation is disabled.
+The default is 0.
+.It Va udp_tunneling_overhead
+The overhead taken into account when using UDP encapsulation.
+Since MSS clamping by middleboxes will most likely not work, values larger than
+8 (the size of the UDP header) are also supported.
+Supported values are between 8 and 1024.
+The default is 8.
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 0b71bd4658f8..d2bf1f8431fd 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -183,6 +183,7 @@ struct tcphdr {
 #define        TCP_RXTLS_MODE  42      /* Receive TLS mode */
 #define        TCP_CONGESTION  64      /* get/set congestion control algorithm 
*/
 #define        TCP_CCALGOOPT   65      /* get/set cc algorithm specific 
options */
+#define TCP_REMOTE_UDP_ENCAPS_PORT 71  /* Enable TCP over UDP tunneling via 
the specified port */
 #define TCP_DELACK     72      /* socket option for delayed ack */
 #define TCP_FIN_IS_RST 73      /* A fin from the peer is treated has a RST */
 #define TCP_LOG_LIMIT  74      /* Limit to number of records in tcp-log */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index ed184de4a4bf..8592f3313725 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$");
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
+#include <netinet/udp.h>
 
 #include <netipsec/ipsec_support.h>
 
@@ -567,7 +568,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, 
uint8_t iptos)
  */
 #ifdef INET6
 int
-tcp6_input(struct mbuf **mp, int *offp, int proto)
+tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
        struct mbuf *m;
        struct in6_ifaddr *ia6;
@@ -597,12 +598,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
        }
 
        *mp = m;
-       return (tcp_input(mp, offp, proto));
+       return (tcp_input_with_port(mp, offp, proto, port));
+}
+
+int
+tcp6_input(struct mbuf **mp, int *offp, int proto)
+{
+
+       return(tcp6_input_with_port(mp, offp, proto, 0));
 }
 #endif /* INET6 */
 
 int
-tcp_input(struct mbuf **mp, int *offp, int proto)
+tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
        struct mbuf *m = *mp;
        struct tcphdr *th = NULL;
@@ -659,6 +667,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
                ip6 = mtod(m, struct ip6_hdr *);
                th = (struct tcphdr *)((caddr_t)ip6 + off0);
                tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+               if (port)
+                       goto skip6_csum;
                if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
                        if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
                                th->th_sum = m->m_pkthdr.csum_data;
@@ -672,7 +682,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
                        TCPSTAT_INC(tcps_rcvbadsum);
                        goto drop;
                }
-
+       skip6_csum:
                /*
                 * Be proactive about unspecified IPv6 address in source.
                 * As we use all-zero to indicate unbounded/unconnected pcb,
@@ -713,6 +723,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
                tlen = ntohs(ip->ip_len) - off0;
 
                iptos = ip->ip_tos;
+               if (port)
+                       goto skip_csum;
                if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
                        if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
                                th->th_sum = m->m_pkthdr.csum_data;
@@ -742,8 +754,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
                        ip->ip_v = IPVERSION;
                        ip->ip_hl = off0 >> 2;
                }
-
-               if (th->th_sum) {
+       skip_csum:
+               if (th->th_sum && (port == 0)) {
                        TCPSTAT_INC(tcps_rcvbadsum);
                        goto drop;
                }
@@ -1004,6 +1016,11 @@ findpcb:
                goto dropwithreset;
        }
 
+       if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
+               rstreason = BANDLIM_RST_CLOSEDPORT;
+               goto dropwithreset;
+       }
+
 #ifdef TCP_OFFLOAD
        if (tp->t_flags & TF_TOE) {
                tcp_offload_input(tp, m);
@@ -1074,7 +1091,7 @@ findpcb:
                         * NB: syncache_expand() doesn't unlock
                         * inp and tcpinfo locks.
                         */
-                       rstreason = syncache_expand(&inc, &to, th, &so, m);
+                       rstreason = syncache_expand(&inc, &to, th, &so, m, 
port);
                        if (rstreason < 0) {
                                /*
                                 * A failing TCP MD5 signature comparison
@@ -1156,7 +1173,7 @@ tfo_socket_result:
                 * causes.
                 */
                if (thflags & TH_RST) {
-                       syncache_chkrst(&inc, th, m);
+                       syncache_chkrst(&inc, th, m, port);
                        goto dropunlock;
                }
                /*
@@ -1178,7 +1195,7 @@ tfo_socket_result:
                                log(LOG_DEBUG, "%s; %s: Listen socket: "
                                    "SYN|ACK invalid, segment rejected\n",
                                    s, __func__);
-                       syncache_badack(&inc);  /* XXX: Not needed! */
+                       syncache_badack(&inc, port);    /* XXX: Not needed! */
                        TCPSTAT_INC(tcps_badsyn);
                        rstreason = BANDLIM_RST_OPENPORT;
                        goto dropwithreset;
@@ -1337,7 +1354,7 @@ tfo_socket_result:
                TCP_PROBE3(debug__input, tp, th, m);
                tcp_dooptions(&to, optp, optlen, TO_SYN);
                if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL,
-                   iptos)) != NULL)
+                   iptos, port)) != NULL)
                        goto tfo_socket_result;
 
                /*
@@ -1468,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct 
socket *so,
        return (newsize);
 }
 
+int
+tcp_input(struct mbuf **mp, int *offp, int proto)
+{
+       return(tcp_input_with_port(mp, offp, proto, 0));
+}
+
 void
 tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
 {
@@ -3672,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int 
mtuoffer,
                            sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
                            sizeof (struct tcpiphdr);
 #else
-       const size_t min_protoh = sizeof(struct tcpiphdr);
+        size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
+       if (tp->t_port)
+               min_protoh += V_tcp_udp_tunneling_overhead;
        if (mtuoffer != -1) {
                KASSERT(offer == -1, ("%s: conflict", __func__));
                offer = mtuoffer - min_protoh;
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index e23cdc749e98..5bda2be14df0 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$");
 
 #include <netipsec/ipsec_support.h>
 
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
@@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp)
 #endif
        struct tcphdr *th;
        u_char opt[TCP_MAXOLEN];
-       unsigned ipoptlen, optlen, hdrlen;
+       unsigned ipoptlen, optlen, hdrlen, ulen;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
        unsigned ipsec_optlen = 0;
 #endif
@@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp)
        struct sackhole *p;
        int tso, mtu;
        struct tcpopt to;
+       struct udphdr *udp = NULL;
        unsigned int wanted_cookie = 0;
        unsigned int dont_sendalot = 0;
 #if 0
@@ -558,6 +561,7 @@ after_sack_rexmit:
 #endif
 
        if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+           (tp->t_port == 0) &&
            ((tp->t_flags & TF_SIGNATURE) == 0) &&
            tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
            ipoptlen == 0 && !(flags & TH_SYN))
@@ -800,6 +804,8 @@ send:
                /* Maximum segment size. */
                if (flags & TH_SYN) {
                        to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
+                       if (tp->t_port)
+                               to.to_mss -= V_tcp_udp_tunneling_overhead;
                        to.to_flags |= TOF_MSS;
 
                        /*
@@ -887,7 +893,14 @@ send:
                    !(to.to_flags & TOF_FASTOPEN))
                        len = 0;
        }
-
+       if (tp->t_port) {
+               if (V_tcp_udp_tunneling_port == 0) {
+                       /* The port was removed?? */
+                       SOCKBUF_UNLOCK(&so->so_snd);
+                       return (EHOSTUNREACH);
+               }
+               hdrlen += sizeof(struct udphdr);
+       }
        /*
         * Adjust data length if insertion of options will
         * bump the packet length beyond the t_maxseg length.
@@ -1140,8 +1153,17 @@ send:
 #ifdef INET6
        if (isipv6) {
                ip6 = mtod(m, struct ip6_hdr *);
-               th = (struct tcphdr *)(ip6 + 1);
-               tcpip_fillheaders(tp->t_inpcb, ip6, th);
+               if (tp->t_port) {
+                       udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + 
sizeof(struct ip6_hdr));
+                       udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+                       udp->uh_dport = tp->t_port;
+                       ulen = hdrlen + len - sizeof(struct ip6_hdr);
+                       udp->uh_ulen = htons(ulen);
+                       th = (struct tcphdr *)(udp + 1);
+               } else {
+                       th = (struct tcphdr *)(ip6 + 1);
+               }
+               tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th);
        } else
 #endif /* INET6 */
        {
@@ -1149,8 +1171,16 @@ send:
 #ifdef TCPDEBUG
                ipov = (struct ipovly *)ip;
 #endif
-               th = (struct tcphdr *)(ip + 1);
-               tcpip_fillheaders(tp->t_inpcb, ip, th);
+               if (tp->t_port) {
+                       udp = (struct udphdr *)((caddr_t)ip + ipoptlen + 
sizeof(struct ip));
+                       udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+                       udp->uh_dport = tp->t_port;
+                       ulen = hdrlen + len - sizeof(struct ip);
+                       udp->uh_ulen = htons(ulen);
+                       th = (struct tcphdr *)(udp + 1);
+               } else
+                       th = (struct tcphdr *)(ip + 1);
+               tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th);
        }
 
        /*
@@ -1309,7 +1339,6 @@ send:
         * checksum extended header and data.
         */
        m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
-       m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
        if (to.to_flags & TOF_SIGNATURE) {
@@ -1336,9 +1365,19 @@ send:
                 * There is no need to fill in ip6_plen right now.
                 * It will be filled later by ip6_output.
                 */
-               m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
-               th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
-                   optlen + len, IPPROTO_TCP, 0);
+               if (tp->t_port) {
+                       m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+                       m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+                       udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 
0);
+                       th->th_sum = htons(0);
+                       UDPSTAT_INC(udps_opackets);
+               } else {
+                       m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+                       m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+                       th->th_sum = in6_cksum_pseudo(ip6,
+                           sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+                           0);
+               }
        }
 #endif
 #if defined(INET6) && defined(INET)
@@ -1346,9 +1385,20 @@ send:
 #endif
 #ifdef INET
        {
-               m->m_pkthdr.csum_flags = CSUM_TCP;
-               th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
-                   htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
+               if (tp->t_port) {
+                       m->m_pkthdr.csum_flags = CSUM_UDP;
+                       m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+                       udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+                          ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+                       th->th_sum = htons(0);
+                       UDPSTAT_INC(udps_opackets);
+               } else {
+                       m->m_pkthdr.csum_flags = CSUM_TCP;
+                       m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+                       th->th_sum = in_pseudo(ip->ip_src.s_addr,
+                           ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+                           IPPROTO_TCP + len + optlen));
+               }
 
                /* IP version must be set here for ipv4/ipv6 checking later */
                KASSERT(ip->ip_v == IPVERSION,
@@ -1473,8 +1523,10 @@ send:
         * NB: Don't set DF on small MTU/MSS to have a safe fallback.
         */
        if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
-               ip->ip_off |= htons(IP_DF);
                tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+               if (tp->t_port == 0 || len < V_tcp_minmss) {
+                       ip->ip_off |= htons(IP_DF);
+               }
        } else {
                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
        }
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 673dee911c87..febac7ad424c 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct 
timeval *tv)
 #endif
        struct tcp_bbr *bbr;
        struct tcphdr *th;
-#ifdef NETFLIX_TCPOUDP
        struct udphdr *udp = NULL;
-#endif
        u_char opt[TCP_MAXOLEN];
        unsigned ipoptlen, optlen, hdrlen;
-#ifdef NETFLIX_TCPOUDP
        unsigned ulen;
-#endif
        uint32_t bbr_seq;
        uint32_t delay_calc=0;
        uint8_t doing_tlp = 0;
@@ -12991,10 +12987,8 @@ send:
                /* Maximum segment size. */
                if (flags & TH_SYN) {
                        to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
                        if (tp->t_port)
                                to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
                        to.to_flags |= TOF_MSS;
                        /*
                         * On SYN or SYN|ACK transmits on TFO connections,
@@ -13063,7 +13057,6 @@ send:
                    !(to.to_flags & TOF_FASTOPEN))
                        len = 0;
        }
-#ifdef NETFLIX_TCPOUDP
        if (tp->t_port) {
                if (V_tcp_udp_tunneling_port == 0) {
                        /* The port was removed?? */
@@ -13072,7 +13065,6 @@ send:
                }
                hdrlen += sizeof(struct udphdr);
        }
-#endif
 #ifdef INET6
        if (isipv6)
                ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13408,7 +13400,6 @@ send:
 #ifdef INET6
        if (isipv6) {
                ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + 
sizeof(struct ip6_hdr));
                        udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13417,17 +13408,9 @@ send:
                        udp->uh_ulen = htons(ulen);
                        th = (struct tcphdr *)(udp + 1);
                } else {
-#endif
                        th = (struct tcphdr *)(ip6 + 1);
-
-#ifdef NETFLIX_TCPOUDP
                }
-#endif
-               tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-                                 tp->t_port,
-#endif
-                                 ip6, th);
+               tcpip_fillheaders(inp, tp->t_port, ip6, th);
        } else
 #endif                         /* INET6 */
        {
@@ -13435,7 +13418,6 @@ send:
 #ifdef TCPDEBUG
                ipov = (struct ipovly *)ip;
 #endif
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        udp = (struct udphdr *)((caddr_t)ip + ipoptlen + 
sizeof(struct ip));
                        udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13443,14 +13425,10 @@ send:
                        ulen = hdrlen + len - sizeof(struct ip);
                        udp->uh_ulen = htons(ulen);
                        th = (struct tcphdr *)(udp + 1);
-               } else
-#endif
+               } else {
                        th = (struct tcphdr *)(ip + 1);
-               tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-                                 tp->t_port,
-#endif
-                                 ip, th);
+               }
+               tcpip_fillheaders(inp, tp->t_port, ip, th);
        }
        /*
         * If we are doing retransmissions, then snd_nxt will not reflect
@@ -13600,7 +13578,6 @@ send:
                 * ip6_plen is not need to be filled now, and will be filled
                 * in ip6_output.
                 */
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
                        m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13608,14 +13585,11 @@ send:
                        th->th_sum = htons(0);
                        UDPSTAT_INC(udps_opackets);
                } else {
-#endif
                        csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
                        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
                        th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct 
tcphdr) +
                            optlen + len, IPPROTO_TCP, 0);
-#ifdef NETFLIX_TCPOUDP
                }
-#endif
        }
 #endif
 #if defined(INET6) && defined(INET)
@@ -13623,7 +13597,6 @@ send:
 #endif
 #ifdef INET
        {
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        m->m_pkthdr.csum_flags = CSUM_UDP;
                        m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13632,15 +13605,12 @@ send:
                        th->th_sum = htons(0);
                        UDPSTAT_INC(udps_opackets);
                } else {
-#endif
                        csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
                        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
                        th->th_sum = in_pseudo(ip->ip_src.s_addr,
                            ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
                            IPPROTO_TCP + len + optlen));
-#ifdef NETFLIX_TCPOUDP
                }
-#endif
                /* IP version must be set here for ipv4/ipv6 checking later */
                KASSERT(ip->ip_v == IPVERSION,
                    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0079bf8b6400..d2093e1afab7 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -13008,10 +13008,8 @@ send:
                if (flags & TH_SYN) {
                        tp->snd_nxt = tp->iss;
                        to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
                        if (tp->t_port)
                                to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
                        to.to_flags |= TOF_MSS;
 
                        /*
@@ -13088,7 +13086,6 @@ send:
                    !(to.to_flags & TOF_FASTOPEN))
                        len = 0;
        }
-#ifdef NETFLIX_TCPOUDP
        if (tp->t_port) {
                if (V_tcp_udp_tunneling_port == 0) {
                        /* The port was removed?? */
@@ -13097,7 +13094,6 @@ send:
                }
                hdrlen += sizeof(struct udphdr);
        }
-#endif
 #ifdef INET6
        if (isipv6)
                ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13372,7 +13368,6 @@ send:
 #ifdef INET6
        if (isipv6) {
                ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + 
sizeof(struct ip6_hdr));
                        udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13380,14 +13375,10 @@ send:
                        ulen = hdrlen + len - sizeof(struct ip6_hdr);
                        udp->uh_ulen = htons(ulen);
                        th = (struct tcphdr *)(udp + 1);
-               } else
-#endif
+               } else {
                        th = (struct tcphdr *)(ip6 + 1);
-               tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-                                 tp->t_port,
-#endif
-                                 ip6, th);
+               }
+               tcpip_fillheaders(inp, tp->t_port, ip6, th);
        } else
 #endif                         /* INET6 */
        {
@@ -13395,7 +13386,6 @@ send:
 #ifdef TCPDEBUG
                ipov = (struct ipovly *)ip;
 #endif
-#ifdef NETFLIX_TCPOUDP
                if (tp->t_port) {
                        udp = (struct udphdr *)((caddr_t)ip + ipoptlen + 
sizeof(struct ip));
                        udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13403,14 +13393,10 @@ send:
                        ulen = hdrlen + len - sizeof(struct ip);
                        udp->uh_ulen = htons(ulen);
                        th = (struct tcphdr *)(udp + 1);
-               } else
-#endif
+               } else {
                        th = (struct tcphdr *)(ip + 1);
-               tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-                                 tp->t_port,
-#endif
-                                 ip, th);
+               }
+               tcpip_fillheaders(inp, tp->t_port, ip, th);
        }
        /*
         * Fill in fields, remembering maximum advertised window for use in
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index e973555efbcb..1ebc7357def3 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$");
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
 
 #include <netipsec/ipsec_support.h>
 
@@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp)
        }
 }
 
+static void
+tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
+    const struct sockaddr *sa, void *ctx)
+{
+       struct ip *iph;
+#ifdef INET6
+       struct ip6_hdr *ip6;
+#endif
+       struct udphdr *uh;
+       struct tcphdr *th;
+       int thlen;
+       uint16_t port;
+
+       TCPSTAT_INC(tcps_tunneled_pkts);
+       if ((m->m_flags & M_PKTHDR) == 0) {
+               /* Can't handle one that is not a pkt hdr */
+               TCPSTAT_INC(tcps_tunneled_errs);
+               goto out;
+       }
+       thlen = sizeof(struct tcphdr);
+       if (m->m_len < off + sizeof(struct udphdr) + thlen &&
+           (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
+               TCPSTAT_INC(tcps_tunneled_errs);
+               goto out;
+       }
+       iph = mtod(m, struct ip *);
+       uh = (struct udphdr *)((caddr_t)iph + off);
+       th = (struct tcphdr *)(uh + 1);
+       thlen = th->th_off << 2;
+       if (m->m_len < off + sizeof(struct udphdr) + thlen) {
+               m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
+               if (m == NULL) {
+                       TCPSTAT_INC(tcps_tunneled_errs);
+                       goto out;
+               } else {
+                       iph = mtod(m, struct ip *);
+                       uh = (struct udphdr *)((caddr_t)iph + off);
+                       th = (struct tcphdr *)(uh + 1);
+               }
+       }
+       m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
+       bcopy(th, uh, m->m_len - off);
+       m->m_len -= sizeof(struct udphdr);
+       m->m_pkthdr.len -= sizeof(struct udphdr);
+       /*
+        * We use the same algorithm for
+        * both UDP and TCP for c-sum. So
+        * the code in tcp_input will skip
+        * the checksum. So we do nothing
+        * with the flag (m->m_pkthdr.csum_flags).
+        */
+       switch (iph->ip_v) {
+#ifdef INET
+       case IPVERSION:
+               iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
+               tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
+               break;
+#endif
+#ifdef INET6
+       case IPV6_VERSION >> 4:
+               ip6 = mtod(m, struct ip6_hdr *);
+               ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct 
udphdr));
+               tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
+               break;
+#endif
+       default:
+               goto out;
+               break;
+       }
+       return;
+out:
+       m_freem(m);
+}
+
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
@@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
     NULL, 0, sysctl_net_inet_list_available, "A",
     "list available TCP Function sets");
 
+VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
+
+#ifdef INET
+VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
+#define        V_udp4_tun_socket       VNET(udp4_tun_socket)
+#endif
+#ifdef INET6
+VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
+#define        V_udp6_tun_socket       VNET(udp6_tun_socket)
+#endif
+
+static void
+tcp_over_udp_stop(void)
+{
+       /*
+        * This function assumes sysctl caller holds inp_rinfo_lock()
+        * for writing!
+        */
+#ifdef INET
+       if (V_udp4_tun_socket != NULL) {
+               soclose(V_udp4_tun_socket);
+               V_udp4_tun_socket = NULL;
+       }
+#endif
+#ifdef INET6
+       if (V_udp6_tun_socket != NULL) {
+               soclose(V_udp6_tun_socket);
+               V_udp6_tun_socket = NULL;
+       }
+#endif
+}
+
+static int
+tcp_over_udp_start(void)
+{
+       uint16_t port;
+       int ret;
+#ifdef INET
+       struct sockaddr_in sin;
+#endif
+#ifdef INET6
+       struct sockaddr_in6 sin6;
+#endif
+       /*
+        * This function assumes sysctl caller holds inp_info_rlock()
+        * for writing!
+        */
+       port = V_tcp_udp_tunneling_port;
+       if (ntohs(port) == 0) {
+               /* Must have a port set */
+               return (EINVAL);
+       }
+#ifdef INET
+       if (V_udp4_tun_socket != NULL) {
+               /* Already running -- must stop first */
+               return (EALREADY);
+       }
+#endif
+#ifdef INET6
+       if (V_udp6_tun_socket != NULL) {
+               /* Already running -- must stop first */
+               return (EALREADY);
+       }
+#endif
+#ifdef INET
+       if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
+           SOCK_DGRAM, IPPROTO_UDP,
+           curthread->td_ucred, curthread))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+       /* Call the special UDP hook. */
+       if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
+           tcp_recv_udp_tunneled_packet,
+           tcp_ctlinput_viaudp,
+           NULL))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+       /* Ok, we have a socket, bind it to the port. */
+       memset(&sin, 0, sizeof(struct sockaddr_in));
+       sin.sin_len = sizeof(struct sockaddr_in);
+       sin.sin_family = AF_INET;
+       sin.sin_port = htons(port);
+       if ((ret = sobind(V_udp4_tun_socket,
+           (struct sockaddr *)&sin, curthread))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+#endif
+#ifdef INET6
+       if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
+           SOCK_DGRAM, IPPROTO_UDP,
+           curthread->td_ucred, curthread))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+       /* Call the special UDP hook. */
+       if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
+           tcp_recv_udp_tunneled_packet,
+           tcp6_ctlinput_viaudp,
+           NULL))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+       /* Ok, we have a socket, bind it to the port. */
+       memset(&sin6, 0, sizeof(struct sockaddr_in6));
+       sin6.sin6_len = sizeof(struct sockaddr_in6);
+       sin6.sin6_family = AF_INET6;
+       sin6.sin6_port = htons(port);
+       if ((ret = sobind(V_udp6_tun_socket,
+           (struct sockaddr *)&sin6, curthread))) {
+               tcp_over_udp_stop();
+               return (ret);
+       }
+#endif
+       return (0);
+}
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
+{
+       int error;
+       uint32_t old, new;
+
+       old = V_tcp_udp_tunneling_port;
+       new = old;
+       error = sysctl_handle_int(oidp, &new, 0, req);
+       if ((error == 0) &&
+           (req->newptr != NULL)) {
+               if ((new < TCP_TUNNELING_PORT_MIN) ||
+                   (new > TCP_TUNNELING_PORT_MAX)) {
+                       error = EINVAL;
+               } else {
+                       V_tcp_udp_tunneling_port = new;
+                       if (old != 0) {
+                               tcp_over_udp_stop();
+                       }
+                       if (new != 0) {
+                               error = tcp_over_udp_start();
+                       }
+               }
+       }
+       return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &VNET_NAME(tcp_udp_tunneling_port),
+    0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
+    "Tunneling port for tcp over udp");
+
+VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
+{
+       int error, new;
+
+       new = V_tcp_udp_tunneling_overhead;
+       error = sysctl_handle_int(oidp, &new, 0, req);
+       if (error == 0 && req->newptr) {
+               if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
+                   (new > TCP_TUNNELING_OVERHEAD_MAX))
+                       error = EINVAL;
+               else
+                       V_tcp_udp_tunneling_overhead = new;
+       }
+       return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &VNET_NAME(tcp_udp_tunneling_overhead),
+    0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
+    "MSS reduction when using tcp over udp");
+
 /*
  * Exports one (struct tcp_function_info) for each alias/name.
  */
@@ -1314,7 +1567,7 @@ tcp_fini(void *xtp)
  * of the tcpcb each time to conserve mbufs.
  */
 void
-tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
+tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
 {
        struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
@@ -1329,7 +1582,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
                        (inp->inp_flow & IPV6_FLOWINFO_MASK);
                ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
                        (IPV6_VERSION & IPV6_VERSION_MASK);
-               ip6->ip6_nxt = IPPROTO_TCP;
+               if (port == 0)
+                       ip6->ip6_nxt = IPPROTO_TCP;
+               else
+                       ip6->ip6_nxt = IPPROTO_UDP;
                ip6->ip6_plen = htons(sizeof(struct tcphdr));
                ip6->ip6_src = inp->in6p_laddr;
                ip6->ip6_dst = inp->in6p_faddr;
@@ -1351,7 +1607,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
                ip->ip_off = 0;
                ip->ip_ttl = inp->inp_ip_ttl;
                ip->ip_sum = 0;
-               ip->ip_p = IPPROTO_TCP;
+               if (port == 0)
+                       ip->ip_p = IPPROTO_TCP;
+               else
+                       ip->ip_p = IPPROTO_UDP;
                ip->ip_src = inp->inp_laddr;
                ip->ip_dst = inp->inp_faddr;
        }
@@ -1381,7 +1640,7 @@ tcpip_maketemplate(struct inpcb *inp)
        t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
        if (t == NULL)
                return (NULL);
-       tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
+       tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
        return (t);
 }
 
@@ -1407,14 +1666,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
        struct inpcb *inp;
        struct ip *ip;
        struct mbuf *optm;
+       struct udphdr *uh = NULL;
        struct tcphdr *nth;
        u_char *optp;
 #ifdef INET6
        struct ip6_hdr *ip6;
        int isipv6;
 #endif /* INET6 */
-       int optlen, tlen, win;
+       int optlen, tlen, win, ulen;
        bool incl_opts;
+       uint16_t port;
 
        KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
        NET_EPOCH_ASSERT();
*** 1137 LINES SKIPPED ***
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/dev-commits-src-all
To unsubscribe, send any mail to "[email protected]"

Reply via email to