On Tue, May 09, 2023 at 09:56:36AM +0200, Alexander Bluhm wrote:
> On Sun, May 07, 2023 at 09:00:31PM +0200, Alexander Bluhm wrote:
> > Not sure if I addressed all corner cases already.  I think IPsec
> > is missing.
> 
> Updated diff:
> - parts have been committed
> - works with IPsec now

Thanks for this solution.  Looks much better to me than an IPsec lookup
in tcp_output() as it's done in FreeBSD.

> - some bugs fixed
> - sysctl net.inet.tcp.tso
> - netstat TSO counter
> 
> If you test this, recompile sysctl and netstat with new kernel
> headers.  Then you can see, whether the diff has an effect on your
> setup.
> 
> # netstat -s -p tcp | grep TSO
>                 79 output TSO packets software chopped
>                 0 output TSO packets hardware processed
>                 840 output TSO packets generated
>                 0 output TSO packets dropped

Good idea.

> If you run into problems, disable the feature, and report if the
> problem goes away.  This helps to locate the bug.
> 
> # sysctl net.inet.tcp.tso=0
> net.inet.tcp.tso: 1 -> 0
> 
> I would like to keep the sysctl for now.  It makes performance
> comparison easier.  When we add hardware TSO it can be a quick
> workaround for driver problems.
> 
> When this has been tested a bit, I think it is ready for commit.
> Remaining issues can be handled in tree.  My tests pass, I am not
> aware of TCP problems.

I also did some testing in my setups.  Everything works.

> ok?

Diff looks fine to me, too.

ok jan@

> bluhm
> 
> Index: sys/net/pf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
> retrieving revision 1.1177
> diff -u -p -r1.1177 pf.c
> --- sys/net/pf.c      8 May 2023 13:22:13 -0000       1.1177
> +++ sys/net/pf.c      8 May 2023 22:37:04 -0000
> @@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_
>               goto done;
>       }
>  
> +     if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
> +             if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) ||
> +                 if_output_ml(ifp, &ml, sintosa(dst), rt))
> +                     goto done;
> +             tcpstat_inc(tcps_outswtso);
> +             goto done;
> +     }
> +     CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
> +
>       /*
>        * Too large for interface; fragment if possible.
>        * Must be able to put at least 8 bytes per fragment.
> @@ -6594,6 +6604,7 @@ void
>  pf_route6(struct pf_pdesc *pd, struct pf_state *st)
>  {
>       struct mbuf             *m0;
> +     struct mbuf_list         ml;
>       struct sockaddr_in6     *dst, sin6;
>       struct rtentry          *rt = NULL;
>       struct ip6_hdr          *ip6;
> @@ -6685,11 +6696,21 @@ pf_route6(struct pf_pdesc *pd, struct pf
>               goto done;
>       }
>  
> -     if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
> +     if (m0->m_pkthdr.len <= ifp->if_mtu) {
>               in6_proto_cksum_out(m0, ifp);
>               ifp->if_output(ifp, m0, sin6tosa(dst), rt);
>               goto done;
>       }
> +
> +     if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
> +             if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) ||
> +                 if_output_ml(ifp, &ml, sin6tosa(dst), rt))
> +                     goto done;
> +             tcpstat_inc(tcps_outswtso);
> +             goto done;
> +     }
> +     CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
>  
>       ip6stat_inc(ip6s_cantfrag);
>       if (st->rt != PF_DUPTO)
> Index: sys/netinet/in.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v
> retrieving revision 1.142
> diff -u -p -r1.142 in.h
> --- sys/netinet/in.h  11 Apr 2023 00:45:09 -0000      1.142
> +++ sys/netinet/in.h  8 May 2023 13:47:48 -0000
> @@ -780,6 +780,7 @@ int          in_canforward(struct in_addr);
>  int     in_cksum(struct mbuf *, int);
>  int     in4_cksum(struct mbuf *, u_int8_t, int, int);
>  void    in_proto_cksum_out(struct mbuf *, struct ifnet *);
> +int     in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
>  void    in_ifdetach(struct ifnet *);
>  int     in_mask2len(struct in_addr *);
>  void    in_len2mask(struct in_addr *, int);
> Index: sys/netinet/ip_output.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
> retrieving revision 1.384
> diff -u -p -r1.384 ip_output.c
> --- sys/netinet/ip_output.c   8 May 2023 13:22:13 -0000       1.384
> +++ sys/netinet/ip_output.c   8 May 2023 22:37:04 -0000
> @@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, struct
>  static __inline u_int16_t __attribute__((__unused__))
>      in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t);
>  void in_delayed_cksum(struct mbuf *);
> -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
>  
>  int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
>      struct tdb **, int ipsecflowinfo);
> @@ -468,6 +467,16 @@ sendit:
>               goto done;
>       }
>  
> +     if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m->m_pkthdr.ph_mss <= mtu) {
> +             if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) ||
> +                 (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt)))
> +                     goto done;
> +             tcpstat_inc(tcps_outswtso);
> +             goto done;
> +     }
> +     CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
> +
>       /*
>        * Too large for interface; fragment if possible.
>        * Must be able to put at least 8 bytes per fragment.
> @@ -597,12 +606,12 @@ ip_output_ipsec_pmtu_update(struct tdb *
>  int
>  ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int 
> fwd)
>  {
> -#if NPF > 0
> -     struct ifnet *encif;
> -#endif
> +     struct mbuf_list ml;
> +     struct ifnet *encif = NULL;
>       struct ip *ip;
>       struct in_addr dst;
> -     int error, rtableid;
> +     u_int len;
> +     int error, rtableid, tso = 0;
>  
>  #if NPF > 0
>       /*
> @@ -622,16 +631,22 @@ ip_output_ipsec_send(struct tdb *tdb, st
>        * Until now the change was not reconsidered.
>        * What's the behaviour?
>        */
> -     in_proto_cksum_out(m, encif);
>  #endif
>  
> -     /* Check if we are allowed to fragment */
> +     /* Check if we can chop the TCP packet */
>       ip = mtod(m, struct ip *);
> +     if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m->m_pkthdr.ph_mss <= tdb->tdb_mtu) {
> +             tso = 1;
> +             len = m->m_pkthdr.ph_mss;
> +     } else
> +             len = ntohs(ip->ip_len);
> +
> +     /* Check if we are allowed to fragment */
>       dst = ip->ip_dst;
>       rtableid = m->m_pkthdr.ph_rtableid;
>       if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu &&
> -         ntohs(ip->ip_len) > tdb->tdb_mtu &&
> -         tdb->tdb_mtutimeout > gettime()) {
> +         len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) {
>               int transportmode;
>  
>               transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) &&
> @@ -652,14 +667,33 @@ ip_output_ipsec_send(struct tdb *tdb, st
>        */
>       m->m_flags &= ~(M_MCAST | M_BCAST);
>  
> -     /* Callee frees mbuf */
> +     if (tso) {
> +             error = tcp_chopper(m, &ml, encif, len);
> +             if (error)
> +                     goto done;
> +     } else {
> +             CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
> +             in_proto_cksum_out(m, encif);
> +             ml_init(&ml);
> +             ml_enqueue(&ml, m);
> +     }
> +
>       KERNEL_LOCK();
> -     error = ipsp_process_packet(m, tdb, AF_INET, 0);
> +     while ((m = ml_dequeue(&ml)) != NULL) {
> +             /* Callee frees mbuf */
> +             error = ipsp_process_packet(m, tdb, AF_INET, 0);
> +             if (error)
> +                     break;
> +     }
>       KERNEL_UNLOCK();
> + done:
>       if (error) {
> +             ml_purge(&ml);
>               ipsecstat_inc(ipsec_odrops);
>               tdbstat_inc(tdb, tdb_odrops);
>       }
> +     if (!error && tso)
> +             tcpstat_inc(tcps_outswtso);
>       if (ip_mtudisc && error == EMSGSIZE)
>               ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0);
>       return error;
> Index: sys/netinet/tcp_output.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> retrieving revision 1.135
> diff -u -p -r1.135 tcp_output.c
> --- sys/netinet/tcp_output.c  25 Apr 2023 22:56:28 -0000      1.135
> +++ sys/netinet/tcp_output.c  8 May 2023 22:37:04 -0000
> @@ -210,6 +210,7 @@ tcp_output(struct tcpcb *tp)
>  #ifdef TCP_ECN
>       int needect;
>  #endif
> +     int tso;
>  
>       if (tp->t_flags & TF_BLOCKOUTPUT) {
>               tp->t_flags |= TF_NEEDOUTPUT;
> @@ -279,6 +280,7 @@ again:
>       }
>  
>       sendalot = 0;
> +     tso = 0;
>       /*
>        * If in persist timeout with window of 0, send 1 byte.
>        * Otherwise, if window is small but nonzero
> @@ -346,8 +348,25 @@ again:
>       txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
>  
>       if (len > txmaxseg) {
> -             len = txmaxseg;
> -             sendalot = 1;
> +             if (tcp_do_tso &&
> +                 tp->t_inpcb->inp_options == NULL &&
> +                 tp->t_inpcb->inp_outputopts6 == NULL &&
> +#ifdef TCP_SIGNATURE
> +                 ((tp->t_flags & TF_SIGNATURE) == 0) &&
> +#endif
> +                 len >= 2 * tp->t_maxseg &&
> +                 tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
> +                 !(flags & (TH_SYN|TH_RST|TH_FIN))) {
> +                     tso = 1;
> +                     /* avoid small chopped packets */
> +                     if (len > (len / tp->t_maxseg) * tp->t_maxseg) {
> +                             len = (len / tp->t_maxseg) * tp->t_maxseg;
> +                             sendalot = 1;
> +                     }
> +             } else {
> +                     len = txmaxseg;
> +                     sendalot = 1;
> +             }
>       }
>       if (off + len < so->so_snd.sb_cc)
>               flags &= ~TH_FIN;
> @@ -365,7 +384,7 @@ again:
>        * to send into a small window), then must resend.
>        */
>       if (len) {
> -             if (len == txmaxseg)
> +             if (len >= txmaxseg)
>                       goto send;
>               if ((idle || (tp->t_flags & TF_NODELAY)) &&
>                   len + off >= so->so_snd.sb_cc && !soissending(so) &&
> @@ -616,10 +635,19 @@ send:
>       /*
>        * Adjust data length if insertion of options will
>        * bump the packet length beyond the t_maxopd length.
> +      * Clear the FIN bit because we cut off the tail of
> +      * the segment.
>        */
>       if (len > tp->t_maxopd - optlen) {
> -             len = tp->t_maxopd - optlen;
> -             sendalot = 1;
> +             if (tso) {
> +                     if (len + hdrlen + max_linkhdr > MAXMCLBYTES) {
> +                             len = MAXMCLBYTES - hdrlen - max_linkhdr;
> +                             sendalot = 1;
> +                     }
> +             } else {
> +                     len = tp->t_maxopd - optlen;
> +                     sendalot = 1;
> +             }
>               flags &= ~TH_FIN;
>       }
>  
> @@ -723,6 +751,12 @@ send:
>       m->m_pkthdr.ph_ifidx = 0;
>       m->m_pkthdr.len = hdrlen + len;
>  
> +     /* Enable TSO and specify the size of the resulting segments. */
> +     if (tso) {
> +             m->m_pkthdr.csum_flags |= M_TCP_TSO;
> +             m->m_pkthdr.ph_mss = tp->t_maxseg;
> +     }
> +
>       if (!tp->t_template)
>               panic("tcp_output");
>  #ifdef DIAGNOSTIC
> @@ -1152,4 +1186,177 @@ tcp_setpersist(struct tcpcb *tp)
>       TCP_TIMER_ARM(tp, TCPT_PERSIST, msec);
>       if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
>               tp->t_rxtshift++;
> +}
> +
> +int
> +tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp,
> +    u_int mss)
> +{
> +     struct ip *ip = NULL;
> +#ifdef INET6
> +     struct ip6_hdr *ip6 = NULL;
> +#endif
> +     struct tcphdr *th;
> +     int firstlen, iphlen, hlen, tlen, off;
> +     int error;
> +
> +     ml_init(ml);
> +     ml_enqueue(ml, m0);
> +
> +     ip = mtod(m0, struct ip *);
> +     switch (ip->ip_v) {
> +     case 4:
> +             iphlen = ip->ip_hl << 2;
> +             if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) ||
> +                 iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) {
> +                     /* only TCP without fragment or IP option supported */
> +                     error = EPROTOTYPE;
> +                     goto bad;
> +             }
> +             break;
> +#ifdef INET6
> +     case 6:
> +             ip = NULL;
> +             ip6 = mtod(m0, struct ip6_hdr *);
> +             iphlen = sizeof(struct ip6_hdr);
> +             if (ip6->ip6_nxt != IPPROTO_TCP) {
> +                     /* only TCP without IPv6 header chain supported */
> +                     error = EPROTOTYPE;
> +                     goto bad;
> +             }
> +             break;
> +#endif
> +     default:
> +             panic("%s: unknown ip version %d", __func__, ip->ip_v);
> +     }
> +
> +     tlen = m0->m_pkthdr.len;
> +     if (tlen < iphlen + sizeof(struct tcphdr)) {
> +             error = ENOPROTOOPT;
> +             goto bad;
> +     }
> +     /* IP and TCP header should be contiguous, this check is paranoia */
> +     if (m0->m_len < iphlen + sizeof(*th)) {
> +             ml_dequeue(ml);
> +             if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) {
> +                     error = ENOBUFS;
> +                     goto bad;
> +             }
> +             ml_enqueue(ml, m0);
> +     }
> +     th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen);
> +     hlen = iphlen + (th->th_off << 2);
> +     if (tlen < hlen) {
> +             error = ENOPROTOOPT;
> +             goto bad;
> +     }
> +     firstlen = MIN(tlen - hlen, mss);
> +
> +     CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
> +
> +     /*
> +      * Loop through length of payload after first segment,
> +      * make new header and copy data of each part and link onto chain.
> +      */
> +     for (off = hlen + firstlen; off < tlen; off += mss) {
> +             struct mbuf *m;
> +             struct tcphdr *mhth;
> +             int len;
> +
> +             len = MIN(tlen - off, mss);
> +
> +             MGETHDR(m, M_DONTWAIT, MT_HEADER);
> +             if (m == NULL) {
> +                     error = ENOBUFS;
> +                     goto bad;
> +             }
> +             ml_enqueue(ml, m);
> +             if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
> +                     goto bad;
> +
> +             /* IP and TCP header to the end, space for link layer header */
> +             m->m_len = hlen;
> +             m_align(m, hlen);
> +
> +             /* copy and adjust TCP header */
> +             mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
> +             memcpy(mhth, th, hlen - iphlen);
> +             mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen));
> +             if (off + len < tlen)
> +                     CLR(mhth->th_flags, TH_PUSH|TH_FIN);
> +
> +             /* add mbuf chain with payload */
> +             m->m_pkthdr.len = hlen + len;
> +             if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
> +                     error = ENOBUFS;
> +                     goto bad;
> +             }
> +
> +             /* copy and adjust IP header, calculate checksum */
> +             SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> +             mhth->th_sum = 0;
> +             if (ip) {
> +                     struct ip *mhip;
> +
> +                     mhip = mtod(m, struct ip *);
> +                     *mhip = *ip;
> +                     mhip->ip_len = htons(hlen + len);
> +                     mhip->ip_id = htons(ip_randomid());
> +                     mhip->ip_sum = 0;
> +                     if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
> +                             m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
> +                     } else {
> +                             ipstat_inc(ips_outswcsum);
> +                             mhip->ip_sum = in_cksum(m, iphlen);
> +                     }
> +                     in_proto_cksum_out(m, ifp);
> +             }
> +#ifdef INET6
> +             if (ip6) {
> +                     struct ip6_hdr *mhip6;
> +
> +                     mhip6 = mtod(m, struct ip6_hdr *);
> +                     *mhip6 = *ip6;
> +                     mhip6->ip6_plen = htons(hlen - iphlen + len);
> +                     in6_proto_cksum_out(m, ifp);
> +             }
> +#endif
> +     }
> +
> +     /*
> +      * Update first segment by trimming what's been copied out
> +      * and updating header, then send each segment (in order).
> +      */
> +     if (hlen + firstlen < tlen) {
> +             m_adj(m0, hlen + firstlen - tlen);
> +             CLR(th->th_flags, TH_PUSH|TH_FIN);
> +     }
> +     /* adjust IP header, calculate checksum */
> +     SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> +     th->th_sum = 0;
> +     if (ip) {
> +             ip->ip_len = htons(m0->m_pkthdr.len);
> +             ip->ip_sum = 0;
> +             if (ifp && in_ifcap_cksum(m0, ifp, IFCAP_CSUM_IPv4)) {
> +                     m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
> +             } else {
> +                     ipstat_inc(ips_outswcsum);
> +                     ip->ip_sum = in_cksum(m0, iphlen);
> +             }
> +             in_proto_cksum_out(m0, ifp);
> +     }
> +#ifdef INET6
> +     if (ip6) {
> +             ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen);
> +             in6_proto_cksum_out(m0, ifp);
> +     }
> +#endif
> +
> +     tcpstat_add(tcps_outpkttso, ml_len(ml));
> +     return 0;
> +
> + bad:
> +     tcpstat_inc(tcps_outbadtso);
> +     ml_purge(ml);
> +     return error;
>  }
> Index: sys/netinet/tcp_subr.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
> retrieving revision 1.190
> diff -u -p -r1.190 tcp_subr.c
> --- sys/netinet/tcp_subr.c    7 Nov 2022 11:22:55 -0000       1.190
> +++ sys/netinet/tcp_subr.c    8 May 2023 22:37:04 -0000
> @@ -119,6 +119,7 @@ int       tcp_ack_on_push = 0;    /* set to enabl
>  int  tcp_do_ecn = 0;         /* RFC3168 ECN enabled/disabled? */
>  #endif
>  int  tcp_do_rfc3390 = 2;     /* Increase TCP's Initial Window to 10*mss */
> +int  tcp_do_tso = 1;         /* TCP segmentation offload for output */
>  
>  #ifndef TCB_INITIAL_HASH_SIZE
>  #define      TCB_INITIAL_HASH_SIZE   128
> Index: sys/netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> retrieving revision 1.217
> diff -u -p -r1.217 tcp_usrreq.c
> --- sys/netinet/tcp_usrreq.c  14 Mar 2023 00:24:05 -0000      1.217
> +++ sys/netinet/tcp_usrreq.c  8 May 2023 22:37:04 -0000
> @@ -166,6 +166,7 @@ const struct sysctl_bounded_args tcpctl_
>       { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
>       { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
>       { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
> +     { TCPCTL_TSO, &tcp_do_tso, 0, 1 },
>  };
>  
>  struct       inpcbtable tcbtable;
> @@ -1335,6 +1336,10 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
>       ASSIGN(tcps_sack_rcv_opts);
>       ASSIGN(tcps_sack_snd_opts);
>       ASSIGN(tcps_sack_drop_opts);
> +     ASSIGN(tcps_outswtso);
> +     ASSIGN(tcps_outhwtso);
> +     ASSIGN(tcps_outpkttso);
> +     ASSIGN(tcps_outbadtso);
>  
>  #undef ASSIGN
>  
> @@ -1494,8 +1499,8 @@ tcp_sysctl(int *name, u_int namelen, voi
>  
>       default:
>               NET_LOCK();
> -             error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), 
> name,
> -                  namelen, oldp, oldlenp, newp, newlen);
> +             error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
> +                 name, namelen, oldp, oldlenp, newp, newlen);
>               NET_UNLOCK();
>               return (error);
>       }
> Index: sys/netinet/tcp_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> retrieving revision 1.163
> diff -u -p -r1.163 tcp_var.h
> --- sys/netinet/tcp_var.h     14 Mar 2023 00:24:05 -0000      1.163
> +++ sys/netinet/tcp_var.h     8 May 2023 22:37:04 -0000
> @@ -442,6 +442,11 @@ struct   tcpstat {
>       u_int64_t tcps_sack_rcv_opts;           /* SACK options received */
>       u_int64_t tcps_sack_snd_opts;           /* SACK options sent */
>       u_int64_t tcps_sack_drop_opts;          /* SACK options dropped */
> +
> +     u_int32_t tcps_outswtso;        /* output tso chopped in software */
> +     u_int32_t tcps_outhwtso;        /* output tso processed by hardware */
> +     u_int32_t tcps_outpkttso;       /* packets generated by tso */
> +     u_int32_t tcps_outbadtso;       /* output tso failed, packet dropped */
>  };
>  
>  /*
> @@ -473,7 +478,8 @@ struct    tcpstat {
>  #define      TCPCTL_SYN_USE_LIMIT   23 /* number of uses before reseeding 
> hash */
>  #define TCPCTL_ROOTONLY             24 /* return root only port bitmap */
>  #define      TCPCTL_SYN_HASH_SIZE   25 /* number of buckets in the hash */
> -#define      TCPCTL_MAXID           26
> +#define      TCPCTL_TSO             26 /* enable TCP segmentation offload */
> +#define      TCPCTL_MAXID           27
>  
>  #define      TCPCTL_NAMES { \
>       { 0, 0 }, \
> @@ -500,8 +506,9 @@ struct    tcpstat {
>       { "stats",      CTLTYPE_STRUCT }, \
>       { "always_keepalive",   CTLTYPE_INT }, \
>       { "synuselimit",        CTLTYPE_INT }, \
> -     { "rootonly", CTLTYPE_STRUCT }, \
> +     { "rootonly",   CTLTYPE_STRUCT }, \
>       { "synhashsize",        CTLTYPE_INT }, \
> +     { "tso",        CTLTYPE_INT }, \
>  }
>  
>  struct tcp_ident_mapping {
> @@ -614,6 +621,10 @@ enum tcpstat_counters {
>       tcps_sack_rcv_opts,
>       tcps_sack_snd_opts,
>       tcps_sack_drop_opts,
> +     tcps_outswtso,
> +     tcps_outhwtso,
> +     tcps_outpkttso,
> +     tcps_outbadtso,
>       tcps_ncounters,
>  };
>  
> @@ -665,6 +676,7 @@ extern    struct pool sackhl_pool;
>  extern       int tcp_sackhole_limit; /* max entries for tcp sack queues */
>  extern       int tcp_do_ecn;         /* RFC3168 ECN enabled/disabled? */
>  extern       int tcp_do_rfc3390;     /* RFC3390 Increasing TCP's Initial 
> Window */
> +extern       int tcp_do_tso;         /* enable TSO for TCP output packets */
>  
>  extern       struct pool tcpqe_pool;
>  extern       int tcp_reass_limit;    /* max entries for tcp reass queues */
> @@ -706,6 +718,7 @@ struct tcpcb *
>        tcp_newtcpcb(struct inpcb *, int);
>  void  tcp_notify(struct inpcb *, int);
>  int   tcp_output(struct tcpcb *);
> +int   tcp_chopper(struct mbuf *, struct mbuf_list *, struct ifnet *, u_int);
>  void  tcp_pulloutofband(struct socket *, u_int, struct mbuf *, int);
>  int   tcp_reass(struct tcpcb *, struct tcphdr *, struct mbuf *, int *);
>  void  tcp_rscale(struct tcpcb *, u_long);
> Index: sys/netinet6/ip6_output.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v
> retrieving revision 1.274
> diff -u -p -r1.274 ip6_output.c
> --- sys/netinet6/ip6_output.c 8 May 2023 13:22:13 -0000       1.274
> +++ sys/netinet6/ip6_output.c 8 May 2023 22:37:04 -0000
> @@ -686,7 +686,9 @@ reroute:
>               dontfrag = 1;
>       else
>               dontfrag = 0;
> -     if (dontfrag && tlen > ifp->if_mtu) {   /* case 2-b */
> +     if (dontfrag &&                                 /* case 2-b */
> +         (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ?
> +         m->m_pkthdr.csum_flags : tlen) > ifp->if_mtu) {
>  #ifdef IPSEC
>               if (ip_mtudisc)
>                       ipsec_adjust_mtu(m, mtu);
> @@ -698,12 +700,22 @@ reroute:
>       /*
>        * transmit packet without fragmentation
>        */
> -     if (dontfrag || (tlen <= mtu)) {        /* case 1-a and 2-a */
> +     if (dontfrag || tlen <= mtu) {                  /* case 1-a and 2-a */
>               in6_proto_cksum_out(m, ifp);
>               error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt);
>               goto done;
>       }
>  
> +     if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m->m_pkthdr.ph_mss <= mtu) {
> +             if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) ||
> +                 (error = if_output_ml(ifp, &ml, sin6tosa(dst), ro->ro_rt)))
> +                     goto done;
> +             tcpstat_inc(tcps_outswtso);
> +             goto done;
> +     }
> +     CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
> +
>       /*
>        * try to fragment the packet.  case 1-b
>        */
> @@ -2829,12 +2841,12 @@ int
>  ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro,
>      int tunalready, int fwd)
>  {
> -#if NPF > 0
> -     struct ifnet *encif;
> -#endif
> +     struct mbuf_list ml;
> +     struct ifnet *encif = NULL;
>       struct ip6_hdr *ip6;
>       struct in6_addr dst;
> -     int error, ifidx, rtableid;
> +     u_int len;
> +     int error, ifidx, rtableid, tso = 0;
>  
>  #if NPF > 0
>       /*
> @@ -2854,17 +2866,23 @@ ip6_output_ipsec_send(struct tdb *tdb, s
>        * Until now the change was not reconsidered.
>        * What's the behaviour?
>        */
> -     in6_proto_cksum_out(m, encif);
>  #endif
>  
> -     /* Check if we are allowed to fragment */
> +     /* Check if we can chop the TCP packet */
>       ip6 = mtod(m, struct ip6_hdr *);
> +     if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
> +         m->m_pkthdr.ph_mss <= tdb->tdb_mtu) {
> +             tso = 1;
> +             len = m->m_pkthdr.ph_mss;
> +     } else
> +             len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
> +
> +     /* Check if we are allowed to fragment */
>       dst = ip6->ip6_dst;
>       ifidx = m->m_pkthdr.ph_ifidx;
>       rtableid = m->m_pkthdr.ph_rtableid;
>       if (ip_mtudisc && tdb->tdb_mtu &&
> -         sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu &&
> -         tdb->tdb_mtutimeout > gettime()) {
> +         len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) {
>               int transportmode;
>  
>               transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) &&
> @@ -2891,14 +2909,33 @@ ip6_output_ipsec_send(struct tdb *tdb, s
>        */
>       m->m_flags &= ~(M_BCAST | M_MCAST);
>  
> -     /* Callee frees mbuf */
> +     if (tso) {
> +             error = tcp_chopper(m, &ml, encif, len);
> +             if (error)
> +                     goto done;
> +     } else {
> +             CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
> +             in6_proto_cksum_out(m, encif);
> +             ml_init(&ml);
> +             ml_enqueue(&ml, m);
> +     }
> +
>       KERNEL_LOCK();
> -     error = ipsp_process_packet(m, tdb, AF_INET6, tunalready);
> +     while ((m = ml_dequeue(&ml)) != NULL) {
> +             /* Callee frees mbuf */
> +             error = ipsp_process_packet(m, tdb, AF_INET6, tunalready);
> +             if (error)
> +                     break;
> +     }
>       KERNEL_UNLOCK();
> + done:
>       if (error) {
> +             ml_purge(&ml);
>               ipsecstat_inc(ipsec_odrops);
>               tdbstat_inc(tdb, tdb_odrops);
>       }
> +     if (!error && tso)
> +             tcpstat_inc(tcps_outswtso);
>       if (ip_mtudisc && error == EMSGSIZE)
>               ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0);
>       return error;
> Index: sys/sys/mbuf.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> retrieving revision 1.256
> diff -u -p -r1.256 mbuf.h
> --- sys/sys/mbuf.h    5 May 2023 01:19:51 -0000       1.256
> +++ sys/sys/mbuf.h    8 May 2023 13:47:48 -0000
> @@ -129,12 +129,13 @@ struct  pkthdr {
>       SLIST_HEAD(, m_tag)      ph_tags;       /* list of packet tags */
>       int64_t                  ph_timestamp;  /* packet timestamp */
>       int                      len;           /* total packet length */
> +     u_int                    ph_rtableid;   /* routing table id */
> +     u_int                    ph_ifidx;      /* rcv interface index */
>       u_int16_t                ph_tagsset;    /* mtags attached */
>       u_int16_t                ph_flowid;     /* pseudo unique flow id */
>       u_int16_t                csum_flags;    /* checksum flags */
>       u_int16_t                ether_vtag;    /* Ethernet 802.1p+Q vlan tag */
> -     u_int                    ph_rtableid;   /* routing table id */
> -     u_int                    ph_ifidx;      /* rcv interface index */
> +     u_int16_t                ph_mss;        /* TCP max segment size */
>       u_int8_t                 ph_loopcnt;    /* mbuf is looping in kernel */
>       u_int8_t                 ph_family;     /* af, used when queueing */
>       struct pkthdr_pf         pf;
> @@ -226,6 +227,7 @@ struct mbuf {
>  #define      M_IPV6_DF_OUT           0x1000  /* don't fragment outgoing IPv6 
> */
>  #define      M_TIMESTAMP             0x2000  /* ph_timestamp is set */
>  #define      M_FLOWID                0x4000  /* ph_flowid is set */
> +#define      M_TCP_TSO               0x8000  /* TCP Segmentation Offload 
> needed */
>  
>  #ifdef _KERNEL
>  #define MCS_BITS \
> Index: usr.bin/netstat/inet.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
> retrieving revision 1.174
> diff -u -p -r1.174 inet.c
> --- usr.bin/netstat/inet.c    12 Aug 2022 14:49:15 -0000      1.174
> +++ usr.bin/netstat/inet.c    8 May 2023 14:01:00 -0000
> @@ -408,6 +408,10 @@ tcp_stats(char *name)
>       p(tcps_sndwinup, "\t\t%u window update packet%s\n");
>       p(tcps_sndctrl, "\t\t%u control packet%s\n");
>       p(tcps_outswcsum, "\t\t%u packet%s software-checksummed\n");
> +     p(tcps_outswtso, "\t\t%u output TSO packet%s software chopped\n");
> +     p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n");
> +     p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n");
> +     p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n");
>       p(tcps_rcvtotal, "\t%u packet%s received\n");
>       p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu 
> byte%s)\n");
>       p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n");
> 

Reply via email to