Hi, There were some problems with ix(4) and ixl(4) hardware checksumming for the output path on strict alignment architectures.
I have merged jan@'s diffs and added some sanity checks and workarounds. - If the first mbuf is not aligned or not contiguous, use m_copydata() to extract the IP, IPv6, or TCP header. - If the header is in the first mbuf, use m_data for the fast path. - Add netstat counter for invalid header chains. This makes us aware when hardware checksumming fails. - Add netstat counter for header copies. This indicates that better storage allocation in the network stack is possible. It also makes it possible to recognize alignment problems on non-strict architectures. - There is no risk of crashes on sparc64. Does this approach make sense? ix(4) works quite well, but finds some UDP packets that need a copy. ixl(4) has not been tested yet. I would like to have some feedback for the idea first. bluhm Index: sys/dev/pci/if_ix.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.180 diff -u -p -r1.180 if_ix.c --- sys/dev/pci/if_ix.c 27 Jul 2021 01:44:55 -0000 1.180 +++ sys/dev/pci/if_ix.c 25 Jan 2022 23:48:53 -0000 @@ -1878,8 +1878,8 @@ ixgbe_setup_interface(struct ix_softc *s #if NVLAN > 0 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; #endif - ifp->if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4; + ifp->if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; /* * Specify the media types supported by this sc and register @@ -2437,12 +2437,6 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, #else struct ether_header *eh; #endif - struct ip *ip; -#ifdef notyet - struct ip6_hdr *ip6; -#endif - struct mbuf *m; - int ipoff; uint32_t vlan_macip_lens = 0, type_tucmd_mlhl = 0; int ehdrlen, ip_hlen = 0; uint16_t etype; @@ -2511,29 +2505,46 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT; switch (etype) { - case ETHERTYPE_IP: - if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip)) + case ETHERTYPE_IP: { + struct ip *ip, ipdata; + + if (mp->m_pkthdr.len < ehdrlen + 
sizeof(*ip)) { + ipstat_inc(ips_outbadcsum); return (-1); - m = m_getptr(mp, ehdrlen, &ipoff); - KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip)); - ip = (struct ip *)(m->m_data + ipoff); + } + if (((mtod(mp, unsigned long) + ehdrlen) & ALIGNBYTES) == 0 && + mp->m_len >= ehdrlen + sizeof(*ip)) { + ip = (struct ip *)(mp->m_data + ehdrlen); + } else { + ipstat_inc(ips_outcpycsum); + m_copydata(mp, ehdrlen, sizeof(ipdata), &ipdata); + ip = &ipdata; + } ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; break; -#ifdef notyet - case ETHERTYPE_IPV6: - if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip6)) + } + case ETHERTYPE_IPV6: { + struct ip6_hdr *ip6, ip6data; + + if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip6)) { + ip6stat_inc(ip6s_outbadcsum); return (-1); - m = m_getptr(mp, ehdrlen, &ipoff); - KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip6)); - ip6 = (struct ip6 *)(m->m_data + ipoff); + } + if (((mtod(mp, unsigned long) + ehdrlen) & ALIGNBYTES) == 0 && + mp->m_len >= ehdrlen + sizeof(*ip6)) { + ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); + } else { + ip6stat_inc(ip6s_outcpycsum); + m_copydata(mp, ehdrlen, sizeof(ip6data), &ip6data); + ip6 = &ip6data; + } ip_hlen = sizeof(*ip6); - /* XXX-BZ this will go badly in case of ext hdrs. 
*/ ipproto = ip6->ip6_nxt; type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; -#endif + } default: offload = FALSE; break; @@ -2552,6 +2563,10 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; break; default: + if (mp->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) + tcpstat_inc(tcps_outbadcsum); + if (mp->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) + udpstat_inc(udps_outbadcsum); offload = FALSE; break; } Index: sys/dev/pci/if_ixl.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.78 diff -u -p -r1.78 if_ixl.c --- sys/dev/pci/if_ixl.c 9 Jan 2022 05:42:54 -0000 1.78 +++ sys/dev/pci/if_ixl.c 25 Jan 2022 23:50:01 -0000 @@ -71,6 +71,7 @@ #include <net/if.h> #include <net/if_dl.h> #include <net/if_media.h> +#include <net/route.h> #include <net/toeplitz.h> #if NBPFILTER > 0 @@ -82,6 +83,15 @@ #endif #include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> #include <netinet/if_ether.h> #include <dev/pci/pcireg.h> @@ -1388,6 +1398,7 @@ static int ixl_rxeof(struct ixl_softc *, static void ixl_rxfill(struct ixl_softc *, struct ixl_rx_ring *); static void ixl_rxrefill(void *); static int ixl_rxrinfo(struct ixl_softc *, struct if_rxrinfo *); +static void ixl_rx_checksum(struct mbuf *, uint64_t); #if NKSTAT > 0 static void ixl_kstat_attach(struct ixl_softc *); @@ -1942,9 +1953,10 @@ ixl_attach(struct device *parent, struct ifp->if_capabilities = IFCAP_VLAN_MTU; #if 0 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; - ifp->if_capabilities |= IFCAP_CSUM_IPv4 | IFCAP_CSUM_TCPv4 | - IFCAP_CSUM_UDPv4; #endif + ifp->if_capabilities |= IFCAP_CSUM_IPv4; + ifp->if_capabilities |= IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4; + ifp->if_capabilities |= 
IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status); @@ -2771,6 +2783,119 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm BUS_DMA_STREAMING | BUS_DMA_NOWAIT)); } +static int +ixl_tx_setup_offload(struct mbuf *mp, uint64_t *cmd) +{ + uint64_t ip_hdr_len; + uint8_t ipproto; + + switch (ntohs(mtod(mp, struct ether_header *)->ether_type)) { + case ETHERTYPE_IP: { + struct ip *ip, ipdata; + + if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip)) { + ipstat_inc(ips_outbadcsum); + return (-1); + } + if (((mtod(mp, unsigned long) + ETHER_HDR_LEN) & ALIGNBYTES) + == 0 && mp->m_len >= ETHER_HDR_LEN + sizeof(*ip)) { + ip = (struct ip *)(mp->m_data + ETHER_HDR_LEN); + } else { + ipstat_inc(ips_outcpycsum); + m_copydata(mp, ETHER_HDR_LEN, sizeof(ipdata), &ipdata); + ip = &ipdata; + } + if (mp->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT) + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4_CSUM; + else + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV4; + + ip_hdr_len = ip->ip_hl << 2; + ipproto = ip->ip_p; + break; + } +#ifdef INET6 + case ETHERTYPE_IPV6: { + struct ip6_hdr *ip6, ip6data; + + if (mp->m_pkthdr.len < ETHER_HDR_LEN + sizeof(*ip6)) { + ip6stat_inc(ip6s_outbadcsum); + return (-1); + } + if (((mtod(mp, unsigned long) + ETHER_HDR_LEN) & ALIGNBYTES) + == 0 && mp->m_len >= ETHER_HDR_LEN + sizeof(*ip6)) { + ip6 = (struct ip6_hdr *)(mp->m_data + ETHER_HDR_LEN); + } else { + ip6stat_inc(ip6s_outcpycsum); + m_copydata(mp, ETHER_HDR_LEN, sizeof(ip6data), + &ip6data); + ip6 = &ip6data; + } + + *cmd |= IXL_TX_DESC_CMD_IIPT_IPV6; + + ip_hdr_len = sizeof(*ip6); + ipproto = ip6->ip6_nxt; + break; + } +#endif + default: + return (-1); + } + + *cmd |= (ETHER_HDR_LEN >> 1) << IXL_TX_DESC_MACLEN_SHIFT; + *cmd |= (ip_hdr_len >> 2) << IXL_TX_DESC_IPLEN_SHIFT; + + switch (ipproto) { + case IPPROTO_TCP: + if (mp->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { + struct tcphdr *th, thdata; + + if (mp->m_pkthdr.len < ETHER_HDR_LEN + ip_hdr_len + + sizeof(*th)) { + 
tcpstat_inc(tcps_outbadcsum); + return (-1); + } + if (((mtod(mp, unsigned long) + ETHER_HDR_LEN + + ip_hdr_len) & ALIGNBYTES) == 0 && + mp->m_len >= ETHER_HDR_LEN + ip_hdr_len + + sizeof(*th)) { + th = (struct tcphdr *)(mp->m_data + + ETHER_HDR_LEN + ip_hdr_len); + } else { + tcpstat_inc(tcps_outcpycsum); + m_copydata(mp, ETHER_HDR_LEN + ip_hdr_len, + sizeof(thdata), &thdata); + th = &thdata; + } + *cmd |= IXL_TX_DESC_CMD_L4T_EOFT_TCP; + *cmd |= (uint64_t)th->th_off << + IXL_TX_DESC_L4LEN_SHIFT; + } + break; + case IPPROTO_UDP: + if (mp->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { + if (mp->m_pkthdr.len < ETHER_HDR_LEN + ip_hdr_len + + sizeof(struct udphdr)) { + udpstat_inc(udps_outbadcsum); + return (-1); + } + *cmd |= IXL_TX_DESC_CMD_L4T_EOFT_UDP; + *cmd |= (sizeof(struct udphdr) >> 2) << + IXL_TX_DESC_L4LEN_SHIFT; + } + break; + default: + if (mp->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) + tcpstat_inc(tcps_outbadcsum); + if (mp->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) + udpstat_inc(udps_outbadcsum); + return (-1); + } + + return (0); +} + static void ixl_start(struct ifqueue *ifq) { @@ -2781,7 +2906,7 @@ ixl_start(struct ifqueue *ifq) struct ixl_tx_map *txm; bus_dmamap_t map; struct mbuf *m; - uint64_t cmd; + uint64_t cmd, off = 0; unsigned int prod, free, last, i; unsigned int mask; int post = 0; @@ -2828,12 +2953,15 @@ ixl_start(struct ifqueue *ifq) bus_dmamap_sync(sc->sc_dmat, map, 0, map->dm_mapsize, BUS_DMASYNC_PREWRITE); + ixl_tx_setup_offload(m, &off); + for (i = 0; i < map->dm_nsegs; i++) { txd = &ring[prod]; cmd = (uint64_t)map->dm_segs[i].ds_len << IXL_TX_DESC_BSIZE_SHIFT; cmd |= IXL_TX_DESC_DTYPE_DATA | IXL_TX_DESC_CMD_ICRC; + cmd |= off; htolem64(&txd->addr, map->dm_segs[i].ds_addr); htolem64(&txd->cmd, cmd); @@ -3190,6 +3318,7 @@ ixl_rxeof(struct ixl_softc *sc, struct i m->m_pkthdr.csum_flags |= M_FLOWID; } + ixl_rx_checksum(m, word); ml_enqueue(&ml, m); } else { ifp->if_ierrors++; /* XXX */ @@ -3320,6 +3449,23 @@ ixl_rxrinfo(struct ixl_softc *sc, struct 
free(ifr, M_TEMP, ixl_nqueues(sc) * sizeof(*ifr)); return (rv); +} + +static void +ixl_rx_checksum(struct mbuf *m, uint64_t word) +{ + if (!ISSET(word, IXL_RX_DESC_L3L4P)) + return; + + if (ISSET(word, IXL_RX_DESC_IPE)) + return; + + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK; + + if (ISSET(word, IXL_RX_DESC_L4E)) + return; + + m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK; } static int Index: sys/dev/pci/ixgbe.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/ixgbe.h,v retrieving revision 1.32 diff -u -p -r1.32 ixgbe.h --- sys/dev/pci/ixgbe.h 18 Jul 2020 07:18:22 -0000 1.32 +++ sys/dev/pci/ixgbe.h 25 Jan 2022 22:25:56 -0000 @@ -60,11 +60,20 @@ #include <net/if.h> #include <net/if_media.h> +#include <net/route.h> #include <net/toeplitz.h> #include <netinet/in.h> #include <netinet/if_ether.h> #include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/ip_var.h> +#include <netinet6/ip6_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> #if NBPFILTER > 0 #include <net/bpf.h> Index: sys/netinet/ip_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v retrieving revision 1.88 diff -u -p -r1.88 ip_var.h --- sys/netinet/ip_var.h 30 Mar 2021 08:37:11 -0000 1.88 +++ sys/netinet/ip_var.h 25 Jan 2022 18:07:02 -0000 @@ -88,6 +88,8 @@ struct ipstat { u_long ips_outswcsum; /* software checksummed on output */ u_long ips_notmember; /* multicasts for unregistered groups */ u_long ips_wrongif; /* packet received on wrong interface */ + u_long ips_outbadcsum; /* output hardware checksum failed */ + u_long ips_outcpycsum; /* output checksum needs copy */ }; struct ipoption { @@ -133,6 +135,8 @@ enum ipstat_counters { ips_outswcsum, /* software checksummed on output */ ips_notmember, /* multicasts for 
unregistered groups */ ips_wrongif, /* packet received on wrong interface */ + ips_outbadcsum, /* output hardware checksum failed */ + ips_outcpycsum, /* output checksum needs copy */ ips_ncounters }; Index: sys/netinet/tcp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.137 diff -u -p -r1.137 tcp_var.h --- sys/netinet/tcp_var.h 23 Jan 2022 21:44:31 -0000 1.137 +++ sys/netinet/tcp_var.h 25 Jan 2022 22:19:42 -0000 @@ -434,6 +434,9 @@ struct tcpstat { u_int64_t tcps_sack_rcv_opts; /* SACK options received */ u_int64_t tcps_sack_snd_opts; /* SACK options sent */ u_int64_t tcps_sack_drop_opts; /* SACK options dropped */ + + u_int64_t tcps_outbadcsum; /* output hardware checksum failed */ + u_int64_t tcps_outcpycsum; /* output checksum needs copy */ }; /* @@ -605,6 +608,9 @@ enum tcpstat_counters { tcps_sack_rcv_opts, tcps_sack_snd_opts, tcps_sack_drop_opts, + tcps_outbadcsum, + tcps_outcpycsum, + tcps_ncounters, }; Index: sys/netinet/udp_var.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_var.h,v retrieving revision 1.35 diff -u -p -r1.35 udp_var.h --- sys/netinet/udp_var.h 22 Aug 2020 17:54:57 -0000 1.35 +++ sys/netinet/udp_var.h 25 Jan 2022 22:22:08 -0000 @@ -68,6 +68,8 @@ struct udpstat { /* output statistics: */ u_long udps_opackets; /* total output packets */ u_long udps_outswcsum; /* output software-csummed packets */ + u_long udps_outbadcsum; /* output hardware checksum failed */ + u_long udps_outcpycsum; /* output checksum needs copy */ }; /* @@ -111,6 +113,8 @@ enum udpstat_counters { /* output statistics: */ udps_opackets, /* total output packets */ udps_outswcsum, /* output software-csummed packets */ + udps_outbadcsum, /* output hardware checksum failed */ + udps_outcpycsum, /* output checksum needs copy */ udps_ncounters }; Index: sys/netinet6/ip6_var.h 
=================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_var.h,v retrieving revision 1.89 diff -u -p -r1.89 ip6_var.h --- sys/netinet6/ip6_var.h 1 Dec 2021 12:51:09 -0000 1.89 +++ sys/netinet6/ip6_var.h 25 Jan 2022 22:12:22 -0000 @@ -199,6 +199,8 @@ struct ip6stat { u_int64_t ip6s_forward_cachehit; u_int64_t ip6s_forward_cachemiss; u_int64_t ip6s_wrongif; + u_int64_t ip6s_outbadcsum; + u_int64_t ip6s_outcpycsum; }; #ifdef _KERNEL @@ -245,6 +247,9 @@ enum ip6stat_counters { ip6s_forward_cachehit = ip6s_sources_deprecated + 16, ip6s_forward_cachemiss, ip6s_wrongif, + ip6s_outbadcsum, + ip6s_outcpycsum, + ip6s_ncounters, }; Index: usr.bin/netstat/inet.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v retrieving revision 1.173 diff -u -p -r1.173 inet.c --- usr.bin/netstat/inet.c 5 Dec 2021 22:36:19 -0000 1.173 +++ usr.bin/netstat/inet.c 25 Jan 2022 23:11:34 -0000 @@ -408,6 +408,8 @@ tcp_stats(char *name) p(tcps_sndwinup, "\t\t%u window update packet%s\n"); p(tcps_sndctrl, "\t\t%u control packet%s\n"); p(tcps_outswcsum, "\t\t%u packet%s software-checksummed\n"); + p(tcps_outbadcsum, "\t%llu packet%s output hardware checksum failed\n"); + p(tcps_outcpycsum, "\t%llu packet%s output checksum needs copy\n"); p(tcps_rcvtotal, "\t%u packet%s received\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu byte%s)\n"); p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n"); @@ -540,6 +542,8 @@ udp_stats(char *name) p1(udps_nosum, "\t%lu with no checksum\n"); p(udps_inswcsum, "\t%lu input packet%s software-checksummed\n"); p(udps_outswcsum, "\t%lu output packet%s software-checksummed\n"); + p(udps_outbadcsum, "\t%lu packet%s output hardware checksum failed\n"); + p(udps_outcpycsum, "\t%lu packet%s output checksum needs copy\n"); p1(udps_noport, "\t%lu dropped due to no socket\n"); p(udps_noportbcast, "\t%lu broadcast/multicast 
datagram%s dropped due to no socket\n"); p1(udps_nosec, "\t%lu dropped due to missing IPsec protection\n"); @@ -610,6 +614,8 @@ ip_stats(char *name) p(ips_badaddr, "\t%lu datagram%s with bad address in header\n"); p(ips_inswcsum, "\t%lu input datagram%s software-checksummed\n"); p(ips_outswcsum, "\t%lu output datagram%s software-checksummed\n"); + p(ips_outbadcsum, "\t%lu packet%s output hardware checksum failed\n"); + p(ips_outcpycsum, "\t%lu packet%s output checksum needs copy\n"); p(ips_notmember, "\t%lu multicast packet%s which we don't join\n"); p(ips_wrongif, "\t%lu packet%s received on wrong interface\n"); #undef p Index: usr.bin/netstat/inet6.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet6.c,v retrieving revision 1.55 diff -u -p -r1.55 inet6.c --- usr.bin/netstat/inet6.c 26 Jan 2021 18:22:35 -0000 1.55 +++ usr.bin/netstat/inet6.c 25 Jan 2022 23:11:34 -0000 @@ -372,6 +372,8 @@ ip6_stats(char *name) p(ip6s_badscope, "\t%llu packet%s that violated scope rules\n"); p(ip6s_notmember, "\t%llu multicast packet%s which we don't join\n"); p(ip6s_wrongif, "\t%llu packet%s received on wrong interface\n"); + p(ip6s_outbadcsum, "\t%llu packet%s output hardware checksum failed\n"); + p(ip6s_outcpycsum, "\t%llu packet%s output checksum needs copy\n"); for (first = 1, i = 0; i < 256; i++) if (ip6stat.ip6s_nxthist[i] != 0) { if (first) {