This is a patch for the network stack and the em driver that enables TSO (TCP segmentation offload) on CURRENT. Previously I had problems getting it to work, but this version is functional.
I should note that CURRENT is being a pain right now: when I comment out em in the config, the kernel panics while coming up, so I had to substitute this code into the tree. Rather bizarre :) I have this functionality running on a 6.1-based system, and our test group is already testing against that driver; so far things are looking good. I have designed it so the driver can continue to be built without TSO support. There is also a sysctl in the stack code, so you can set net.inet.tcp.tso_enable on or off and compare. I know there may be some refinements to add, but I would like to get this into CURRENT as a start. Comments? Jack
--- dist/if_em.h Thu Aug 3 12:05:04 2006 +++ if_em.h Fri Sep 1 15:55:30 2006 @@ -36,6 +36,8 @@ #ifndef _EM_H_DEFINED_ #define _EM_H_DEFINED_ +#define TCP_TSO + /* Tunables */ /* @@ -138,6 +140,11 @@ #define EM_CHECKSUM_FEATURES (CSUM_TCP | CSUM_UDP) /* + * Inform the stack about transmit segmentation offload capabilities. + */ +#define EM_TCPSEG_FEATURES CSUM_TCPSEG + +/* * This parameter controls the duration of transmit watchdog timer. */ #define EM_TX_TIMEOUT 5 /* set to 5 seconds */ @@ -225,6 +232,7 @@ #define EM_RXBUFFER_16384 16384 #define EM_MAX_SCATTER 64 +#define EM_TSO_SIZE 65535 typedef enum _XSUM_CONTEXT_T { OFFLOAD_NONE, @@ -307,6 +315,7 @@ uint32_t txd_cmd; struct em_buffer *tx_buffer_area; bus_dma_tag_t txtag; /* dma tag for tx */ + uint32_t tx_tso; /* last tx was tso */ /* * Receive definitions --- dist/if_em.c Fri Aug 4 00:56:33 2006 +++ if_em.c Fri Sep 1 15:58:23 2006 @@ -72,6 +72,8 @@ #include <netinet/tcp.h> #include <netinet/udp.h> +#include <machine/in_cksum.h> + #include <dev/pci/pcivar.h> #include <dev/pci/pcireg.h> #include <dev/em/if_em_hw.h> @@ -229,6 +231,10 @@ struct mbuf *); static void em_transmit_checksum_setup(struct adapter *, struct mbuf *, uint32_t *, uint32_t *); +#ifdef TCP_TSO +static boolean_t em_tso_setup(struct adapter *, struct mbuf *, u_int32_t *, + uint32_t *); +#endif static void em_set_promisc(struct adapter *); static void em_disable_promisc(struct adapter *); static void em_set_multi(struct adapter *); @@ -302,6 +308,7 @@ #define E1000_TICKS_TO_USECS(ticks) ((1024 * (ticks) + 500) / 1000) #define E1000_USECS_TO_TICKS(usecs) ((1000 * (usecs) + 512) / 1024) +#define M_TSO_LEN 66 static int em_tx_int_delay_dflt = E1000_TICKS_TO_USECS(EM_TIDV); static int em_rx_int_delay_dflt = E1000_TICKS_TO_USECS(EM_RDTR); @@ -1061,11 +1068,14 @@ ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + ifp->if_hwassist = 0; if (adapter->hw.mac_type >= em_82543) { if (ifp->if_capenable & IFCAP_TXCSUM) 
ifp->if_hwassist = EM_CHECKSUM_FEATURES; - else - ifp->if_hwassist = 0; +#ifdef TCP_TSO + if (ifp->if_capenable & IFCAP_TCPSEG) + ifp->if_hwassist |= EM_TCPSEG_FEATURES; +#endif } callout_reset(&adapter->timer, hz, em_local_timer, adapter); @@ -1416,11 +1426,17 @@ struct m_tag *mtag; uint32_t txd_upper, txd_lower, txd_used, txd_saved; int nsegs, i, j; - int error; + int error, do_tso, tso_desc = 0; m_head = *m_headp; current_tx_desc = NULL; - txd_used = txd_saved = 0; + txd_upper = txd_lower = txd_used = txd_saved = 0; + +#ifdef TCP_TSO + do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TCPSEG) != 0); +#else + do_tso = 0; +#endif /* * Force a cleanup if number of TX descriptors @@ -1473,6 +1489,17 @@ *m_headp = m_head; } + /* + * TSO workaround: + * If an mbuf is only header we need + * to pull 4 bytes of data into it. + */ + if (do_tso && (m_head->m_len <= M_TSO_LEN)) { + m_head = m_pullup(m_head, M_TSO_LEN + 4); + if (m_head == NULL) + return (ENOBUFS); + } + /* * Map the packet for DMA. */ @@ -1487,23 +1514,43 @@ } KASSERT(nsegs != 0, ("em_encap: empty packet")); - if (nsegs > adapter->num_tx_desc_avail) { + /* + * TSO Hardware workaround, if this packet is not + * TSO, and is only a single descriptor long, and + * it follows a TSO burst, then we need to add a + * sentinel descriptor to prevent premature writeback. 
+ */ + if ((do_tso == 0) && (adapter->tx_tso == TRUE)) { + if (nsegs == 1) + tso_desc = TRUE; + adapter->tx_tso = FALSE; + } + + if (nsegs > adapter->num_tx_desc_avail - 2) { adapter->no_tx_desc_avail2++; error = ENOBUFS; goto encap_fail; } - if (ifp->if_hwassist > 0) - em_transmit_checksum_setup(adapter, m_head, &txd_upper, &txd_lower); - else - txd_upper = txd_lower = 0; + /* Do hardware assists */ + if ( ifp->if_hwassist > 0) { +#ifdef TCP_TSO + if (em_tso_setup(adapter, m_head, &txd_upper, &txd_lower)) { + /* we need to make a final sentinel transmit desc */ + tso_desc = TRUE; + } else +#endif + em_transmit_checksum_setup(adapter, m_head, + &txd_upper, &txd_lower); + } i = adapter->next_avail_tx_desc; - if (adapter->pcix_82544) { + if (adapter->pcix_82544) txd_saved = i; - txd_used = 0; - } + for (j = 0; j < nsegs; j++) { + bus_size_t seg_len; + bus_addr_t seg_addr; /* If adapter is 82544 and on PCIX bus. */ if(adapter->pcix_82544) { DESC_ARRAY desc_array; @@ -1537,26 +1584,57 @@ txd_used++; } } else { - tx_buffer = &adapter->tx_buffer_area[i]; - current_tx_desc = &adapter->tx_desc_base[i]; - - current_tx_desc->buffer_addr = htole64(segs[j].ds_addr); - current_tx_desc->lower.data = htole32( - adapter->txd_cmd | txd_lower | segs[j].ds_len); - current_tx_desc->upper.data = htole32(txd_upper); - - if (++i == adapter->num_tx_desc) - i = 0; - - tx_buffer->m_head = NULL; + tx_buffer = &adapter->tx_buffer_area[i]; + current_tx_desc = &adapter->tx_desc_base[i]; + seg_addr = htole64(segs[j].ds_addr); + seg_len = segs[j].ds_len; + /* + ** TSO Workaround: + ** If this is the last descriptor, we want to + ** split it so we have a small final sentinel + */ + if (tso_desc && (j == (nsegs -1)) && (seg_len > 8)) { + seg_len -= 4; + current_tx_desc->buffer_addr = seg_addr; + current_tx_desc->lower.data = htole32( + adapter->txd_cmd | txd_lower | seg_len); + current_tx_desc->upper.data = + htole32(txd_upper); + if (++i == adapter->num_tx_desc) + i = 0; + /* Now make the sentinel 
*/ + ++txd_used; /* using an extra txd */ + current_tx_desc = &adapter->tx_desc_base[i]; + tx_buffer = &adapter->tx_buffer_area[i]; + current_tx_desc->buffer_addr = + seg_addr + seg_len; + current_tx_desc->lower.data = htole32( + adapter->txd_cmd | txd_lower | 4); + current_tx_desc->upper.data = + htole32(txd_upper); + if (++i == adapter->num_tx_desc) + i = 0; + } else { + current_tx_desc->buffer_addr = seg_addr; + current_tx_desc->lower.data = htole32( + adapter->txd_cmd | txd_lower | seg_len); + current_tx_desc->upper.data = + htole32(txd_upper); + if (++i == adapter->num_tx_desc) + i = 0; + } + tx_buffer->m_head = NULL; } } adapter->next_avail_tx_desc = i; if (adapter->pcix_82544) adapter->num_tx_desc_avail -= txd_used; - else + else { adapter->num_tx_desc_avail -= nsegs; + if (tso_desc) /* TSO used an extra for sentinel */ + adapter->num_tx_desc_avail -= txd_used; + } if (mtag != NULL) { /* Set the vlan id. */ @@ -2226,6 +2304,15 @@ ifp->if_capenable |= IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM; } +#ifdef TCP_TSO + /* Enable TSO if available */ + if ((adapter->hw.mac_type > em_82544) && + (adapter->hw.mac_type != em_82547)) { + ifp->if_capabilities |= IFCAP_TCPSEG; + ifp->if_capenable |= IFCAP_TCPSEG; + } +#endif + /* * Tell the upper layer(s) we support long frames. */ @@ -2436,15 +2523,27 @@ static int em_setup_transmit_structures(struct adapter *adapter) { +#ifdef TCP_TSO + struct ifnet *ifp = adapter->ifp; +#endif device_t dev = adapter->dev; struct em_buffer *tx_buffer; - bus_size_t size; + bus_size_t size, segsize; int error, i; /* * Setup DMA descriptor areas. 
*/ - size = roundup2(adapter->hw.max_frame_size, MCLBYTES); + segsize = size = roundup2(adapter->hw.max_frame_size, MCLBYTES); + +#ifdef TCP_TSO + /* Overrides for TSO - want large sizes */ + if (ifp->if_hwassist & EM_TCPSEG_FEATURES) { + size = EM_TSO_SIZE; + segsize = PAGE_SIZE; + } +#endif + if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ @@ -2452,7 +2551,7 @@ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ EM_MAX_SCATTER, /* nsegments */ - size, /* maxsegsize */ + segsize, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ @@ -2713,6 +2812,87 @@ adapter->next_avail_tx_desc = curr_txd; } +#ifdef TCP_TSO +/********************************************************************** + * + * Setup work for hardware segmentation offload (TSO) + * + **********************************************************************/ +static boolean_t +em_tso_setup(struct adapter *adapter, + struct mbuf *mp, + u_int32_t *txd_upper, + u_int32_t *txd_lower) +{ + struct em_context_desc *TXD; + struct em_buffer *tx_buffer; + struct ip *ip; + struct tcphdr *th; + int curr_txd, hdr_len, ip_hlen, tcp_hlen; + + if (((mp->m_pkthdr.csum_flags & CSUM_TCPSEG) == 0) || + (mp->m_pkthdr.len <= E1000_TX_BUFFER_SIZE)) { + return FALSE; + } + + *txd_lower = (E1000_TXD_CMD_DEXT | + E1000_TXD_DTYP_D | + E1000_TXD_CMD_TSE); + + *txd_upper = (E1000_TXD_POPTS_IXSM | + E1000_TXD_POPTS_TXSM) << 8; + + curr_txd = adapter->next_avail_tx_desc; + tx_buffer = &adapter->tx_buffer_area[curr_txd]; + TXD = (struct em_context_desc *) &adapter->tx_desc_base[curr_txd]; + + mp->m_data += sizeof(struct ether_header); + ip = mtod(mp, struct ip *); + ip->ip_len = 0; + ip->ip_sum = 0; + ip_hlen = ip->ip_hl << 2 ; + th = (struct tcphdr *)((caddr_t)ip + ip_hlen); + tcp_hlen = th->th_off << 2; + + hdr_len = ETHER_HDR_LEN + ip_hlen + tcp_hlen; + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htons(IPPROTO_TCP)); + + 
mp->m_data -= sizeof(struct ether_header); + TXD->lower_setup.ip_fields.ipcss = ETHER_HDR_LEN; + TXD->lower_setup.ip_fields.ipcso = + ETHER_HDR_LEN + offsetof(struct ip, ip_sum); + TXD->lower_setup.ip_fields.ipcse = + htole16(ETHER_HDR_LEN + ip_hlen - 1); + + TXD->upper_setup.tcp_fields.tucss = + ETHER_HDR_LEN + ip_hlen; + TXD->upper_setup.tcp_fields.tucse = 0; + TXD->upper_setup.tcp_fields.tucso = + ETHER_HDR_LEN + ip_hlen + + offsetof(struct tcphdr, th_sum); + TXD->tcp_seg_setup.fields.mss = htole16(mp->m_pkthdr.tso_mss); + TXD->tcp_seg_setup.fields.hdr_len = hdr_len; + TXD->cmd_and_length = htole32(adapter->txd_cmd | + E1000_TXD_CMD_DEXT | + E1000_TXD_CMD_TSE | + E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP | + (mp->m_pkthdr.len - (hdr_len))); + + tx_buffer->m_head = NULL; + + if (++curr_txd == adapter->num_tx_desc) + curr_txd = 0; + + adapter->num_tx_desc_avail--; + adapter->next_avail_tx_desc = curr_txd; + adapter->tx_tso = TRUE; + + return TRUE; +} +#endif /* TCP_TSO */ + /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done @@ -3639,6 +3819,12 @@ (long long)adapter->stats.gprc); device_printf(dev, "Good Packets Xmtd = %lld\n", (long long)adapter->stats.gptc); +#ifdef TCP_TSO + device_printf(dev, "TSO Contexts Xmtd = %lld\n", + (long long)adapter->stats.tsctc); + device_printf(dev, "TSO Contexts Failed = %lld\n", + (long long)adapter->stats.tsctfc); +#endif } static int
diff -Naur sys.dist/conf/options sys/conf/options --- sys.dist/conf/options Wed Aug 2 22:19:33 2006 +++ sys/conf/options Thu Aug 31 16:55:23 2006 @@ -386,6 +386,7 @@ SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_SIGNATURE opt_inet.h +TCP_TSO opt_global.h TCP_SACK_DEBUG opt_tcp_sack.h TCP_DROP_SYNFIN opt_tcp_input.h DEV_VLAN opt_vlan.h diff -Naur sys.dist/net/if.h sys/net/if.h --- sys.dist/net/if.h Mon Jun 19 15:20:44 2006 +++ sys/net/if.h Thu Aug 31 12:33:40 2006 @@ -189,6 +189,7 @@ #define IFCAP_JUMBO_MTU 0x0020 /* 9000 byte MTU supported */ #define IFCAP_POLLING 0x0040 /* driver supports polling */ #define IFCAP_VLAN_HWCSUM 0x0080 /* can do IFCAP_HWCSUM on VLANs */ +#define IFCAP_TCPSEG 0x0100 /* can do TSO */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) diff -Naur sys.dist/netinet/ip_output.c sys/netinet/ip_output.c --- sys.dist/netinet/ip_output.c Thu Jun 29 06:38:36 2006 +++ sys/netinet/ip_output.c Thu Aug 31 15:16:44 2006 @@ -495,9 +495,11 @@ /* * If small enough for interface, or the interface will take - * care of the fragmentation for us, can just send directly. + * care of the fragmentation for us, or the interface is + * doing segmentation, we can just send directly. 
*/ - if (ip->ip_len <= ifp->if_mtu || (ifp->if_hwassist & CSUM_FRAGMENT && + if (ip->ip_len <= ifp->if_mtu || + (ifp->if_hwassist & (CSUM_FRAGMENT | CSUM_TCPSEG) && ((ip->ip_off & IP_DF) == 0))) { ip->ip_len = htons(ip->ip_len); ip->ip_off = htons(ip->ip_off); diff -Naur sys.dist/netinet/tcp_output.c sys/netinet/tcp_output.c --- sys.dist/netinet/tcp_output.c Thu Feb 23 13:14:34 2006 +++ sys/netinet/tcp_output.c Thu Aug 31 16:50:33 2006 @@ -105,6 +105,12 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, 0, "Enable NewReno Algorithms"); +#ifdef TCP_TSO +int tcp_enable_tso = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_enable, CTLFLAG_RW, &tcp_enable_tso, + 0, "Enable TCP Segmentation Offload"); +#endif + /* * Tcp output routine: figure out what should be sent and send it. */ @@ -130,6 +136,9 @@ #if 0 int maxburst = TCP_MAXBURST; #endif +#ifdef TCP_TSO + int tso_capable, use_tso; +#endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; @@ -139,6 +148,10 @@ INP_LOCK_ASSERT(tp->t_inpcb); +#ifdef TCP_TSO + if (tcp_enable_tso) + use_tso = tso_capable = tcp_checktso(tp); +#endif /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -380,9 +393,22 @@ * no longer contains the last data byte. */ if (len > tp->t_maxseg) { - len = tp->t_maxseg; +#ifdef TCP_TSO + if (use_tso) { + /* Even out the transmissions */ + len = ((ulmin(len, TCP_MAXWIN)/ tp->t_maxseg) * + tp->t_maxseg); + if (len <= tp->t_maxseg) + use_tso = 0; + } else +#endif + len = tp->t_maxseg; sendalot = 1; } +#ifdef TCP_TSO + else + use_tso = 0; +#endif if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; @@ -406,7 +432,7 @@ * - we need to retransmit */ if (len) { - if (len == tp->t_maxseg) + if (len >= tp->t_maxseg) /* TSO makes it possible to be greater */ goto send; /* * NOTE! 
on localhost connections an 'ack' from the remote @@ -703,6 +729,19 @@ * Clear the FIN bit because we cut off the tail of * the segment. */ +#ifdef TCP_TSO + if (use_tso) { + if (len + optlen + ipoptlen > TCP_MAXWIN) { + /* + * If there is still more to send, + * don't close the connection. + */ + flags &= ~TH_FIN; + len = TCP_MAXWIN - optlen - ipoptlen; + sendalot = 1; + } + } else +#endif if (len + optlen + ipoptlen > tp->t_maxopd) { /* * If there is still more to send, don't close the connection. @@ -944,6 +983,12 @@ /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); +#ifdef TCP_TSO + if (use_tso) { /* Setup for TSO */ + m->m_pkthdr.tso_mss = tp->t_maxseg; + m->m_pkthdr.csum_flags |= CSUM_TCPSEG; + } +#endif } /* @@ -1076,7 +1121,11 @@ * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. */ +#ifdef TCP_TSO + if (path_mtu_discovery && !use_tso) +#else if (path_mtu_discovery) +#endif ip->ip_off |= IP_DF; error = ip_output(m, tp->t_inpcb->inp_options, NULL, diff -Naur sys.dist/netinet/tcp_subr.c sys/netinet/tcp_subr.c --- sys.dist/netinet/tcp_subr.c Wed Aug 2 09:18:05 2006 +++ sys/netinet/tcp_subr.c Thu Aug 31 16:54:31 2006 @@ -1643,6 +1643,41 @@ } #endif /* INET6 */ +#ifdef TCP_TSO +/* + * Find the interface for this tcpcb and determine + * if a TSO hardware assist is available. 
+ */ +boolean_t +tcp_checktso(tp) + struct tcpcb *tp; +{ + struct in_conninfo *inc; + struct route sro; + struct sockaddr_in *dst; + struct ifnet *ifp; + + inc = &tp->t_inpcb->inp_inc; + bzero(&sro, sizeof(sro)); + if (inc->inc_faddr.s_addr != INADDR_ANY) { + dst = (struct sockaddr_in *)&sro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = inc->inc_faddr; + rtalloc_ign(&sro, RTF_CLONING); + } + if (sro.ro_rt != NULL) { + ifp = sro.ro_rt->rt_ifp; + if (ifp->if_hwassist & CSUM_TCPSEG) { + RTFREE(sro.ro_rt); + return TRUE; + } + RTFREE(sro.ro_rt); + } + return FALSE; +} +#endif + #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t diff -Naur sys.dist/netinet/tcp_var.h sys/netinet/tcp_var.h --- sys.dist/netinet/tcp_var.h Mon Jun 26 08:35:25 2006 +++ sys/netinet/tcp_var.h Thu Aug 31 12:42:25 2006 @@ -510,6 +510,10 @@ u_long tcp_maxmtu6(struct in_conninfo *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); +#ifdef TCP_TSO +boolean_t + tcp_checktso(struct tcpcb *); +#endif struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct inpcb * diff -Naur sys.dist/sys/mbuf.h sys/sys/mbuf.h --- sys.dist/sys/mbuf.h Sun Jul 23 18:49:57 2006 +++ sys/sys/mbuf.h Thu Aug 31 12:38:04 2006 @@ -110,6 +110,7 @@ /* variables for hardware checksum */ int csum_flags; /* flags regarding checksum */ int csum_data; /* data field used by csum routines */ + int tso_mss; /* TSO segment size */ SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ }; @@ -215,6 +216,7 @@ #define CSUM_UDP 0x0004 /* will csum UDP */ #define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ #define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define CSUM_TCPSEG 0x0020 /* eligible for TCP segmentation */ #define CSUM_IP_CHECKED 0x0100 /* did csum IP */ #define CSUM_IP_VALID 0x0200 /* ... the csum is valid */
_______________________________________________ freebsd-net@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/freebsd-net To unsubscribe, send any mail to "[EMAIL PROTECTED]"