On Tue, Jan 30, 2024 at 02:32:24PM +0100, Hrvoje Popovski wrote:
> yes, and forwarding only without pf.
> I'm sending traffic from host connected to vlan/ix0 and forward through
> em5 to other host.
> I'm sending 1Gbps of traffic with cisco t-rex
I cannot reproduce the problem.
ix0 at pci6 dev 0 function 0 "Intel 82599" rev 0x01, msix, 8 queues, address 90:e2:ba:d6:23:68
em1 at pci7 dev 0 function 1 "Intel I350" rev 0x01: msi, address a0:36:9f:0a:4a:c5
root@ot42:.../~# ifconfig ix0 hwfeatures
ix0: flags=2008843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST,LRO> mtu 1500
hwfeatures=71b7<CSUM_IPv4,CSUM_TCPv4,CSUM_UDPv4,VLAN_MTU,VLAN_HWTAGGING,CSUM_TCPv6,CSUM_UDPv6,TSOv4,TSOv6,LRO>
hardmtu 9198
lladdr 90:e2:ba:d6:23:68
description: Intel 82599
index 5 priority 0 llprio 3
media: Ethernet autoselect (10GSFP+Cu full-duplex,rxpause,txpause)
status: active
root@ot42:.../~# ifconfig em1 hwfeatures
em1: flags=8c43<UP,BROADCAST,RUNNING,OACTIVE,SIMPLEX,MULTICAST> mtu 1500
hwfeatures=31b7<CSUM_IPv4,CSUM_TCPv4,CSUM_UDPv4,VLAN_MTU,VLAN_HWTAGGING,CSUM_TCPv6,CSUM_UDPv6,TSOv4,TSOv6>
hardmtu 9216
lladdr a0:36:9f:0a:4a:c5
description: Intel I350
index 8 priority 0 llprio 3
media: Ethernet autoselect (1000baseT full-duplex,master)
status: active
inet 10.10.22.3 netmask 0xffffff00 broadcast 10.10.22.255
root@ot42:.../~# ifconfig vlan0 hwfeatures
vlan0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> mtu 1500
hwfeatures=3187<CSUM_IPv4,CSUM_TCPv4,CSUM_UDPv4,CSUM_TCPv6,CSUM_UDPv6,TSOv4,TSOv6>
hardmtu 9198
lladdr 90:e2:ba:d6:23:68
index 24 priority 0 llprio 3
encap: vnetid 221 parent ix0 txprio packet rxprio outer
groups: vlan
media: Ethernet autoselect (10GSFP+Cu full-duplex,rxpause,txpause)
status: active
inet 10.10.21.2 netmask 0xffffff00 broadcast 10.10.21.255
root@ot42:.../~# pfctl -si
Status: Disabled for 0 days 00:03:42 Debug: err
I am running tcpbench -n100 from Linux to Linux, forwarded through OpenBSD,
with a simultaneous udpbench to create a traffic mixture.
root@ot42:.../~# netstat -ss | egrep 'TSO|LRO'
1188 output TSO packets software chopped
33086906 output TSO packets hardware processed
265855748 output TSO packets generated
31090975 input LRO generated packets from hardware
176482178 input LRO coalesced packets by network device
Lots of LRO and TSO. I am running the diff below, which reverts the em TSO
backout and adds the sparc64 fix.
Hrvoje: What is different in your lab?
bluhm
Index: dev/pci/if_em.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_em.c,v
diff -u -p -r1.371 if_em.c
--- dev/pci/if_em.c 28 Jan 2024 18:42:58 -0000 1.371
+++ dev/pci/if_em.c 29 Jan 2024 14:37:36 -0000
@@ -291,6 +291,8 @@ void em_receive_checksum(struct em_softc
struct mbuf *);
u_int em_transmit_checksum_setup(struct em_queue *, struct mbuf *, u_int,
u_int32_t *, u_int32_t *);
+u_int em_tso_setup(struct em_queue *, struct mbuf *, u_int, u_int32_t *,
+ u_int32_t *);
u_int em_tx_ctx_setup(struct em_queue *, struct mbuf *, u_int, u_int32_t *,
u_int32_t *);
void em_iff(struct em_softc *);
@@ -1188,7 +1190,7 @@ em_flowstatus(struct em_softc *sc)
*
* This routine maps the mbufs to tx descriptors.
*
- * return 0 on success, positive on failure
+ * return 0 on failure, positive on success
**********************************************************************/
u_int
em_encap(struct em_queue *que, struct mbuf *m)
@@ -1236,7 +1238,15 @@ em_encap(struct em_queue *que, struct mb
}
if (sc->hw.mac_type >= em_82575 && sc->hw.mac_type <= em_i210) {
- used += em_tx_ctx_setup(que, m, head, &txd_upper, &txd_lower);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
+ used += em_tso_setup(que, m, head, &txd_upper,
+ &txd_lower);
+ if (!used)
+ return (used);
+ } else {
+ used += em_tx_ctx_setup(que, m, head, &txd_upper,
+ &txd_lower);
+ }
} else if (sc->hw.mac_type >= em_82543) {
used += em_transmit_checksum_setup(que, m, head,
&txd_upper, &txd_lower);
@@ -1569,6 +1579,21 @@ em_update_link_status(struct em_softc *s
ifp->if_link_state = link_state;
if_link_state_change(ifp);
}
+
+ /* Disable TSO for 10/100 speeds to avoid some hardware issues */
+ switch (sc->link_speed) {
+ case SPEED_10:
+ case SPEED_100:
+ if (sc->hw.mac_type >= em_82575 && sc->hw.mac_type <= em_i210) {
+ ifp->if_capabilities &= ~IFCAP_TSOv4;
+ ifp->if_capabilities &= ~IFCAP_TSOv6;
+ }
+ break;
+ case SPEED_1000:
+ if (sc->hw.mac_type >= em_82575 && sc->hw.mac_type <= em_i210)
+ ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
+ break;
+ }
}
/*********************************************************************
@@ -1988,6 +2013,7 @@ em_setup_interface(struct em_softc *sc)
if (sc->hw.mac_type >= em_82575 && sc->hw.mac_type <= em_i210) {
ifp->if_capabilities |= IFCAP_CSUM_IPv4;
ifp->if_capabilities |= IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+ ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
}
/*
@@ -2231,9 +2257,9 @@ em_setup_transmit_structures(struct em_s
for (i = 0; i < sc->sc_tx_slots; i++) {
pkt = &que->tx.sc_tx_pkts_ring[i];
- error = bus_dmamap_create(sc->sc_dmat, MAX_JUMBO_FRAME_SIZE,
+ error = bus_dmamap_create(sc->sc_dmat, EM_TSO_SIZE,
EM_MAX_SCATTER / (sc->pcix_82544 ? 2 : 1),
- MAX_JUMBO_FRAME_SIZE, 0, BUS_DMA_NOWAIT, &pkt->pkt_map);
+ EM_TSO_SEG_SIZE, 0, BUS_DMA_NOWAIT, &pkt->pkt_map);
if (error != 0) {
printf("%s: Unable to create TX DMA map\n",
DEVNAME(sc));
@@ -2403,6 +2429,81 @@ em_free_transmit_structures(struct em_so
0, que->tx.sc_tx_dma.dma_map->dm_mapsize,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
}
+}
+
+u_int
+em_tso_setup(struct em_queue *que, struct mbuf *mp, u_int head,
+ u_int32_t *olinfo_status, u_int32_t *cmd_type_len)
+{
+ struct ether_extracted ext;
+ struct e1000_adv_tx_context_desc *TD;
+ uint32_t vlan_macip_lens = 0, type_tucmd_mlhl = 0, mss_l4len_idx = 0;
+ uint32_t paylen = 0;
+ uint8_t iphlen = 0;
+
+ *olinfo_status = 0;
+ *cmd_type_len = 0;
+ TD = (struct e1000_adv_tx_context_desc *)&que->tx.sc_tx_desc_ring[head];
+
+#if NVLAN > 0
+ if (ISSET(mp->m_flags, M_VLANTAG)) {
+ uint32_t vtag = mp->m_pkthdr.ether_vtag;
+ vlan_macip_lens |= vtag << E1000_ADVTXD_VLAN_SHIFT;
+ *cmd_type_len |= E1000_ADVTXD_DCMD_VLE;
+ }
+#endif
+
+ ether_extract_headers(mp, &ext);
+ if (ext.tcp == NULL)
+ goto out;
+
+ vlan_macip_lens |= (sizeof(*ext.eh) << E1000_ADVTXD_MACLEN_SHIFT);
+
+ if (ext.ip4) {
+ iphlen = ext.ip4->ip_hl << 2;
+
+ type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
+ *olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
+#ifdef INET6
+ } else if (ext.ip6) {
+ iphlen = sizeof(*ext.ip6);
+
+ type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
+#endif
+ } else {
+ goto out;
+ }
+
+ *cmd_type_len |= E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS;
+ *cmd_type_len |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DCMD_TSE;
+ paylen = mp->m_pkthdr.len - sizeof(*ext.eh) - iphlen -
+ (ext.tcp->th_off << 2);
+ *olinfo_status |= paylen << E1000_ADVTXD_PAYLEN_SHIFT;
+ vlan_macip_lens |= iphlen;
+ type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
+
+ type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
+ *olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
+
+ mss_l4len_idx |= mp->m_pkthdr.ph_mss << E1000_ADVTXD_MSS_SHIFT;
+ mss_l4len_idx |= (ext.tcp->th_off << 2) << E1000_ADVTXD_L4LEN_SHIFT;
+ /* 82575 needs the queue index added */
+ if (que->sc->hw.mac_type == em_82575)
+ mss_l4len_idx |= (que->me & 0xff) << 4;
+
+ htolem32(&TD->vlan_macip_lens, vlan_macip_lens);
+ htolem32(&TD->type_tucmd_mlhl, type_tucmd_mlhl);
+ htolem32(&TD->u.seqnum_seed, 0);
+ htolem32(&TD->mss_l4len_idx, mss_l4len_idx);
+
+ tcpstat_add(tcps_outpkttso, (paylen + mp->m_pkthdr.ph_mss - 1) /
+ mp->m_pkthdr.ph_mss);
+
+ return 1;
+
+out:
+ tcpstat_inc(tcps_outbadtso);
+ return 0;
}
u_int
Index: dev/pci/if_em.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_em.h,v
diff -u -p -r1.82 if_em.h
--- dev/pci/if_em.h 28 Jan 2024 18:42:58 -0000 1.82
+++ dev/pci/if_em.h 29 Jan 2024 14:37:36 -0000
@@ -55,11 +55,14 @@ POSSIBILITY OF SUCH DAMAGE.
#include <net/if.h>
#include <net/if_media.h>
+#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#if NBPFILTER > 0
@@ -269,6 +272,7 @@ typedef int boolean_t;
#define EM_MAX_SCATTER 64
#define EM_TSO_SIZE 65535
+#define EM_TSO_SEG_SIZE 4096 /* Max dma segment size */
struct em_packet {
int pkt_eop; /* Index of the desc to watch */
Index: dev/pci/if_em_hw.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_em_hw.h,v
diff -u -p -r1.92 if_em_hw.h
--- dev/pci/if_em_hw.h 28 Jan 2024 18:42:58 -0000 1.92
+++ dev/pci/if_em_hw.h 29 Jan 2024 14:37:36 -0000
@@ -2150,6 +2150,7 @@ struct e1000_adv_tx_context_desc {
#define E1000_ADVTXD_DCMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */
#define E1000_ADVTXD_DCMD_DEXT 0x20000000 /* Descriptor extension (1=Adv) */
#define E1000_ADVTXD_DCMD_VLE 0x40000000 /* VLAN pkt enable */
+#define E1000_ADVTXD_DCMD_TSE 0x80000000 /* TCP Seg enable */
#define E1000_ADVTXD_PAYLEN_SHIFT 14 /* Adv desc PAYLEN shift */
/* Adv Transmit Descriptor Config Masks */
@@ -2159,6 +2160,10 @@ struct e1000_adv_tx_context_desc {
#define E1000_ADVTXD_TUCMD_IPV6 0x00000000 /* IP Packet Type: 0=IPv6 */
#define E1000_ADVTXD_TUCMD_L4T_UDP 0x00000000 /* L4 Packet TYPE of UDP */
#define E1000_ADVTXD_TUCMD_L4T_TCP 0x00000800 /* L4 Packet TYPE of TCP */
+
+/* Req requires Markers and CRC */
+#define E1000_ADVTXD_L4LEN_SHIFT 8 /* Adv ctxt L4LEN shift */
+#define E1000_ADVTXD_MSS_SHIFT 16 /* Adv ctxt MSS shift */
/* Multiple Receive Queue Control */
#define E1000_MRQC_ENABLE_MASK 0x00000003
Index: netinet/tcp.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp.h,v
diff -u -p -r1.24 tcp.h
--- netinet/tcp.h 19 May 2023 01:04:39 -0000 1.24
+++ netinet/tcp.h 29 Jan 2024 14:37:21 -0000
@@ -51,11 +51,11 @@ struct tcphdr {
tcp_seq th_seq; /* sequence number */
tcp_seq th_ack; /* acknowledgement number */
#if _BYTE_ORDER == _LITTLE_ENDIAN
- u_int32_t th_x2:4, /* (unused) */
+ u_int8_t th_x2:4, /* (unused) */
th_off:4; /* data offset */
#endif
#if _BYTE_ORDER == _BIG_ENDIAN
- u_int32_t th_off:4, /* data offset */
+ u_int8_t th_off:4, /* data offset */
th_x2:4; /* (unused) */
#endif
u_int8_t th_flags;