if_parse_packet(): refactor packet parsing on driver level
Hi, We have a lot of redundant code on the network device driver layer, that parses the content of mbufs for ethernet, ip and tcp header. This diff introduces a new function if_parse_packet() to centralize this feature. It just refactors ix(4) and ixl(4) code because, I could test this cards and won't blowup this diff. But, igc(3), ale(4) and oce(4) could also be improved with this. Beside of refactoring, we'll need this kind of code in ix(4) and other drivers for better checksum and TSO support. I'm not sure about the correct naming or place for this helper function. Thus, nitpicking and bike shading is welcome. :) bye, Jan Index: dev/pci/if_ix.c === RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 24 Oct 2022 13:51:22 - @@ -2477,25 +2477,18 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; int offload = 0; - uint32_t iphlen; uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + struct if_hdr hdr; - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; + if_parse_packet(mp, ); - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + *vlan_macip_lens |= (hdr.l2len << IXGBE_ADVTXD_MACLEN_SHIFT); - iphlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + switch (ntohs(hdr.eth->ether_type)) { + case ETHERTYPE_IP: { + ipproto = hdr.ip4->ip_p; if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; @@ -2508,15 +2501,7 @@ ixgbe_csum_offload(struct mbuf *mp, uint #ifdef INET6 case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - - iphlen = sizeof(*ip6); - ipproto = ip6->ip6_nxt; - + ipproto = hdr.ip6->ip6_nxt; *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; break; } @@ -2526,7 +2511,7 @@ ixgbe_csum_offload(struct mbuf *mp, uint return offload; } - *vlan_macip_lens |= iphlen; + *vlan_macip_lens |= hdr.l3len; switch (ipproto) { case IPPROTO_TCP: Index: dev/pci/if_ixgb.h === RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixgb.h,v retrieving revision 1.19 diff -u -p -r1.19 if_ixgb.h --- dev/pci/if_ixgb.h 24 Nov 2015 17:11:39 - 1.19 +++ dev/pci/if_ixgb.h 24 Oct 2022 13:27:43 - @@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include Index: dev/pci/if_ixl.c === RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.84 diff -u -p -r1.84 if_ixl.c --- dev/pci/if_ixl.c5 Aug 2022 13:57:16 - 1.84 +++ dev/pci/if_ixl.c24 Oct 2022 16:34:29 - @@ -2784,11 +2784,12 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; - uint64_t hlen; uint8_t ipproto; uint64_t offload = 0; + struct if_hdr hdr; + + memset(, 0, sizeof(hdr)); + if_parse_packet(m0, ); if (ISSET(m0->m_flags, M_VLANTAG)) { uint64_t vtag = m0->m_pkthdr.ether_vtag; @@ -2800,35 +2801,20 @@ ixl_tx_setup_offload(struct mbuf *m0) M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) return (offload); - switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) { + switch (ntohs(hdr.eth->ether_type)) { case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); - offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? IXL_TX_DESC_CMD_IIPT_IPV4_CSUM : IXL_TX_DESC_CMD_IIPT_IPV4; - hlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + ipproto = hdr.ip4->ip_p; break; } #ifdef INET6 case ETHERTYPE_IPV6:
Fix kernel build without IPSEC option
Hi, if you build the kernel without IPSEC it will run into several compiler and linker errors. This diff add some missing #ifdefs to fix this. ok? bye, jan Index: net/if_pfsync.c === RCS file: /mount/openbsd/cvs/src/sys/net/if_pfsync.c,v retrieving revision 1.305 diff -u -p -r1.305 if_pfsync.c --- net/if_pfsync.c 21 Apr 2022 15:22:49 - 1.305 +++ net/if_pfsync.c 2 Nov 2022 10:20:38 - @@ -1576,7 +1576,9 @@ pfsync_grab_snapshot(struct pfsync_snaps int q; struct pf_state *st; struct pfsync_upd_req_item *ur; +#if defined(IPSEC) struct tdb *tdb; +#endif sn->sn_sc = sc; @@ -1602,6 +1604,7 @@ pfsync_grab_snapshot(struct pfsync_snaps } TAILQ_INIT(>sn_tdb_q); +#if defined(IPSEC) while ((tdb = TAILQ_FIRST(>sc_tdb_q)) != NULL) { TAILQ_REMOVE(>sc_tdb_q, tdb, tdb_sync_entry); TAILQ_INSERT_TAIL(>sn_tdb_q, tdb, tdb_sync_snap); @@ -1611,6 +1614,7 @@ pfsync_grab_snapshot(struct pfsync_snaps SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); mtx_leave(>tdb_mtx); } +#endif sn->sn_len = sc->sc_len; sc->sc_len = PFSYNC_MINPKT; @@ -1630,7 +1634,9 @@ pfsync_drop_snapshot(struct pfsync_snaps { struct pf_state *st; struct pfsync_upd_req_item *ur; +#if defined(IPSEC) struct tdb *t; +#endif int q; for (q = 0; q < PFSYNC_S_COUNT; q++) { @@ -1652,6 +1658,7 @@ pfsync_drop_snapshot(struct pfsync_snaps pool_put(>sn_sc->sc_pool, ur); } +#if defined(IPSEC) while ((t = TAILQ_FIRST(>sn_tdb_q)) != NULL) { TAILQ_REMOVE(>sn_tdb_q, t, tdb_sync_snap); mtx_enter(>tdb_mtx); @@ -1660,6 +1667,7 @@ pfsync_drop_snapshot(struct pfsync_snaps CLR(t->tdb_flags, TDBF_PFSYNC); mtx_leave(>tdb_mtx); } +#endif } int @@ -1748,7 +1756,6 @@ pfsync_sendout(void) struct pfsync_subheader *subh; struct pf_state *st; struct pfsync_upd_req_item *ur; - struct tdb *t; int offset; int q, count = 0; @@ -1842,7 +1849,10 @@ pfsync_sendout(void) sn.sn_plus = NULL; /* XXX memory leak ? */ } +#if defined(IPSEC) if (!TAILQ_EMPTY(_tdb_q)) { + struct tdb *t; + subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); @@ -1865,6 +1875,7 @@ pfsync_sendout(void) subh->len = sizeof(struct pfsync_tdb) >> 2; subh->count = htons(count); } +#endif /* walk the queues */ for (q = 0; q < PFSYNC_S_COUNT; q++) { @@ -2486,6 +2497,7 @@ pfsync_q_del(struct pf_state *st) pf_state_unref(st); } +#if defined(IPSEC) void pfsync_update_tdb(struct tdb *t, int output) { @@ -2540,7 +2552,9 @@ pfsync_update_tdb(struct tdb *t, int out CLR(t->tdb_flags, TDBF_PFSYNC_RPL); mtx_leave(>tdb_mtx); } +#endif +#if defined(IPSEC) void pfsync_delete_tdb(struct tdb *t) { @@ -2576,6 +2590,7 @@ pfsync_delete_tdb(struct tdb *t) tdb_unref(t); } +#endif void pfsync_out_tdb(struct tdb *t, void *buf) Index: netinet/ip_ipsp.c === RCS file: /mount/openbsd/cvs/src/sys/netinet/ip_ipsp.c,v retrieving revision 1.273 diff -u -p -r1.273 ip_ipsp.c --- netinet/ip_ipsp.c 6 Aug 2022 15:57:59 - 1.273 +++ netinet/ip_ipsp.c 2 Nov 2022 12:09:22 - @@ -1081,7 +1081,7 @@ tdb_free(struct tdb *tdbp) tdbp->tdb_xform = NULL; } -#if NPFSYNC > 0 +#if NPFSYNC > 0 && defined(IPSEC) /* Cleanup pfsync references */ pfsync_delete_tdb(tdbp); #endif
Re: refactor mbuf parsing on driver level
On Thu, Jan 19, 2023 at 12:02:29PM +0300, Vitaliy Makkoveev wrote: > On Thu, Jan 19, 2023 at 01:55:57AM +0300, Vitaliy Makkoveev wrote: > > > On 19 Jan 2023, at 01:39, Jan Klemkow wrote: > > > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote: > > >> On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote: > > >>> we have several drivers which have to parse the content of mbufs. This > > >>> diff suggest a central parsing function for this. Thus, we can reduce > > >>> redundant code. > > >>> > > >>> I just start with ix(4) and ixl(4) because it was easy to test for me. > > >>> But, this could also improve em(4), igc(4), ale(4) and oce(4). > > >>> > > >>> I'm not sure about the name, the api nor the place of this code. So, if > > >>> someone has a better idea: i'm open to anything. > > >> > > >> I like code this deduplication. > > >> > > >> This newly introduced function doesn't touch ifnet but only extracts > > >> protocol headers from mbuf(9). I guess mbuf_extract_headers() or > > >> something like is much better for name with the ern/uipc_mbuf2.c as > > >> place. > > > > > > Good Point. Updates diff below. > > > > > > + > > > +/* Parse different TCP/IP protocol headers for a quick view inside an > > > mbuf. */ > > > +void > > > +m_exract_headers(struct mbuf *mp, struct ether_header **eh, struct ip > > > **ip4, > > > +struct ip6_hdr **ip6, struct tcphdr **tcp, struct udphdr **udp) > > > + > > > > Should be m_extract_headers(). The rest of the diff looks good to me. > > > > Please wait. > > The mandatory nullification of `ip4', `ip6' and other variables passed > to m_exract_headers() is not obvious. It is much better to return > the integer result of extraction like m_tag_copy_chain() does. Yes, the mandatory nullification seems to be more errorprone. In my opinion is the number of results it not that useful. You have to check the retuned pointers anyway. I moved the nullification inside of m_exract_headers(). Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 19 Jan 2023 09:29:10 - @@ -2477,23 +2477,18 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_header *eh; + struct ip *ip; + struct ip6_hdr *ip6; int offload = 0; uint32_t iphlen; uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + m_extract_headers(mp, , , , NULL, NULL); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + if (ip) { iphlen = ip->ip_hl << 2; ipproto = ip->ip_p; @@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - + } else if (ip6) { iphlen = sizeof(*ip6); ipproto = ip6->ip6_nxt; *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.84 diff -u -p -r1.84 if_ixl.c --- dev/pci/if_ixl.c5 Aug 2022 13:57:16 - 1.84 +++ dev/pci/if_ixl.c19 Jan 2023 09:29:17 - @@ -2784,8 +2784,10 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; + struct ether_h
refactor mbuf parsing on driver level
Hi, we have several drivers which have to parse the content of mbufs. This diff suggest a central parsing function for this. Thus, we can reduce redundant code. I just start with ix(4) and ixl(4) because it was easy to test for me. But, this could also improve em(4), igc(4), ale(4) and oce(4). I'm not sure about the name, the api nor the place of this code. So, if someone has a better idea: i'm open to anything. bye, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 17 Jan 2023 16:31:19 - @@ -2477,23 +2477,18 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_header *eh = NULL; + struct ip *ip = NULL; + struct ip6_hdr *ip6 = NULL; int offload = 0; uint32_t iphlen; uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + if_parse(mp, , , , NULL, NULL); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + if (ip) { iphlen = ip->ip_hl << 2; ipproto = ip->ip_p; @@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - + } else if (ip6) { iphlen = sizeof(*ip6); ipproto = ip6->ip6_nxt; *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.84 diff -u -p -r1.84 if_ixl.c --- dev/pci/if_ixl.c5 Aug 2022 13:57:16 - 1.84 +++ dev/pci/if_ixl.c16 Jan 2023 23:58:05 - @@ -2784,12 +2784,15 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; + struct ether_header *eh = NULL; + struct ip *ip = NULL; + struct ip6_hdr *ip6 = NULL; + struct tcphdr *th = NULL; uint64_t hlen; uint8_t ipproto; uint64_t offload = 0; + if (ISSET(m0->m_flags, M_VLANTAG)) { uint64_t vtag = m0->m_pkthdr.ether_vtag; offload |= IXL_TX_DESC_CMD_IL2TAG1; @@ -2800,39 +2803,23 @@ ixl_tx_setup_offload(struct mbuf *m0) M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) return (offload); - switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + if_parse(m0, , , , , NULL); + if (ip) { offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? IXL_TX_DESC_CMD_IIPT_IPV4_CSUM : IXL_TX_DESC_CMD_IIPT_IPV4; hlen = ip->ip_hl << 2; ipproto = ip->ip_p; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - + } else if (ip6) { offload |= IXL_TX_DESC_CMD_IIPT_IPV6; hlen = sizeof(*ip6); ipproto = ip6->ip6_nxt; - break; - } #endif - default: + } else { panic("CSUM_OUT set for non-IP packet"); /* NOTREACHED */ } @@ -2842,15 +2829,12 @@ ixl_tx_setup_offload(struct mbuf *m0) switch (ipproto) { case IPPROTO_TCP: { - struct tcphdr *th; - if (!ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT))
Re: mem.4: be more accurate about securelevel
On Tue, Jan 17, 2023 at 11:02:07PM +0100, Theo Buehler wrote: > > at least this tool works for me: > > Surely you have kern.allowkmem=1 set. Yes, I do.
mem.4: be more accurate about securelevel
Hi, This diff adjust the manpage of mem(4) to be more accurate. You can open(2) mem(4) in securelevel 1 in readonly mode, but not writable. kern/spec_vnops.c: if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { ... /* * When running in secure mode, do not allow opens * for writing of /dev/mem, /dev/kmem, or character * devices whose corresponding block devices are * currently mounted. */ if (securelevel >= 1) { ... if (iskmemdev(dev)) return (EPERM); } } OK? bye, Jan Index: man4.alpha/mem.4 === RCS file: /cvs/src/share/man/man4/man4.alpha/mem.4,v retrieving revision 1.6 diff -u -p -r1.6 mem.4 --- man4.alpha/mem.412 Jan 2018 04:36:44 - 1.6 +++ man4.alpha/mem.417 Jan 2023 18:51:10 - @@ -62,7 +62,7 @@ kernel virtual memory begins at .Li 0xfc23 . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.amd64/mem.4 === RCS file: /cvs/src/share/man/man4/man4.amd64/mem.4,v retrieving revision 1.6 diff -u -p -r1.6 mem.4 --- man4.amd64/mem.412 Jan 2018 04:36:44 - 1.6 +++ man4.amd64/mem.417 Jan 2023 18:48:23 - @@ -63,7 +63,7 @@ The kernel virtual memory begins at addr .Li 0x8000 . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.hppa/mem.4 === RCS file: /cvs/src/share/man/man4/man4.hppa/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.hppa/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.hppa/mem.4 17 Jan 2023 18:52:28 - @@ -51,7 +51,7 @@ On hppa, the physical memory range is al address 0; kernel virtual memory begins at address 0 as well. .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.i386/mem.4 === RCS file: /cvs/src/share/man/man4/man4.i386/mem.4,v retrieving revision 1.12 diff -u -p -r1.12 mem.4 --- man4.i386/mem.4 12 Jan 2018 04:36:44 - 1.12 +++ man4.i386/mem.4 17 Jan 2023 18:53:00 - @@ -63,7 +63,7 @@ long, and ends at virtual address .Li 0xfe00 . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.landisk/mem.4 === RCS file: /cvs/src/share/man/man4/man4.landisk/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.landisk/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.landisk/mem.4 17 Jan 2023 18:53:54 - @@ -58,7 +58,7 @@ The kernel virtual memory begins at addr .Li 0xc000 . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.loongson/mem.4 === RCS file: /cvs/src/share/man/man4/man4.loongson/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.loongson/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.loongson/mem.4 17 Jan 2023 18:54:33 - @@ -88,7 +88,7 @@ The kernel virtual memory begins at addr .Ad 0xc000 . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.luna88k/mem.4 === RCS file: /cvs/src/share/man/man4/man4.luna88k/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.luna88k/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.luna88k/mem.4 17 Jan 2023 18:54:47 - @@ -62,7 +62,7 @@ kernel virtual memory begins at .Ad 0x . .Pp Even with sufficient file system permissions, -these devices can only be opened when the +these devices can only be opened writable when the .Xr securelevel 7 is insecure or when the .Va kern.allowkmem Index: man4.macppc/mem.4
Re: mem.4: be more accurate about securelevel
On Tue, Jan 17, 2023 at 04:23:48PM -0500, Bryan Steele wrote: > On Tue, Jan 17, 2023 at 09:37:24PM +0100, Jan Klemkow wrote: > > Hi, > > > > This diff adjust the manpage of mem(4) to be more accurate. You can > > open(2) mem(4) in securelevel 1 in readonly mode, but not writable. > > > > kern/spec_vnops.c: > > > > if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { > > ... > > /* > > * When running in secure mode, do not allow opens > > * for writing of /dev/mem, /dev/kmem, or character > > * devices whose corresponding block devices are > > * currently mounted. > > */ > > if (securelevel >= 1) { > > ... > > if (iskmemdev(dev)) > > return (EPERM); > > } > > } > > > > OK? > > > > bye, > > Jan > > Are you sure about that? Have you tested it? > > https://github.com/openbsd/src/commit/19aedf236181e81baf170421900911c82671fae4 at least this tool works for me: #include #include #include #include #include #include #include int main(void) { kvm_t *kd; int mem; struct nlist nl[] = { {"_ix_debug_ioctl"}, {NULL} }; char errbuf[_POSIX2_LINE_MAX]; if ((kd = kvm_open(_PATH_KSYMS, NULL, NULL, O_RDWR, errbuf)) == NULL) errx(EXIT_FAILURE, "%s", errbuf); if (kvm_nlist(kd, nl) == -1) errx(EXIT_SUCCESS, "%s", kvm_geterr(kd)); if (kvm_read(kd, nl[0].n_value, , sizeof mem) != sizeof(mem)) errx(EXIT_SUCCESS, "%s", kvm_geterr(kd)); printf("mem: %d\n", mem); mem = 1; if (kvm_write(kd, nl[0].n_value, , sizeof mem) != sizeof(mem)) errx(EXIT_SUCCESS, "%s", kvm_geterr(kd)); if (kvm_close(kd) == -1) err(EXIT_FAILURE, "kvm_close"); return EXIT_SUCCESS; }
Re: refactor mbuf parsing on driver level
On Thu, Jan 19, 2023 at 02:55:29PM +0300, Vitaliy Makkoveev wrote: > On Thu, Jan 19, 2023 at 10:40:52AM +0100, Jan Klemkow wrote: > > On Thu, Jan 19, 2023 at 12:02:29PM +0300, Vitaliy Makkoveev wrote: > > > On Thu, Jan 19, 2023 at 01:55:57AM +0300, Vitaliy Makkoveev wrote: > > > > > On 19 Jan 2023, at 01:39, Jan Klemkow wrote: > > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote: > > > > >> On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote: > > > > >>> we have several drivers which have to parse the content of mbufs. > > > > >>> This > > > > >>> diff suggest a central parsing function for this. Thus, we can > > > > >>> reduce > > > > >>> redundant code. > > > > >>> > > > > >>> I just start with ix(4) and ixl(4) because it was easy to test for > > > > >>> me. > > > > >>> But, this could also improve em(4), igc(4), ale(4) and oce(4). > > > > >>> > > > > >>> I'm not sure about the name, the api nor the place of this code. > > > > >>> So, if > > > > >>> someone has a better idea: i'm open to anything. > > > > >> > > > > >> I like code this deduplication. > > > > >> > > > > >> This newly introduced function doesn't touch ifnet but only extracts > > > > >> protocol headers from mbuf(9). I guess mbuf_extract_headers() or > > > > >> something like is much better for name with the ern/uipc_mbuf2.c as > > > > >> place. > > > > > > > > > > Good Point. Updates diff below. > > > > > > > > > > + > > > > > +/* Parse different TCP/IP protocol headers for a quick view inside > > > > > an mbuf. */ > > > > > +void > > > > > +m_exract_headers(struct mbuf *mp, struct ether_header **eh, struct > > > > > ip **ip4, > > > > > +struct ip6_hdr **ip6, struct tcphdr **tcp, struct udphdr **udp) > > > > > + > > > > > > > > Should be m_extract_headers(). The rest of the diff looks good to me. > > > > > > Please wait. > > > > > > The mandatory nullification of `ip4', `ip6' and other variables passed > > > to m_exract_headers() is not obvious. It is much better to return > > > the integer result of extraction like m_tag_copy_chain() does. > > > > Yes, the mandatory nullification seems to be more errorprone. In my > > opinion is the number of results it not that useful. You have to check > > the retuned pointers anyway. > > > > I moved the nullification inside of m_exract_headers(). > > This is better. I also like the last return statement be removed from > m_extract_headers() before commit. Fixed below. Plus a suggestion from mpi to not pollute the namespace with all the headers in mbuf.h. Moved them to uipc_mbuf2.c. Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 19 Jan 2023 09:29:10 - @@ -2477,23 +2477,18 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_header *eh; + struct ip *ip; + struct ip6_hdr *ip6; int offload = 0; uint32_t iphlen; uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + m_extract_headers(mp, , , , NULL, NULL); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + if (ip) { iphlen = ip->ip_hl << 2; ipproto = ip->ip_p; @@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp,
Re: refactor mbuf parsing on driver level
On Tue, Jan 24, 2023 at 09:32:53PM +1000, David Gwynne wrote: > On Mon, Jan 23, 2023 at 09:25:34AM +0100, Jan Klemkow wrote: > > On Wed, Jan 18, 2023 at 03:49:25PM -0700, Theo de Raadt wrote: > > > Jan Klemkow wrote: > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote: > > > > > On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote: > > > > > > we have several drivers which have to parse the content of mbufs. > > > > > > This > > > > > > diff suggest a central parsing function for this. Thus, we can > > > > > > reduce > > > > > > redundant code. > > > > > > > > > > > > I just start with ix(4) and ixl(4) because it was easy to test for > > > > > > me. > > > > > > But, this could also improve em(4), igc(4), ale(4) and oce(4). > > > > > > > > > > > > I'm not sure about the name, the api nor the place of this code. > > > > > > So, if > > > > > > someone has a better idea: i'm open to anything. > > > > > > > > > > I like code this deduplication. > > > > > > > > > > This newly introduced function doesn't touch ifnet but only extracts > > > > > protocol headers from mbuf(9). I guess mbuf_extract_headers() or > > > > > something like is much better for name with the ern/uipc_mbuf2.c as > > > > > place. > > > > > > > > Good Point. Updates diff below. > > > > > > I agree, "extract" is a better name. dlg, do you have a comment? > > > > Whats you opinion about this diff? > > it makes ix and ixl prettier, so that's a good enough reason to do > it. it should go in net/if_ethersubr.c as ether_extract_headers() > though. > > could you try using a struct to carry the header pointers around and see > what that looks like? > > struct ether_extracted { > struct ether_header *eh; > struct ip *ip4; > struct ip6_hdr *ip6; > struct tcphdr *tcp; > struct udphdr *udp; > }; > > void ether_extract_headers(struct mbuf *, struct ether_extracted *); > > you can add a depth or flags argument if you want to be able to > tell it to return before looking for the tcp/udp headers if you > want. OK? Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 24 Jan 2023 13:34:17 - @@ -2477,25 +2477,16 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_extracted ext; int offload = 0; uint32_t iphlen; - uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + ether_extract_headers(mp, ); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; + *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT); - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); - - iphlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + if (ext.ip4) { + iphlen = ext.ip4->ip_hl << 2; if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; @@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - - iphlen = sizeof(*ip6); - ipproto = ip6->ip6_nxt; + } else if (ext.ip6) { + iphlen = sizeof(*ext.ip6); *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } *vlan_macip_lens |= iphlen;
ifconfig(8): fix output of missing ipv6 addresses
Hi, ifconfig doesn't print ipv6 addresses if its used with media option. # ifconfig -A vio0: flags=8843 mtu 1500 ... inet 10.0.1.65 netmask 0xff00 broadcast 10.0.1.255 inet6 fe80::5054:ff:fe6a:b6fd%vio0 prefixlen 64 scopeid 0x1 inet6 fc00:1::1 prefixlen 64 inet 192.168.0.1 netmask 0xff00 broadcast 192.168.0.255 # ifconfig -A media vio0: flags=8843 mtu 1500 ... supported media: media autoselect inet 10.0.1.65 netmask 0xff00 broadcast 10.0.1.255 inet 192.168.0.1 netmask 0xff00 broadcast 192.168.0.255 As the diff below shows, afp is NULL by default, but set to inet if there is an additional program parameter. At the end, no specific address family is assumed if afp is NULL. Thus, the diff below introduces a new variable to remember if a specific address family was set by the user or not for printing all interface addresses. The regression test of ifconfig(8) is passing with the diff below. ok? bye, Jan Index: ifconfig.c === RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v retrieving revision 1.461 diff -u -p -r1.461 ifconfig.c --- ifconfig.c 18 Jan 2023 21:57:10 - 1.461 +++ ifconfig.c 23 Jan 2023 12:11:30 - @@ -746,6 +746,7 @@ const struct afswtch { }; const struct afswtch *afp; /*the address family being set or asked about*/ +const struct afswtch *pafp;/*the address family being used for printing*/ char joinname[IEEE80211_NWID_LEN]; size_t joinlen; @@ -840,7 +841,7 @@ main(int argc, char *argv[]) if (argc > 0) { for (afp = rafp = afs; rafp->af_name; rafp++) if (strcmp(rafp->af_name, *argv) == 0) { - afp = rafp; + pafp = afp = rafp; argc--; argv++; break; @@ -1216,7 +1217,7 @@ printif(char *name, int ifaliases) (ifa->ifa_addr->sa_family == AF_INET && ifaliases == 0 && noinet == 0)) continue; - if ((p = afp) != NULL) { + if ((p = pafp) != NULL) { if (ifa->ifa_addr->sa_family == p->af_af) p->af_status(1); } else { @@ -3514,7 +3515,7 @@ status(int link, struct sockaddr_dl *sdl proto_status: if (link == 0) { - if ((p = afp) != NULL) { + if ((p = pafp) != NULL) { p->af_status(1); } else for (p = afs; p->af_name; p++) { ifr.ifr_addr.sa_family = p->af_af;
Re: refactor mbuf parsing on driver level
On Tue, Jan 24, 2023 at 05:40:55PM +0300, Vitaliy Makkoveev wrote: > On Tue, Jan 24, 2023 at 03:14:36PM +0100, Jan Klemkow wrote: > > On Tue, Jan 24, 2023 at 09:32:53PM +1000, David Gwynne wrote: > > > On Mon, Jan 23, 2023 at 09:25:34AM +0100, Jan Klemkow wrote: > > > > On Wed, Jan 18, 2023 at 03:49:25PM -0700, Theo de Raadt wrote: > > > > > Jan Klemkow wrote: > > > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote: > > > > > > > On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote: > > > > > > > > we have several drivers which have to parse the content of > > > > > > > > mbufs. This > > > > > > > > diff suggest a central parsing function for this. Thus, we can > > > > > > > > reduce > > > > > > > > redundant code. > > > > > > > > > > > > > > > > I just start with ix(4) and ixl(4) because it was easy to test > > > > > > > > for me. > > > > > > > > But, this could also improve em(4), igc(4), ale(4) and oce(4). > > > > > > > > > > > > > > > > I'm not sure about the name, the api nor the place of this > > > > > > > > code. So, if > > > > > > > > someone has a better idea: i'm open to anything. > > > > > > > > > > > > > > I like code this deduplication. > > > > > > > > > > > > > > This newly introduced function doesn't touch ifnet but only > > > > > > > extracts > > > > > > > protocol headers from mbuf(9). I guess mbuf_extract_headers() or > > > > > > > something like is much better for name with the ern/uipc_mbuf2.c > > > > > > > as > > > > > > > place. > > > > > > > > > > > > Good Point. Updates diff below. > > > > > > > > > > I agree, "extract" is a better name. dlg, do you have a comment? > > > > > > > > Whats you opinion about this diff? > > > > > > it makes ix and ixl prettier, so that's a good enough reason to do > > > it. it should go in net/if_ethersubr.c as ether_extract_headers() > > > though. > > > > > > could you try using a struct to carry the header pointers around and see > > > what that looks like? > > > > > > struct ether_extracted { > > > struct ether_header *eh; > > > struct ip *ip4; > > > struct ip6_hdr *ip6; > > > struct tcphdr *tcp; > > > struct udphdr *udp; > > > }; > > > > > > void ether_extract_headers(struct mbuf *, struct ether_extracted *); > > > > > > you can add a depth or flags argument if you want to be able to > > > tell it to return before looking for the tcp/udp headers if you > > > want. > > Looks better then m_extract_headers(). Since ext->eh is always assigned > to non NULL value below, the "ext->eh = NULL;" is not necessary. Also > I'm not sure, but is memset() more reliable for `ext' zeroing? Anyway, > feel free to commit without memset(). OK? Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 24 Jan 2023 13:34:17 - @@ -2477,25 +2477,16 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_extracted ext; int offload = 0; uint32_t iphlen; - uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + ether_extract_headers(mp, ); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; + *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT); - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); - - iphlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + if (ext.ip4) { + ip
Re: refactor mbuf parsing on driver level
On Thu, Jan 26, 2023 at 02:06:28PM +0300, Vitaliy Makkoveev wrote: > On Thu, Jan 26, 2023 at 11:37:51AM +0100, Christian Weisgerber wrote: > > Jan Klemkow: > > > > > we have several drivers which have to parse the content of mbufs. This > > > diff suggest a central parsing function for this. Thus, we can reduce > > > redundant code. > > > > > > I just start with ix(4) and ixl(4) because it was easy to test for me. > > > But, this could also improve em(4), igc(4), ale(4) and oce(4). > > > > Here's the corresponding change for em(4). > > This only affects 82575, 82576, i350, and i210. > > Tested on i210. > > > > ok? > > > > The ether_extract_headers() diff was reverted, because is wrong for the > cases other than tcp/udp/icmp. We need to fix it and recommit again > before continue. I'm already on the way, to fix this mess. I'll send a new diff soon. Sorry this inconvenience, Jan
Re: refactor mbuf parsing on driver level
On Fri, Jan 27, 2023 at 04:44:36PM +0100, Christian Weisgerber wrote: > > The ether_extract_headers() diff was reverted, because is wrong for the > > cases other than tcp/udp/icmp. We need to fix it and recommit again > > before continue. > > I think (TCP or) UDP fragments are the problem. Fragments don't have > the protocol header but will still end up here: > > case IPPROTO_UDP: > m = m_getptr(m, hoff + hlen, ); > KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ext->udp)); > ext->udp = (struct udphdr *)(mtod(m, caddr_t) + hoff); > break; > > If a tail fragment is too short, it will trigger the KASSERT(). > > Previously, this wasn't a problem, because if there was such a > KASSERT() as in ixl(4), it was behind a M_*_CSUM_OUT check, and we > never set those flags for fragments. I changed the diff below to be more robust and reconstruct my test equipment to build permanently over NFS. - I turned the KASSERTS to returns. - Check if the mbuf is large enough for an ether header. - additionally #ifdef'd INET6 around the ip6_hdr in the new struct Tested the diff on NFS client and server with several kernel builds. ok? Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.191 diff -u -p -r1.191 if_ix.c --- dev/pci/if_ix.c 26 Jan 2023 07:32:39 - 1.191 +++ dev/pci/if_ix.c 27 Jan 2023 13:37:13 - @@ -2477,25 +2477,16 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_extracted ext; int offload = 0; uint32_t iphlen; - uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + ether_extract_headers(mp, ); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; + *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT); - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); - - iphlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + if (ext.ip4) { + iphlen = ext.ip4->ip_hl << 2; if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; @@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - - iphlen = sizeof(*ip6); - ipproto = ip6->ip6_nxt; + } else if (ext.ip6) { + iphlen = sizeof(*ext.ip6); *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } *vlan_macip_lens |= iphlen; - switch (ipproto) { - case IPPROTO_TCP: + if (ext.tcp) { *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; offload = 1; } - break; - case IPPROTO_UDP: + } else if (ext.udp) { *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; offload = 1; } - break; } return offload; Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.86 diff -u -p -r1.86 if_ixl.c --- dev/pci/if_ixl.c26 Jan 2023 07:32:39 - 1.86 +++ dev/pci/if_ixl.c27 Jan 2023 13:37:13 - @@ -2784,10 +2784,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; + struct ether_extracted ext; uint64_t hlen; - uint8_t ipproto; uint64_t offload = 0; if (ISSET(m0->m_flags, M_VLANTAG)) { @@ -2800,39 +2798,21 @@ ixl_tx_setup_offload(struct mbuf *m0) M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
Re: mem.4: be more accurate about securelevel
On Tue, Jan 17, 2023 at 11:02:07PM +0100, Theo Buehler wrote: > > at least this tool works for me: > > Surely you have kern.allowkmem=1 set. This diff should phrase it correctly. ok? Thanks, Jan Index: man4.alpha/mem.4 === RCS file: /cvs/src/share/man/man4/man4.alpha/mem.4,v retrieving revision 1.6 diff -u -p -r1.6 mem.4 --- man4.alpha/mem.412 Jan 2018 04:36:44 - 1.6 +++ man4.alpha/mem.418 Jan 2023 19:25:27 - @@ -63,11 +63,12 @@ kernel virtual memory begins at .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width /dev/kmem -compact .It /dev/mem Index: man4.amd64/mem.4 === RCS file: /cvs/src/share/man/man4/man4.amd64/mem.4,v retrieving revision 1.6 diff -u -p -r1.6 mem.4 --- man4.amd64/mem.412 Jan 2018 04:36:44 - 1.6 +++ man4.amd64/mem.418 Jan 2023 19:26:59 - @@ -64,11 +64,12 @@ The kernel virtual memory begins at addr .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width Pa -compact .It Pa /dev/mem Index: man4.hppa/mem.4 === RCS file: /cvs/src/share/man/man4/man4.hppa/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.hppa/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.hppa/mem.4 18 Jan 2023 19:29:07 - @@ -52,11 +52,12 @@ address 0; kernel virtual memory begins .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width /dev/kmem -compact .It Pa /dev/mem Index: man4.i386/mem.4 === RCS file: /cvs/src/share/man/man4/man4.i386/mem.4,v retrieving revision 1.12 diff -u -p -r1.12 mem.4 --- man4.i386/mem.4 12 Jan 2018 04:36:44 - 1.12 +++ man4.i386/mem.4 18 Jan 2023 19:30:18 - @@ -64,11 +64,12 @@ long, and ends at virtual address .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width Pa -compact .It Pa /dev/mem Index: man4.landisk/mem.4 === RCS file: /cvs/src/share/man/man4/man4.landisk/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.landisk/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.landisk/mem.4 18 Jan 2023 19:31:28 - @@ -59,11 +59,12 @@ The kernel virtual memory begins at addr .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width Pa -compact .It Pa /dev/mem Index: man4.loongson/mem.4 === RCS file: /cvs/src/share/man/man4/man4.loongson/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.loongson/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.loongson/mem.4 18 Jan 2023 19:32:44 - @@ -89,11 +89,12 @@ The kernel virtual memory begins at addr .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl -tag -width Pa -compact .It Pa /dev/mem Index: man4.luna88k/mem.4 === RCS file: /cvs/src/share/man/man4/man4.luna88k/mem.4,v retrieving revision 1.4 diff -u -p -r1.4 mem.4 --- man4.luna88k/mem.4 12 Jan 2018 04:36:44 - 1.4 +++ man4.luna88k/mem.4 18 Jan 2023 19:33:50 - @@ -63,11 +63,12 @@ kernel virtual memory begins at .Pp Even with sufficient file system permissions, these devices can only be opened when the -.Xr securelevel 7 -is insecure or when the .Va kern.allowkmem .Xr sysctl 2 variable is set. +Also the +.Xr securelevel 7 +insecure is needed, to open the device writable. .Sh FILES .Bl
Re: refactor mbuf parsing on driver level
On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote: > On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote: > > we have several drivers which have to parse the content of mbufs. This > > diff suggest a central parsing function for this. Thus, we can reduce > > redundant code. > > > > I just start with ix(4) and ixl(4) because it was easy to test for me. > > But, this could also improve em(4), igc(4), ale(4) and oce(4). > > > > I'm not sure about the name, the api nor the place of this code. So, if > > someone has a better idea: i'm open to anything. > > I like code this deduplication. > > This newly introduced function doesn't touch ifnet but only extracts > protocol headers from mbuf(9). I guess mbuf_extract_headers() or > something like is much better for name with the ern/uipc_mbuf2.c as > place. Good Point. Updates diff below. Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.189 diff -u -p -r1.189 if_ix.c --- dev/pci/if_ix.c 2 Sep 2022 14:08:09 - 1.189 +++ dev/pci/if_ix.c 18 Jan 2023 21:06:58 - @@ -2477,23 +2477,18 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_header *eh = NULL; + struct ip *ip = NULL; + struct ip6_hdr *ip6 = NULL; int offload = 0; uint32_t iphlen; uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + m_exract_headers(mp, , , , NULL, NULL); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + if (ip) { iphlen = ip->ip_hl << 2; ipproto = ip->ip_p; @@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - + } else if (ip6) { iphlen = sizeof(*ip6); ipproto = ip6->ip6_nxt; *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.84 diff -u -p -r1.84 if_ixl.c --- dev/pci/if_ixl.c5 Aug 2022 13:57:16 - 1.84 +++ dev/pci/if_ixl.c18 Jan 2023 20:47:01 - @@ -2784,12 +2784,15 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; + struct ether_header *eh = NULL; + struct ip *ip = NULL; + struct ip6_hdr *ip6 = NULL; + struct tcphdr *th = NULL; uint64_t hlen; uint8_t ipproto; uint64_t offload = 0; + if (ISSET(m0->m_flags, M_VLANTAG)) { uint64_t vtag = m0->m_pkthdr.ether_vtag; offload |= IXL_TX_DESC_CMD_IL2TAG1; @@ -2800,39 +2803,23 @@ ixl_tx_setup_offload(struct mbuf *m0) M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) return (offload); - switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + m_exract_headers(m0, , , , , NULL); + if (ip) { offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? IXL_TX_DESC_CMD_IIPT_IPV4_CSUM : IXL_TX_DESC_CMD_IIPT_IPV4; hlen = ip->ip_hl << 2; ipproto = ip->ip_p; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL &&
libcrypto: Fix EINVAL in openssl/tls_init
Hi, after tls_init() and OPENSSL_init_ssl() errno is always set to EINVAL. This is caused by a routine that tries to prefetch all error strings up to 127 from strerror(3). But, strerror(3) sets EINVAL for unknown values of error. Thus, I would suggest to set this constant to ELAST. So, we will avoid useless unknown error strings and a non-zero errno after tls_init(). I guess this is not serious enough for the current release. But, we might fix this after unlocking of the tree? ok? bye, Jan Index: lib/libcrypto//err/err.c === RCS file: /cvs/src/lib/libcrypto/err/err.c,v retrieving revision 1.50 diff -u -p -r1.50 err.c --- lib/libcrypto//err/err.c26 Dec 2022 07:18:52 - 1.50 +++ lib/libcrypto//err/err.c24 Mar 2023 20:07:18 - @@ -560,7 +560,7 @@ int_err_get_next_lib(void) #ifndef OPENSSL_NO_ERR -#define NUM_SYS_STR_REASONS 127 +#define NUM_SYS_STR_REASONS ELAST #define LEN_SYS_STR_REASON 32 static ERR_STRING_DATA SYS_str_reasons[NUM_SYS_STR_REASONS + 1];
Re: em(4) multiqueue
On Fri, Apr 14, 2023 at 10:26:14AM +0800, Kevin Lo wrote: > On Thu, Apr 13, 2023 at 01:30:36PM -0500, Brian Conway wrote: > > Reviving this thread, apologies for discontinuity in mail readers: > > https://marc.info/?t=16564219358 > > > > After rebasing on 7.3, my results have mirrored Hrvoje's testing at > > the end of that thread. No issues with throughput, unusual latency, > > or reliability. `vmstat -i` shows some level of balancing between > > the queues. I've been testing on as many em(4) systems as I have > > access to, some manually, some in a packet forwarder/firewall > > scenarios: > > Last time I tested (about a year go) on I211, rx locked up if I tried > something > like iperf3 or tcpbench. Don't know if you have a similar problem. I rebased the rest to current and tested it with tcpbench between the following interfaces: em0 at pci7 dev 0 function 0 "Intel 82580" rev 0x01, msix, 4 queues, address 90:e2:ba:df:d5:2c em0 at pci5 dev 0 function 0 "Intel I350" rev 0x01, msix, 8 queues, address 00:25:90:eb:b3:c2 After a second the connection stucked. As far as I can see, the sending side got a problem. ot45# tcpbench 192.168.99.3 elapsed_ms bytes mbps bwidth 1012 14574120 115.210 100.00% Conn: 1 Mbps: 115.210 Peak Mbps: 115.210 Avg Mbps: 115.210 2022 00.000-nan% ... ot46# tcpbench -s elapsed_ms bytes mbps bwidth 1017 14313480 112.594 100.00% Conn: 1 Mbps: 112.594 Peak Mbps: 112.594 Avg Mbps: 112.594 2027 00.000-nan% ... ot45# netstat -nf inet -p tcp Active Internet connections Proto Recv-Q Send-Q Local Address Foreign AddressTCP-State tcp 0 260640 192.168.99.1.18530 192.168.99.3.12345 CLOSING When I retried it, it sometimes work and most times not. kstat tells me, that transmit queues 1 to 3 are oactive and just 0 works: em0:0:txq:0 packets: 4042648 packets bytes: 5310138322 bytes qdrops: 9 packets errors: 0 packets qlen: 0 packets maxqlen: 511 packets oactive: false em0:0:txq:1 packets: 9812 packets bytes: 14846716 bytes qdrops: 0 packets errors: 0 packets qlen: 184 packets maxqlen: 511 packets oactive: true em0:0:txq:2 packets: 690362 packets bytes: 60011484 bytes qdrops: 0 packets errors: 0 packets qlen: 185 packets maxqlen: 511 packets oactive: true em0:0:txq:3 packets: 443181 packets bytes: 43829886 bytes qdrops: 0 packets errors: 0 packets qlen: 198 packets maxqlen: 511 packets oactive: true This is the rebased diff on current i tested: Index: dev/pci/files.pci === RCS file: /cvs/src/sys/dev/pci/files.pci,v retrieving revision 1.361 diff -u -p -r1.361 files.pci --- dev/pci/files.pci 23 Apr 2023 00:20:26 - 1.361 +++ dev/pci/files.pci 25 Apr 2023 11:25:47 - @@ -334,7 +334,7 @@ attach fxp at pci with fxp_pci file dev/pci/if_fxp_pci.cfxp_pci # Intel Pro/1000 -device em: ether, ifnet, ifmedia +device em: ether, ifnet, ifmedia, intrmap, stoeplitz attach em at pci file dev/pci/if_em.c em file dev/pci/if_em_hw.c em Index: dev/pci/if_em.c === RCS file: /cvs/src/sys/dev/pci/if_em.c,v retrieving revision 1.365 diff -u -p -r1.365 if_em.c --- dev/pci/if_em.c 9 Feb 2023 21:21:27 - 1.365 +++ dev/pci/if_em.c 25 Apr 2023 11:25:47 - @@ -247,6 +247,7 @@ int em_intr(void *); int em_allocate_legacy(struct em_softc *); void em_start(struct ifqueue *); int em_ioctl(struct ifnet *, u_long, caddr_t); +int em_rxrinfo(struct em_softc *, struct if_rxrinfo *); void em_watchdog(struct ifnet *); void em_init(void *); void em_stop(void *, int); @@ -309,8 +310,10 @@ int em_setup_queues_msix(struct em_soft int em_queue_intr_msix(void *); int em_link_intr_msix(void *); void em_enable_queue_intr_msix(struct em_queue *); +void em_setup_rss(struct em_softc *); #else #define em_allocate_msix(_sc) (-1) +#define em_setup_rss(_sc) 0 #endif #if NKSTAT > 0 @@ -333,7 +336,6 @@ struct cfdriver em_cd = { }; static int em_smart_pwr_down = FALSE; -int em_enable_msix = 0; /* * Device identification routine @@ -629,12 +631,12 @@ err_pci: void em_start(struct ifqueue *ifq) { + struct em_queue *que = ifq->ifq_softc; struct ifnet *ifp = ifq->ifq_if; struct em_softc *sc = ifp->if_softc; u_int head, free, used; struct mbuf *m; int post = 0; - struct
Re: libcrypto: Fix EINVAL in openssl/tls_init
On Fri, Mar 24, 2023 at 10:02:05PM +0100, Theo Buehler wrote: > > Thus, I would suggest to set this constant to ELAST. So, we will avoid > > useless unknown error strings and a non-zero errno after tls_init(). > > ELAST isn't portable. It's under __BSD_VISIBLE in sys/errno.h. > > It would seem better to use the save_errno idiom to store the errno > at the start of the loop and restore it at the end. > > And yes, we should fix this, after unluck. ok? Thanks, Jan Index: err/err.c === RCS file: /cvs/src/lib/libcrypto/err/err.c,v retrieving revision 1.50 diff -u -p -r1.50 err.c --- err/err.c 26 Dec 2022 07:18:52 - 1.50 +++ err/err.c 27 Mar 2023 07:58:25 - @@ -580,6 +580,7 @@ build_SYS_str_reasons(void) static char strerror_tab[NUM_SYS_STR_REASONS][LEN_SYS_STR_REASON]; int i; static int init = 1; + int save_errno = errno; CRYPTO_r_lock(CRYPTO_LOCK_ERR); if (!init) { @@ -594,6 +595,8 @@ build_SYS_str_reasons(void) return; } + /* strerror(3) will set errno to EINVAL when i is an unknown error. */ + save_errno = errno; for (i = 1; i <= NUM_SYS_STR_REASONS; i++) { ERR_STRING_DATA *str = _str_reasons[i - 1]; @@ -610,6 +613,7 @@ build_SYS_str_reasons(void) if (str->string == NULL) str->string = "unknown"; } + errno = save_errno; /* Now we still have SYS_str_reasons[NUM_SYS_STR_REASONS] = {0, NULL}, * as required by ERR_load_strings. */
Re: refactor mbuf parsing on driver level
On Mon, Feb 06, 2023 at 09:47:57PM +0100, Christian Weisgerber wrote: > Christian Weisgerber: > > > I also switched over em(4) to this and have successfully used it > > for a full 30-hour package build on the four amd64 ports machines > > with their I350 interfaces. Additionally, I've done some IPv6 > > testing at home over an I210. > > ok for this? I tested it with I350. Diff look fine. ok jan@ > igc(4) has very similar code, but I don't have access to a machine > with that hardware. Send me an ssh-key and I give you access to this machine: http://obsd-lab.genua.de/hw/ot34.html Thanks, Jan > > diff f8646d27d4041e5f595c04e17a876f12600deea7 > > f3f95d0cc0957a2f1e961cace4c3c9dd869e8c9e > > commit - f8646d27d4041e5f595c04e17a876f12600deea7 > > commit + f3f95d0cc0957a2f1e961cace4c3c9dd869e8c9e > > blob - c840377f0a3f1ef3c3e3072657698d8085ffd3a0 > > blob + 523ed5b0a18718c50bb30e2995d293fa1d2199a6 > > --- sys/dev/pci/if_em.c > > +++ sys/dev/pci/if_em.c > > @@ -2398,12 +2398,11 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf > > *mp, > > em_tx_ctx_setup(struct em_queue *que, struct mbuf *mp, u_int head, > > u_int32_t *olinfo_status, u_int32_t *cmd_type_len) > > { > > + struct ether_extracted ext; > > struct e1000_adv_tx_context_desc *TD; > > - struct ether_header *eh = mtod(mp, struct ether_header *); > > - struct mbuf *m; > > uint32_t vlan_macip_lens = 0, type_tucmd_mlhl = 0, mss_l4len_idx = 0; > > - int off = 0, hoff; > > - uint8_t ipproto, iphlen; > > + int off = 0; > > + uint8_t iphlen; > > > > *olinfo_status = 0; > > *cmd_type_len = 0; > > @@ -2418,44 +2417,26 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf > > *mp, > > } > > #endif > > > > - vlan_macip_lens |= (sizeof(*eh) << E1000_ADVTXD_MACLEN_SHIFT); > > - > > - switch (ntohs(eh->ether_type)) { > > - case ETHERTYPE_IP: { > > - struct ip *ip; > > + ether_extract_headers(mp, ); > > > > - m = m_getptr(mp, sizeof(*eh), ); > > - ip = (struct ip *)(mtod(m, caddr_t) + hoff); > > + vlan_macip_lens |= (sizeof(*ext.eh) << E1000_ADVTXD_MACLEN_SHIFT); > > > > - iphlen = ip->ip_hl << 2; > > - ipproto = ip->ip_p; > > + if (ext.ip4) { > > + iphlen = ext.ip4->ip_hl << 2; > > > > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4; > > if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) { > > *olinfo_status |= E1000_TXD_POPTS_IXSM << 8; > > off = 1; > > } > > - > > - break; > > - } > > #ifdef INET6 > > - case ETHERTYPE_IPV6: { > > - struct ip6_hdr *ip6; > > + } else if (ext.ip6) { > > + iphlen = sizeof(*ext.ip6); > > > > - m = m_getptr(mp, sizeof(*eh), ); > > - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); > > - > > - iphlen = sizeof(*ip6); > > - ipproto = ip6->ip6_nxt; > > - > > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6; > > - break; > > - } > > #endif > > - default: > > + } else { > > iphlen = 0; > > - ipproto = 0; > > - break; > > } > > > > *cmd_type_len |= E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS; > > @@ -2464,21 +2445,18 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf > > *mp, > > vlan_macip_lens |= iphlen; > > type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT; > > > > - switch (ipproto) { > > - case IPPROTO_TCP: > > + if (ext.tcp) { > > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP; > > if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) { > > *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; > > off = 1; > > } > > - break; > > - case IPPROTO_UDP: > > + } else if (ext.udp) { > > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP; > > if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) { > > *olinfo_status |= E1000_TXD_POPTS_TXSM << 8; > > off = 1; > > } > > - break; > > } > > > > if (!off) > > > > -- > Christian "naddy" Weisgerber na...@mips.inka.de >
Re: refactor mbuf parsing on driver level
On Tue, Jan 31, 2023 at 09:12:51PM +0100, Christian Weisgerber wrote: > Jan Klemkow: > > > - I turned the KASSERTS to returns. > > - Check if the mbuf is large enough for an ether header. > > - additionally #ifdef'd INET6 around the ip6_hdr in the new struct > > For non-initial fragments of TCP/UDP packets, ether_extract_headers() > will create ext.tcp/ext.udp pointers that do not point to a protocol > header. Should there be a check to exclude fragments? yes. bluhm also suggested this solution to me. ok? Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.191 diff -u -p -r1.191 if_ix.c --- dev/pci/if_ix.c 26 Jan 2023 07:32:39 - 1.191 +++ dev/pci/if_ix.c 31 Jan 2023 21:05:40 - @@ -2477,25 +2477,16 @@ static inline int ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens, uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status) { - struct ether_header *eh = mtod(mp, struct ether_header *); - struct mbuf *m; - int hoff; + struct ether_extracted ext; int offload = 0; uint32_t iphlen; - uint8_t ipproto; - *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT); + ether_extract_headers(mp, ); - switch (ntohs(eh->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; + *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT); - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); - - iphlen = ip->ip_hl << 2; - ipproto = ip->ip_p; + if (ext.ip4) { + iphlen = ext.ip4->ip_hl << 2; if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8; @@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint } *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4; - break; - } - #ifdef INET6 - case ETHERTYPE_IPV6: { - struct ip6_hdr *ip6; - - m = m_getptr(mp, sizeof(*eh), ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6)); - ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); - - iphlen = sizeof(*ip6); - ipproto = ip6->ip6_nxt; + } else if (ext.ip6) { + iphlen = sizeof(*ext.ip6); *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6; - break; - } #endif - - default: + } else { return offload; } *vlan_macip_lens |= iphlen; - switch (ipproto) { - case IPPROTO_TCP: + if (ext.tcp) { *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP; if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; offload = 1; } - break; - case IPPROTO_UDP: + } else if (ext.udp) { *type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP; if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) { *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; offload = 1; } - break; } return offload; Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.86 diff -u -p -r1.86 if_ixl.c --- dev/pci/if_ixl.c26 Jan 2023 07:32:39 - 1.86 +++ dev/pci/if_ixl.c31 Jan 2023 21:05:40 - @@ -2784,10 +2784,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm static uint64_t ixl_tx_setup_offload(struct mbuf *m0) { - struct mbuf *m; - int hoff; + struct ether_extracted ext; uint64_t hlen; - uint8_t ipproto; uint64_t offload = 0; if (ISSET(m0->m_flags, M_VLANTAG)) { @@ -2800,39 +2798,21 @@ ixl_tx_setup_offload(struct mbuf *m0) M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) return (offload); - switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) { - case ETHERTYPE_IP: { - struct ip *ip; - - m = m_getptr(m0, ETHER_HDR_LEN, ); - KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip)); - ip = (struct ip *)(mtod(m, caddr_t) + hoff); + ether_extract_headers(m0, ); + if (ext.ip4) { offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ? IXL_TX_DESC_CMD_IIPT_IPV4_
ix(4): allocate less memory for tx buffers
Hi, TSO packets are limited to MAXMCLBYTES (64k). Thus, we don't need to allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers. This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports into one machine. Otherwise I run out of devbuf in malloc(9). ok? bye, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.197 diff -u -p -r1.197 if_ix.c --- dev/pci/if_ix.c 1 Jun 2023 09:05:33 - 1.197 +++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 - @@ -37,6 +37,12 @@ #include #include +/* + * Our TCP/IP Stack could not handle packets greater then MAXMCLBYTES. + * This interface could not handle packets greater then IXGBE_TSO_SIZE. + */ +CTASSERT(MAXMCLBYTES < IXGBE_TSO_SIZE); + /* * Driver version */ @@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t /* Create the descriptor buffer dma maps */ for (i = 0; i < sc->num_tx_desc; i++) { txbuf = >tx_buffers[i]; - error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE, + error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES, sc->num_segs, PAGE_SIZE, 0, BUS_DMA_NOWAIT, >map);
Re: tcp lro tso path mtu
On Thu, Jul 06, 2023 at 10:19:21PM +0300, Alexander Bluhm wrote: > On Thu, Jul 06, 2023 at 08:49:03PM +0200, Jan Klemkow wrote: > > > @@ -109,6 +109,9 @@ > > > #include > > > #include > > > #include > > > > I think is a merge bug, isn't it? > > > > > +#include > > > +#include > > > +#include > > Right. > > > > + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu); > > > + if (error || *mp == NULL) > > > + return error; > > > + > > > + if ((*mp)->m_pkthdr.len <= mtu) { > > > > I may miss something but... > > > > Couldn't you move the *_cksum_out() calls above the upper > > tcp_if_output_tso() call? And than remove the *_cksum_out() calls > > inside of tcp_if_output_tso()? > > > > Thus, there is just one place where we call them. > > > > > + switch (dst->sa_family) { > > > + case AF_INET: > > > + in_hdr_cksum_out(*mp, ifp); > > > + in_proto_cksum_out(*mp, ifp); > > > + break; > > > +#ifdef INET6 > > > + case AF_INET6: > > > + in6_proto_cksum_out(*mp, ifp); > > > + break; > > > +#endif > > There is the case in tcp_if_output_tso() where we call tcp_chopper(). > Then checksum has to be calcualted after chopping. If I do it > always before tcp_if_output_tso(), we may caluclate it twice. Once > for the large packet and once for the small ones. > > New diff without duplicate includes. tested with v4/v6, direct and forwarding. ok jan@ > Index: net/if.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v > retrieving revision 1.704 > diff -u -p -r1.704 if.c > --- net/if.c 6 Jul 2023 04:55:04 - 1.704 > +++ net/if.c 6 Jul 2023 19:15:00 - > @@ -886,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m > } > > int > +if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, > +struct rtentry *rt, u_int mtu) > +{ > + uint32_t ifcap; > + int error; > + > + switch (dst->sa_family) { > + case AF_INET: > + ifcap = IFCAP_TSOv4; > + break; > +#ifdef INET6 > + case AF_INET6: > + ifcap = IFCAP_TSOv6; > + break; > +#endif > + default: > + unhandled_af(dst->sa_family); > + } > + > + /* > + * Try to send with TSO first. When forwarding LRO may set > + * maximium segment size in mbuf header. Chop TCP segment > + * even if it would fit interface MTU to preserve maximum > + * path MTU. > + */ > + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu); > + if (error || *mp == NULL) > + return error; > + > + if ((*mp)->m_pkthdr.len <= mtu) { > + switch (dst->sa_family) { > + case AF_INET: > + in_hdr_cksum_out(*mp, ifp); > + in_proto_cksum_out(*mp, ifp); > + break; > +#ifdef INET6 > + case AF_INET6: > + in6_proto_cksum_out(*mp, ifp); > + break; > +#endif > + } > + error = ifp->if_output(ifp, *mp, dst, rt); > + *mp = NULL; > + return error; > + } > + > + /* mp still contains mbuf that has to be fragmented or dropped. */ > + return 0; > +} > + > +int > if_output_mq(struct ifnet *ifp, struct mbuf_queue *mq, unsigned int *total, > struct sockaddr *dst, struct rtentry *rt) > { > Index: net/if_var.h > === > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v > retrieving revision 1.128 > diff -u -p -r1.128 if_var.h > --- net/if_var.h 28 Jun 2023 11:49:49 - 1.128 > +++ net/if_var.h 6 Jul 2023 19:12:39 - > @@ -329,6 +329,8 @@ int if_output_ml(struct ifnet *, struct > struct sockaddr *, struct rtentry *); > int if_output_mq(struct ifnet *, struct mbuf_queue *, unsigned int *, > struct sockaddr *, struct rtentry *); > +int if_output_tso(struct ifnet *, struct mbuf **, struct sockaddr *, > + struct rtentry *, u_int); > int if_output_local(struct ifnet *, struct mbuf *, sa_family_t); > void if_rtrequest_dummy(struct ifnet *, int, struct rtentry *); > void p2p_rtrequest(struct ifnet *, int, struct rtentry *); > Index: net/pf.c > ==
Re: tcp lro tso path mtu
On Mon, Jul 03, 2023 at 08:04:11PM +0300, Alexander Bluhm wrote: > As final step before making LRO (Large Receive Offload) the default, > we have to fix path MTU discovery when forwarding. > > The drivers, currently ix(4) and lo(4) only, record an upper bound > of the size of the original packets in ph_mss. When sending we > must chop the packets with TSO (TCP Segmentation Offload) to that > size. That means we have to call tcp_if_output_tso() before > ifp->if_output(). I have put that logic into if_output_tso() to > avoid code duplication. > > ok? I like the idea of this commit. Some comments below. Thanks, Jan > Index: net/if.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v > retrieving revision 1.702 > diff -u -p -r1.702 if.c > --- net/if.c 2 Jul 2023 19:59:15 - 1.702 > +++ net/if.c 3 Jul 2023 10:28:30 - > @@ -109,6 +109,9 @@ > #include > #include > #include I think is a merge bug, isn't it? > +#include > +#include > +#include > @@ -883,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m > ml_purge(ml); > > return error; > +} > + > +int > +if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, > +struct rtentry *rt, u_int mtu) > +{ > + uint32_t ifcap; > + int error; > + > + switch (dst->sa_family) { > + case AF_INET: > + ifcap = IFCAP_TSOv4; > + break; > +#ifdef INET6 > + case AF_INET6: > + ifcap = IFCAP_TSOv6; > + break; > +#endif > + default: > + unhandled_af(dst->sa_family); > + } > + > + /* > + * Try to send with TSO first. When forwarding LRO may set > + * maximium segment size in mbuf header. Chop TCP segment > + * even if it would fit interface MTU to preserve maximum > + * path MTU. > + */ > + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu); > + if (error || *mp == NULL) > + return error; > + > + if ((*mp)->m_pkthdr.len <= mtu) { I may miss something but... Couldn't you move the *_cksum_out() calls above the upper tcp_if_output_tso() call? And than remove the *_cksum_out() calls inside of tcp_if_output_tso()? Thus, there is just one place where we call them. > + switch (dst->sa_family) { > + case AF_INET: > + in_hdr_cksum_out(*mp, ifp); > + in_proto_cksum_out(*mp, ifp); > + break; > +#ifdef INET6 > + case AF_INET6: > + in6_proto_cksum_out(*mp, ifp); > + break; > +#endif > + } > + error = ifp->if_output(ifp, *mp, dst, rt); > + *mp = NULL; > + return error; > + } > + > + /* mp still contains mbuf that has to be fragmented or dropped. */ > + return 0; > }
ixl(4): protect admin queue with mutex
Hi, there is an issue with the admin queue of ixl(4) which leads into the following panic when the link state changes: uvm_fault(0x818005f8, 0x18, 0, 2) -> e kernel: page fault trap, code=0 Stopped at ixl_intr0+0xca: movq%rdx,0x18(%rax) TIDPIDUID PRFLAGS PFLAGS CPU COMMAND 392823 13219 00x100040 02 ifstated 444681 94950 90 0x1100010 06 ospf6d 428704 9496 90 0x1100010 09 ospf6d 106020 59273 85 0x1100010 01 ospfd 420435 72114 85 0x1100010 05 ospfd 295821 93368 73 0x1100010 03 syslogd 367116 56598 0 0x14000 0x2007 zerothread 275385 57815 0 0x14000 0x2004 softnet ixl_intr0(84509000) at ixl_intr0+0xca intr_handler(0,844b0b80) at intr_handler+0x5b Xintr_ioapic_edge25_untramp() at Xintr_ioapic_edge25_untramp+0x18f acpicpu_idle() at acpicpu_idle+0x1f6 sched_idle(0) at sched_idle+0x280 end trace frame: 0x0, count: 10 https://www.openbsd.org/ddb.html describes the minimum info required in bug reports. Insufficient info makes it difficult to find and fix bugs. ddb{0}> The queue is corrupted in a way, that slot->iaq_cookie is 0. Which causes the uvm fault when iatq is dereferenced. The following diff uses a mutex to protect the admin queue and avoids the issue above. ok? bye, Jan Index: dev/pci/if_ixl.c === RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v retrieving revision 1.87 diff -u -p -r1.87 if_ixl.c --- dev/pci/if_ixl.c6 Feb 2023 20:27:45 - 1.87 +++ dev/pci/if_ixl.c19 Jul 2023 07:05:40 - @@ -1274,6 +1274,7 @@ struct ixl_softc { unsigned int sc_atq_prod; unsigned int sc_atq_cons; + struct mutex sc_atq_mtx; struct ixl_dmamemsc_arq; struct task sc_arq_task; struct ixl_aq_bufs sc_arq_idle; @@ -1723,6 +1724,8 @@ ixl_attach(struct device *parent, struct /* initialise the adminq */ + mtx_init(>sc_atq_mtx, IPL_NET); + if (ixl_dmamem_alloc(sc, >sc_atq, sizeof(struct ixl_aq_desc) * IXL_AQ_NUM, IXL_AQ_ALIGN) != 0) { printf("\n" "%s: unable to allocate atq\n", DEVNAME(sc)); @@ -3599,6 +3602,8 @@ ixl_atq_post(struct ixl_softc *sc, struc struct ixl_aq_desc *atq, *slot; unsigned int prod; + mtx_enter(>sc_atq_mtx); + /* assert locked */ atq = IXL_DMA_KVA(>sc_atq); @@ -3618,6 +3623,8 @@ ixl_atq_post(struct ixl_softc *sc, struc prod &= IXL_AQ_MASK; sc->sc_atq_prod = prod; ixl_wr(sc, sc->sc_aq_regs->atq_tail, prod); + + mtx_leave(>sc_atq_mtx); } static void @@ -3628,11 +3635,15 @@ ixl_atq_done(struct ixl_softc *sc) unsigned int cons; unsigned int prod; + mtx_enter(>sc_atq_mtx); + prod = sc->sc_atq_prod; cons = sc->sc_atq_cons; - if (prod == cons) + if (prod == cons) { + mtx_leave(>sc_atq_mtx); return; + } atq = IXL_DMA_KVA(>sc_atq); @@ -3645,6 +3656,7 @@ ixl_atq_done(struct ixl_softc *sc) if (!ISSET(slot->iaq_flags, htole16(IXL_AQ_DD))) break; + KASSERT(slot->iaq_cookie != 0); iatq = (struct ixl_atq *)slot->iaq_cookie; iatq->iatq_desc = *slot; @@ -3661,6 +3673,8 @@ ixl_atq_done(struct ixl_softc *sc) BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); sc->sc_atq_cons = cons; + + mtx_leave(>sc_atq_mtx); } static void @@ -3691,6 +3705,8 @@ ixl_atq_poll(struct ixl_softc *sc, struc unsigned int prod; unsigned int t = 0; + mtx_enter(>sc_atq_mtx); + atq = IXL_DMA_KVA(>sc_atq); prod = sc->sc_atq_prod; slot = atq + prod; @@ -3712,8 +3728,10 @@ ixl_atq_poll(struct ixl_softc *sc, struc while (ixl_rd(sc, sc->sc_aq_regs->atq_head) != prod) { delaymsec(1); - if (t++ > tm) + if (t++ > tm) { + mtx_leave(>sc_atq_mtx); return (ETIMEDOUT); + } } bus_dmamap_sync(sc->sc_dmat, IXL_DMA_MAP(>sc_atq), @@ -3724,6 +3742,7 @@ ixl_atq_poll(struct ixl_softc *sc, struc sc->sc_atq_cons = prod; + mtx_leave(>sc_atq_mtx); return (0); }
Re: tcp lro by default, call for testing
On Sat, Jul 08, 2023 at 05:15:26PM +0300, Alexander Bluhm wrote: > I am not aware of any more limitations when enabling LRO for TCP > in the network drivers. The feature allows to receive agregated > packets larger than the MTU. Receiving TCP streams becomes much > faster. > > As the network hardware is not aware whether a packet is received > locally or forwarded, everything is aggregated. In case of forwarding > it is split on output to packets not larger than the original > packets. So path MTU discovery should still work. If the outgoing > interface supports TSO, the packet is chopped in hardware. > > Currently only ix(4) and lo(4) support LRO, and ix(4) is limited > to IPv4 and newer than the old 82598 model. If the interface is > added to a bridge(4) or aggr(4), LRO is automatically disabled. I guess you mean veb(4) not aggr(4). We just avoid the in heritage of the LRO capability in aggr(4) but are using the feature. > So in case you possess any ix(4) hardware or do funky pf routing > on lo(4) please run this diff. If you encounter problems, report > and turn LRO off per interface with ifconfig -tcplro. Diff looks fine to me. I just would keep mentioning the default behavior in the manpage like this: ok jan@ Index: sbin/ifconfig/ifconfig.8 === RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v retrieving revision 1.397 diff -u -p -r1.397 ifconfig.8 --- sbin/ifconfig/ifconfig.87 Jun 2023 18:42:40 - 1.397 +++ sbin/ifconfig/ifconfig.810 Jul 2023 11:54:47 - @@ -517,9 +517,9 @@ It is not possible to use LRO with inter or .Xr tpmr 4 . Changing this option will re-initialize the network interface. +LRO is enabled by default. .It Cm -tcplro Disable LRO. -LRO is disabled by default. .It Cm up Mark an interface .Dq up . > Index: sys/dev/pci/if_ix.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v > retrieving revision 1.198 > diff -u -p -r1.198 if_ix.c > --- sys/dev/pci/if_ix.c 8 Jul 2023 09:01:30 - 1.198 > +++ sys/dev/pci/if_ix.c 8 Jul 2023 13:51:26 - > @@ -1925,8 +1925,10 @@ ixgbe_setup_interface(struct ix_softc *s > ifp->if_capabilities |= IFCAP_CSUM_IPv4; > > ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6; > - if (sc->hw.mac.type != ixgbe_mac_82598EB) > + if (sc->hw.mac.type != ixgbe_mac_82598EB) { > + ifp->if_xflags |= IFXF_LRO; > ifp->if_capabilities |= IFCAP_LRO; > + } > > /* >* Specify the media types supported by this sc and register > Index: sys/net/if_loop.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v > retrieving revision 1.95 > diff -u -p -r1.95 if_loop.c > --- sys/net/if_loop.c 2 Jul 2023 19:59:15 - 1.95 > +++ sys/net/if_loop.c 8 Jul 2023 13:51:26 - > @@ -172,11 +172,11 @@ loop_clone_create(struct if_clone *ifc, > ifp->if_softc = NULL; > ifp->if_mtu = LOMTU; > ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; > - ifp->if_xflags = IFXF_CLONED; > + ifp->if_xflags = IFXF_CLONED | IFXF_LRO; > ifp->if_capabilities = IFCAP_CSUM_IPv4 | > IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | > IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 | > - IFCAP_LRO; > + IFCAP_LRO | IFCAP_TSOv4 | IFCAP_TSOv6; > ifp->if_rtrequest = lortrequest; > ifp->if_ioctl = loioctl; > ifp->if_input = loinput; > Index: sbin/ifconfig/ifconfig.8 > === > RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v > retrieving revision 1.397 > diff -u -p -r1.397 ifconfig.8 > --- sbin/ifconfig/ifconfig.8 7 Jun 2023 18:42:40 - 1.397 > +++ sbin/ifconfig/ifconfig.8 7 Jul 2023 19:57:09 - > @@ -519,7 +519,6 @@ or > Changing this option will re-initialize the network interface. > .It Cm -tcplro > Disable LRO. > -LRO is disabled by default. > .It Cm up > Mark an interface > .Dq up . >
Add ethernet type check in ifsetlro()
Hi, bluhm pointed out that the ether_brport_isset() check it just allowed on ethernet devices. Thus, I put an additional ethernet check in the condition. This also fixes EBUSY errors of "ifconfig lo0 tcplro" calls in my setup. ok? bye, Jan Index: net/if.c === RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.702 diff -u -p -r1.702 if.c --- net/if.c2 Jul 2023 19:59:15 - 1.702 +++ net/if.c3 Jul 2023 20:58:32 - @@ -3206,7 +3206,7 @@ ifsetlro(struct ifnet *ifp, int on) KERNEL_ASSERT_LOCKED(); /* for if_flags */ if (on && !ISSET(ifp->if_xflags, IFXF_LRO)) { - if (ether_brport_isset(ifp)) { + if (ifp->if_type == IFT_ETHER && ether_brport_isset(ifp)) { error = EBUSY; goto out; }
Re: Virtio fix for testing
On Wed, May 24, 2023 at 08:50:26PM +0200, Stefan Fritsch wrote: > I forgot to mention that no stress test is necessary. If it boots and the > virtio devices work at all, that should be enough. Works for me on Linux/KVM with the following devices: vga1 at pci0 dev 2 function 0 "Qumranet Virtio 1.x GPU" rev 0x01 virtio0 at pci0 dev 4 function 0 "Qumranet Virtio Storage" rev 0x00 virtio1 at pci0 dev 6 function 0 "Qumranet Virtio Console" rev 0x00 virtio2 at pci0 dev 7 function 0 "Qumranet Virtio Memory Balloon" rev 0x00 virtio3 at pci0 dev 8 function 0 "Qumranet Virtio Network" rev 0x00 and on OpenBSD/VMM with: virtio0 at pci0 dev 1 function 0 "Qumranet Virtio RNG" rev 0x00 virtio1 at pci0 dev 2 function 0 "Qumranet Virtio Network" rev 0x00 virtio2 at pci0 dev 3 function 0 "Qumranet Virtio Storage" rev 0x00 virtio3 at pci0 dev 4 function 0 "Qumranet Virtio SCSI" rev 0x00 Thanks, Jan
Re: ifconfig rename tcplro
On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote: > On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote: > > I would suggest to rename ifconfig tcprecvoffload to tcplro. Maybe > > it's just because I had to type that long name too often. > > > > With that we have consistent naming: > > # ifconfig ix0 tcplro > > # sysctl net.inet.tcp.tso=1 > > > > Also the coresponding flag are named LRO. > > # ifconfig ix1 hwfeatures > > ix1: flags=2008843 mtu 1500 > > > > hwfeatures=71b7 > > hardmtu 9198 > > > > The feature is quite new, so I have no backward compatiblity concerns. > > > > ok? > > Could you name it "lro" like FreeBSD uses? I also would prefer this one.
Re: ifconfig rename tcplro
On Tue, Jun 06, 2023 at 09:37:22AM -0700, Chris Cappuccio wrote: > Jan Klemkow [j.klem...@wemelug.de] wrote: > > On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote: > > > On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote: > > > > I would suggest to rename ifconfig tcprecvoffload to tcplro. Maybe > > > > it's just because I had to type that long name too often. > > > > > > > > With that we have consistent naming: > > > > # ifconfig ix0 tcplro > > > > # sysctl net.inet.tcp.tso=1 > > > > > > > > Also the coresponding flag are named LRO. > > > > # ifconfig ix1 hwfeatures > > > > ix1: flags=2008843 mtu 1500 > > > > > > > > hwfeatures=71b7 > > > > hardmtu 9198 > > > > > > > > The feature is quite new, so I have no backward compatiblity concerns. > > > > > > > > ok? > > > > > > Could you name it "lro" like FreeBSD uses? > > > > I also would prefer this one. > > and tcpsendoffload back to tso ? > > was the reason for changing it from tso due to the initial conflation > of TSO and LRO in the tree? Yes. At the start of this, I just want to keep it simple with one ifconfig option "tso". But, tso is now default in tcp_output() and can be controlled globally via sysctl(2) net.inet.tcp.tso. Thus, we just need to control LRO per interface.
Re: ifconfig rename tcplro
On Wed, Jun 07, 2023 at 02:49:07PM +0300, Vitaliy Makkoveev wrote: > On Wed, Jun 07, 2023 at 01:29:09PM +0200, Alexander Bluhm wrote: > > On Wed, Jun 07, 2023 at 12:59:11PM +0300, Vitaliy Makkoveev wrote: > > > On Wed, Jun 07, 2023 at 10:19:32AM +1000, David Gwynne wrote: > > > > > > > > > > > > > On 7 Jun 2023, at 06:33, Vitaliy Makkoveev wrote: > > > > > > > > > >> On 6 Jun 2023, at 20:29, Alexander Bluhm > > > > >> wrote: > > > > >> > > > > >> On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote: > > > > >>> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote: > > > > Hi, > > > > > > > > I would suggest to rename ifconfig tcprecvoffload to tcplro. Maybe > > > > it's just because I had to type that long name too often. > > > > > > > > With that we have consistent naming: > > > > # ifconfig ix0 tcplro > > > > # sysctl net.inet.tcp.tso=1 > > > > > > > > Also the coresponding flag are named LRO. > > > > # ifconfig ix1 hwfeatures > > > > ix1: flags=2008843 mtu > > > > 1500 > > > > > > > > hwfeatures=71b7 > > > > hardmtu 9198 > > > > > > > > The feature is quite new, so I have no backward compatiblity > > > > concerns. > > > > > > > > ok? > > > > > > > > >>> > > > > >>> Could you name it "lro" like FreeBSD uses? > > > > >> > > > > >> When I started with this, LRO and TSO were unknown to me. So with > > > > >> TCP prefix it may be clearer to users where the feature belongs. > > > > >> > > > > >> Naming is hard. > > > > > > > > > > Yeah, naming is definitely hard. I propose to use lro because it is > > > > > already used for the same purpose by FreeBSD, so the same name helps > > > > > to avoid confusion. > > > > > > > > > >lro If the driver supports tcp(4) large receive offloading, > > > > >enable LRO on the interface. > > > > > > > > > > Also, we have used "tso" keyword for tcp segmentation offloading for > > > > > the same reason, until it became global net.inet.tcp.tso. > > > > > > > > Is it going to be used to enable lro for udp and other protocols as > > > > well? > > > > > > Why not? We have tso feature system wide, so why don't have receive > > > offloading feature global for all supported protocols? Especially since > > > I suspect this control will be moved from ifconfig to global > > > net.inet.tcp.lro like net.inet.tcp.tso. > > > > Maybe we can make lro the default, and then move it to net.inet.tcp.lro. > > But I like to see another driver to implement it first. > > > > > However, I'm not the fan of original "tcprecvoffload" and like shorter > > > naming. > > > > Can we use ifconfig tcplro for now? > > + it only affects TCP > > + user see that it is related to TCP > > + it is not a 3 letter abrevation claudio does not like > > + it is shorter than tcprecvoffload > > > > cons > > - FreeBSD calls it lro > > > > Feel free to use tcplro. Do so. OK jan@
ix(4): LRO forwarding
Hi, This diff sets needed offloading flags and the calculated mss to LRO mbufs in ix(4). Thus, we can forward this packets and process them via tcp_if_output_tso(). This diff also uses tcp_if_output_tso() in ip6_forward(). I tested the ip6_forward path via the address family transition in pf: pass in inet from 192.168.1.1 to 192.168.13.2 af-to \ inet6 from fc00:13::1 to fc00:13::2 ok? bye, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.196 diff -u -p -r1.196 if_ix.c --- dev/pci/if_ix.c 23 May 2023 09:16:16 - 1.196 +++ dev/pci/if_ix.c 23 May 2023 11:02:52 - @@ -3257,13 +3257,38 @@ ixgbe_rxeof(struct rx_ring *rxr) if (sendmp->m_pkthdr.ph_mss > 0) { struct ether_extracted ext; + uint64_t hlen; uint16_t pkts = sendmp->m_pkthdr.ph_mss; + /* Calculate header size. */ ether_extract_headers(sendmp, ); - if (ext.tcp) + hlen = sizeof(*ext.eh); + if (ext.ip4) { + hlen += ext.ip4->ip_hl << 2; + } else if (ext.ip6) { + if (ext.ip6->ip6_nxt == IPPROTO_TCP) + hlen += sizeof(*ext.ip6); + else + tcpstat_inc(tcps_inbadlro); + } + if (ext.tcp) { tcpstat_inc(tcps_inhwlro); - else + hlen += ext.tcp->th_off << 2; + } else { tcpstat_inc(tcps_inbadlro); + } + + /* +* If we gonna forward this packet, we have to +* mark it as TSO, recalculate the TCP checksum +* and set a correct mss. +*/ + SET(sendmp->m_pkthdr.csum_flags, + M_TCP_CSUM_OUT | M_TCP_TSO); + + sendmp->m_pkthdr.ph_mss = + (sendmp->m_pkthdr.len - hlen) / pkts; + tcpstat_add(tcps_inpktlro, pkts); } Index: netinet6/ip6_forward.c === RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v retrieving revision 1.109 diff -u -p -r1.109 ip6_forward.c --- netinet6/ip6_forward.c 5 Apr 2023 13:56:31 - 1.109 +++ netinet6/ip6_forward.c 23 May 2023 11:59:19 - @@ -63,8 +63,10 @@ #include #include #include -#include #endif +#include +#include +#include /* * Forward a packet. If some error occurs return the sender @@ -316,7 +318,11 @@ reroute: goto reroute; } #endif - in6_proto_cksum_out(m, ifp); + + error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6, + ifp->if_mtu); + if (error || m == NULL) + goto freecopy; /* Check the size after pf_test to give pf a chance to refragment. */ if (m->m_pkthdr.len > ifp->if_mtu) { @@ -326,6 +332,8 @@ reroute: m_freem(m); goto out; } + + in6_proto_cksum_out(m, ifp); error = ifp->if_output(ifp, m, sin6tosa(sin6), rt); if (error) {
Re: ix(4): allocate less memory for tx buffers
On Fri, Jun 09, 2023 at 06:59:57PM +0200, Jan Klemkow wrote: > On Fri, Jun 09, 2023 at 06:11:38PM +0200, Jan Klemkow wrote: > > TSO packets are limited to MAXMCLBYTES (64k). Thus, we don't need to > > allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers. > > > > This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports > > into one machine. Otherwise I run out of devbuf in malloc(9). > > fix typo in comment Use a more precise compare in the CTASSERT condition. ok? Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.197 diff -u -p -r1.197 if_ix.c --- dev/pci/if_ix.c 1 Jun 2023 09:05:33 - 1.197 +++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 - @@ -37,6 +37,12 @@ #include #include +/* + * Our TCP/IP Stack could not handle packets greater than MAXMCLBYTES. + * This interface could not handle packets greater than IXGBE_TSO_SIZE. + */ +CTASSERT(MAXMCLBYTES <= IXGBE_TSO_SIZE); + /* * Driver version */ @@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t /* Create the descriptor buffer dma maps */ for (i = 0; i < sc->num_tx_desc; i++) { txbuf = >tx_buffers[i]; - error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE, + error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES, sc->num_segs, PAGE_SIZE, 0, BUS_DMA_NOWAIT, >map);
Re: ix(4): allocate less memory for tx buffers
On Fri, Jun 09, 2023 at 06:11:38PM +0200, Jan Klemkow wrote: > TSO packets are limited to MAXMCLBYTES (64k). Thus, we don't need to > allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers. > > This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports > into one machine. Otherwise I run out of devbuf in malloc(9). > > ok? fix typo in comment Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.197 diff -u -p -r1.197 if_ix.c --- dev/pci/if_ix.c 1 Jun 2023 09:05:33 - 1.197 +++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 - @@ -37,6 +37,12 @@ #include #include +/* + * Our TCP/IP Stack could not handle packets greater than MAXMCLBYTES. + * This interface could not handle packets greater than IXGBE_TSO_SIZE. + */ +CTASSERT(MAXMCLBYTES < IXGBE_TSO_SIZE); + /* * Driver version */ @@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t /* Create the descriptor buffer dma maps */ for (i = 0; i < sc->num_tx_desc; i++) { txbuf = >tx_buffers[i]; - error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE, + error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES, sc->num_segs, PAGE_SIZE, 0, BUS_DMA_NOWAIT, >map);
Re: ifconfig rename tcplro
On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote: > I would suggest to rename ifconfig tcprecvoffload to tcplro. Maybe > it's just because I had to type that long name too often. > > With that we have consistent naming: > # ifconfig ix0 tcplro > # sysctl net.inet.tcp.tso=1 > > Also the coresponding flag are named LRO. > # ifconfig ix1 hwfeatures > ix1: flags=2008843 mtu 1500 > > hwfeatures=71b7 > hardmtu 9198 > > The feature is quite new, so I have no backward compatiblity concerns. > > ok? I like this shorter naming. Its OK from my side. > Index: sbin/ifconfig/ifconfig.8 > === > RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v > retrieving revision 1.396 > diff -u -p -r1.396 ifconfig.8 > --- sbin/ifconfig/ifconfig.8 1 Jun 2023 18:57:53 - 1.396 > +++ sbin/ifconfig/ifconfig.8 6 Jun 2023 12:18:07 - > @@ -501,7 +501,7 @@ Query and display information and diagno > modules installed in an interface. > It is only supported by drivers implementing the necessary functionality > on hardware which supports it. > -.It Cm tcprecvoffload > +.It Cm tcplro > Enable TCP large receive offload (LRO) if it's supported by the hardware; see > .Cm hwfeatures . > LRO enabled network interfaces modify received TCP/IP packets. > @@ -517,7 +517,7 @@ It is not possible to use LRO with inter > or > .Xr tpmr 4 . > Changing this option will re-initialize the network interface. > -.It Cm -tcprecvoffload > +.It Cm -tcplro > Disable LRO. > LRO is disabled by default. > .It Cm up > Index: sbin/ifconfig/ifconfig.c > === > RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.c,v > retrieving revision 1.465 > diff -u -p -r1.465 ifconfig.c > --- sbin/ifconfig/ifconfig.c 1 Jun 2023 18:57:54 - 1.465 > +++ sbin/ifconfig/ifconfig.c 6 Jun 2023 12:18:59 - > @@ -471,8 +471,8 @@ const struct cmd { > { "-soii", IFXF_INET6_NOSOII, 0, setifxflags }, > { "monitor",IFXF_MONITOR, 0, setifxflags }, > { "-monitor", -IFXF_MONITOR, 0, setifxflags }, > - { "tcprecvoffload", IFXF_LRO, 0, setifxflags }, > - { "-tcprecvoffload", -IFXF_LRO, 0, setifxflags }, > + { "tcplro", IFXF_LRO, 0, setifxflags }, > + { "-tcplro",-IFXF_LRO, 0, setifxflags }, > #ifndef SMALL > { "hwfeatures", NEXTARG0, 0, printifhwfeatures }, > { "metric", NEXTARG,0, setifmetric }, >
Re: ix(4): LRO forwarding
On Wed, May 24, 2023 at 05:28:58PM +0200, Alexander Bluhm wrote: > On Tue, May 23, 2023 at 02:14:57PM +0200, Jan Klemkow wrote: > > Hi, > > > > This diff sets needed offloading flags and the calculated mss to LRO > > mbufs in ix(4). Thus, we can forward this packets and process them via > > tcp_if_output_tso(). This diff also uses tcp_if_output_tso() in > > ip6_forward(). > > > > I tested the ip6_forward path via the address family transition in pf: > > > > pass in inet from 192.168.1.1 to 192.168.13.2 af-to \ > > inet6 from fc00:13::1 to fc00:13::2 > > > > ok? > > crashes during my tests with lro turned on. Looks like devision > by zero. I added a check, that avoids the TSO flags if mss it zero. Thus, we avoid a division by zero in later TSO processing. ok? Thanks, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.196 diff -u -p -r1.196 if_ix.c --- dev/pci/if_ix.c 23 May 2023 09:16:16 - 1.196 +++ dev/pci/if_ix.c 25 May 2023 20:02:06 - @@ -3257,13 +3257,40 @@ ixgbe_rxeof(struct rx_ring *rxr) if (sendmp->m_pkthdr.ph_mss > 0) { struct ether_extracted ext; + uint64_t hlen; uint16_t pkts = sendmp->m_pkthdr.ph_mss; + /* Calculate header size. */ ether_extract_headers(sendmp, ); - if (ext.tcp) + hlen = sizeof(*ext.eh); + if (ext.ip4) { + hlen += ext.ip4->ip_hl << 2; + } else if (ext.ip6) { + if (ext.ip6->ip6_nxt == IPPROTO_TCP) + hlen += sizeof(*ext.ip6); + else + tcpstat_inc(tcps_inbadlro); + } + if (ext.tcp) { tcpstat_inc(tcps_inhwlro); - else + hlen += ext.tcp->th_off << 2; + } else { tcpstat_inc(tcps_inbadlro); + } + + /* +* If we gonna forward this packet, we have to +* mark it as TSO, recalculate the TCP checksum +* and set a correct mss. +*/ + sendmp->m_pkthdr.ph_mss = + (sendmp->m_pkthdr.len - hlen) / pkts; + + if (sendmp->m_pkthdr.ph_mss != 0) { + SET(sendmp->m_pkthdr.csum_flags, + M_TCP_CSUM_OUT | M_TCP_TSO); + } + tcpstat_add(tcps_inpktlro, pkts); } Index: netinet6/ip6_forward.c === RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v retrieving revision 1.109 diff -u -p -r1.109 ip6_forward.c --- netinet6/ip6_forward.c 5 Apr 2023 13:56:31 - 1.109 +++ netinet6/ip6_forward.c 25 May 2023 20:03:06 - @@ -63,8 +63,10 @@ #include #include #include -#include #endif +#include +#include +#include /* * Forward a packet. If some error occurs return the sender @@ -316,7 +318,11 @@ reroute: goto reroute; } #endif - in6_proto_cksum_out(m, ifp); + + error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6, + ifp->if_mtu); + if (error || m == NULL) + goto freecopy; /* Check the size after pf_test to give pf a chance to refragment. */ if (m->m_pkthdr.len > ifp->if_mtu) { @@ -326,6 +332,8 @@ reroute: m_freem(m); goto out; } + + in6_proto_cksum_out(m, ifp); error = ifp->if_output(ifp, m, sin6tosa(sin6), rt); if (error) {
fix vlan handling with tcplro on ix(4)
Hi, I missed the vlan-tag size in the mss calculation of lro packets in ix(4). This diff add vlan-header detection in ether_extract_headers() and uses this information to calculate the right mss. This fixes forwarding of vlan tagged lro packets. ok? bye, Jan Index: dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.200 diff -u -p -r1.200 if_ix.c --- dev/pci/if_ix.c 18 Jul 2023 16:01:20 - 1.200 +++ dev/pci/if_ix.c 26 Jul 2023 09:21:15 - @@ -3275,6 +3275,10 @@ ixgbe_rxeof(struct rx_ring *rxr) /* Calculate header size. */ ether_extract_headers(sendmp, ); hdrlen = sizeof(*ext.eh); +#if NVLAN > 0 + if (ext.evh) + hdrlen += ETHER_VLAN_ENCAP_LEN; +#endif if (ext.ip4) hdrlen += ext.ip4->ip_hl << 2; if (ext.ip6) Index: net/if_ethersubr.c === RCS file: /cvs/src/sys/net/if_ethersubr.c,v retrieving revision 1.290 diff -u -p -r1.290 if_ethersubr.c --- net/if_ethersubr.c 6 Jul 2023 19:46:53 - 1.290 +++ net/if_ethersubr.c 26 Jul 2023 09:20:57 - @@ -1040,6 +1040,7 @@ ether_extract_headers(struct mbuf *mp, s uint64_t hlen; int hoff; uint8_t ipproto; + uint16_t ether_type; /* Return NULL if header was not recognized. */ memset(ext, 0, sizeof(*ext)); @@ -1048,9 +1049,20 @@ ether_extract_headers(struct mbuf *mp, s return; ext->eh = mtod(mp, struct ether_header *); - switch (ntohs(ext->eh->ether_type)) { + ether_type = ntohs(ext->eh->ether_type); + hlen = sizeof(*ext->eh); + +#if NVLAN > 0 + if (ether_type == ETHERTYPE_VLAN) { + ext->evh = mtod(mp, struct ether_vlan_header *); + ether_type = ntohs(ext->evh->evl_proto); + hlen = sizeof(*ext->evh); + } +#endif + + switch (ether_type) { case ETHERTYPE_IP: - m = m_getptr(mp, sizeof(*ext->eh), ); + m = m_getptr(mp, hlen, ); if (m == NULL || m->m_len - hoff < sizeof(*ext->ip4)) return; ext->ip4 = (struct ip *)(mtod(m, caddr_t) + hoff); @@ -1064,7 +1076,7 @@ ether_extract_headers(struct mbuf *mp, s break; #ifdef INET6 case ETHERTYPE_IPV6: - m = m_getptr(mp, sizeof(*ext->eh), ); + m = m_getptr(mp, hlen, ); if (m == NULL || m->m_len - hoff < sizeof(*ext->ip6)) return; ext->ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff); Index: netinet/if_ether.h === RCS file: /cvs/src/sys/netinet/if_ether.h,v retrieving revision 1.89 diff -u -p -r1.89 if_ether.h --- netinet/if_ether.h 6 Jul 2023 19:46:53 - 1.89 +++ netinet/if_ether.h 26 Jul 2023 09:20:22 - @@ -301,11 +301,12 @@ uint64_t ether_addr_to_e64(const struct void ether_e64_to_addr(struct ether_addr *, uint64_t); struct ether_extracted { - struct ether_header *eh; - struct ip *ip4; - struct ip6_hdr *ip6; - struct tcphdr *tcp; - struct udphdr *udp; + struct ether_header *eh; + struct ether_vlan_header*evh; + struct ip *ip4; + struct ip6_hdr *ip6; + struct tcphdr *tcp; + struct udphdr *udp; }; void ether_extract_headers(struct mbuf *, struct ether_extracted *);
Re: lo(4) loopback LRO and TSO
On July 2, 2023 2:33:41 PM GMT+02:00, Claudio Jeker wrote: >On Sun, Jul 02, 2023 at 02:28:17PM +0200, Alexander Bluhm wrote: >> anyone? > >Was not able to test yet but I like the diff. >Right now this is a noop since LRO is not on by default for lo(4). >Because of that OK claudio@ The diff works fine in my sparc64 setup. ok jan@ >> On Fri, Jun 23, 2023 at 06:06:16PM +0200, Alexander Bluhm wrote: >> > Hi, >> > >> > Claudio@ mentioned the idea to use TSO and LRO on the loopback >> > interface to transfer TCP faster. >> > >> > I see a performance effect with this diff, but more importantly it >> > gives us more test coverage. Currently LRO on lo(4) is default >> > off. >> > >> > Future plan is: >> > - Fix some corner cases for LRO/TSO with TCP path-MTU discovery >> > and IP forwarding when LRO is enabled. >> > - Enable LRO/TSO for lo(4) and ix(4) per default. >> > - Jan@ commits his ixl(4) TSO diff. >> > >> > ok for lo(4) LRO/TSO with default off? >> > >> > bluhm >> > >> > Index: sys/net/if.c >> > === >> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v >> > retrieving revision 1.700 >> > diff -u -p -r1.700 if.c >> > --- sys/net/if.c 12 Jun 2023 21:19:54 - 1.700 >> > +++ sys/net/if.c 23 Jun 2023 15:48:27 - >> > @@ -106,6 +106,9 @@ >> > #ifdef MROUTING >> > #include >> > #endif >> > +#include >> > +#include >> > +#include >> > >> > #ifdef INET6 >> > #include >> > @@ -802,12 +805,29 @@ if_input_local(struct ifnet *ifp, struct >> > * is now incorrect, will be calculated before sending. >> > */ >> >keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT | >> > - M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT); >> > + M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT | >> > + M_TCP_TSO); >> >m_resethdr(m); >> >m->m_flags |= M_LOOP | keepflags; >> >m->m_pkthdr.csum_flags = keepcksum; >> >m->m_pkthdr.ph_ifidx = ifp->if_index; >> >m->m_pkthdr.ph_rtableid = ifp->if_rdomain; >> > + >> > + if (ISSET(keepcksum, M_TCP_TSO) && m->m_pkthdr.len > ifp->if_mtu) { >> > + if (ifp->if_mtu > 0 && >> > + ((af == AF_INET && >> > + ISSET(ifp->if_capabilities, IFCAP_TSOv4)) || >> > + (af == AF_INET6 && >> > + ISSET(ifp->if_capabilities, IFCAP_TSOv6 { >> > + tcpstat_inc(tcps_inswlro); >> > + tcpstat_add(tcps_inpktlro, >> > + (m->m_pkthdr.len + ifp->if_mtu - 1) / ifp->if_mtu); >> > + } else { >> > + tcpstat_inc(tcps_inbadlro); >> > + m_freem(m); >> > + return (EPROTONOSUPPORT); >> > + } >> > + } >> > >> >if (ISSET(keepcksum, M_TCP_CSUM_OUT)) >> >m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK; >> > Index: sys/net/if_loop.c >> > === >> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v >> > retrieving revision 1.94 >> > diff -u -p -r1.94 if_loop.c >> > --- sys/net/if_loop.c 5 Jun 2023 11:35:46 - 1.94 >> > +++ sys/net/if_loop.c 23 Jun 2023 15:48:27 - >> > @@ -175,7 +175,8 @@ loop_clone_create(struct if_clone *ifc, >> >ifp->if_xflags = IFXF_CLONED; >> >ifp->if_capabilities = IFCAP_CSUM_IPv4 | >> >IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 | >> > - IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6; >> > + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 | >> > + IFCAP_LRO; >> >ifp->if_rtrequest = lortrequest; >> >ifp->if_ioctl = loioctl; >> >ifp->if_input = loinput; >> > @@ -281,6 +282,10 @@ loioctl(struct ifnet *ifp, u_long cmd, c >> > >> >switch (cmd) { >> >case SIOCSIFFLAGS: >> > + if (ISSET(ifp->if_xflags, IFXF_LRO)) >> > + SET(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6); >> > + else >> > + CLR(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6); >> >break; >> > >> >case SIOCSIFADDR: >> > Index: sys/netinet/tcp_usrreq.c >> > === >> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v >> > retrieving revision 1.219 >> > diff -u -p -r1.219 tcp_usrreq.c >> > --- sys/netinet/tcp_usrreq.c 23 May 2023 09:16:16 - 1.219 >> > +++ sys/netinet/tcp_usrreq.c 23 Jun 2023 15:48:27 - >> > @@ -1340,6 +1340,7 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o >> >ASSIGN(tcps_outhwtso); >> >ASSIGN(tcps_outpkttso); >> >ASSIGN(tcps_outbadtso); >> > + ASSIGN(tcps_inswlro); >> >ASSIGN(tcps_inhwlro); >> >ASSIGN(tcps_inpktlro); >> >ASSIGN(tcps_inbadlro); >> > Index: sys/netinet/tcp_var.h >> > === >> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v >> > retrieving revision 1.167 >> > diff -u -p -r1.167 tcp_var.h >> >
Re: tso ip6 forward
On Fri, Jun 16, 2023 at 12:06:08PM +0200, Alexander Bluhm wrote: > On Mon, Jun 12, 2023 at 03:46:28PM +0200, Alexander Bluhm wrote: > > I found a little inconsistency in IPv6 forwarding with TSO. > > > > Sending with TSO should only done if the large packet does not fit > > in the interface MTU. In case tcp_if_output_tso() does not process > > the packet, we should send an ICMP6 error. Rearrange the code that > > it looks more like other calls to tcp_if_output_tso(). > > > > All these cases can only be reached when LRO is turned on for IPv6 > > which none of our drivers currently supports. > > jan@ pointed out that reordering TSO in ip6 forward breaks path MTU > discovery. So lets only fix the forward counters, icmp6 packet too > big and icmp6 redirect. > > First try to send with TSO. The goto senderr handles icmp6 redirect > and other errors. > > If TSO is not necessary and the interface MTU fits, just send the > packet. Again goto senderr handles icmp6. > > Finally care about icmp6 packet too big. Works fine in my setup. > ok? ok jan@ > Index: netinet6/ip6_forward.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v > retrieving revision 1.110 > diff -u -p -r1.110 ip6_forward.c > --- netinet6/ip6_forward.c1 Jun 2023 09:05:33 - 1.110 > +++ netinet6/ip6_forward.c16 Jun 2023 08:55:43 - > @@ -321,35 +321,30 @@ reroute: > > error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6, > ifp->if_mtu); > + if (error) > + ip6stat_inc(ip6s_cantforward); > + else if (m == NULL) > + ip6stat_inc(ip6s_forward); > if (error || m == NULL) > - goto freecopy; > + goto senderr; > > /* Check the size after pf_test to give pf a chance to refragment. */ > - if (m->m_pkthdr.len > ifp->if_mtu) { > - if (mcopy) > - icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, > - ifp->if_mtu); > - m_freem(m); > - goto out; > + if (m->m_pkthdr.len <= ifp->if_mtu) { > + in6_proto_cksum_out(m, ifp); > + error = ifp->if_output(ifp, m, sin6tosa(sin6), rt); > + if (error) > + ip6stat_inc(ip6s_cantforward); > + else > + ip6stat_inc(ip6s_forward); > + goto senderr; > } > > - in6_proto_cksum_out(m, ifp); > - error = ifp->if_output(ifp, m, sin6tosa(sin6), rt); > - if (error) { > - ip6stat_inc(ip6s_cantforward); > - } else { > - ip6stat_inc(ip6s_forward); > - if (type) > - ip6stat_inc(ip6s_redirectsent); > - else { > - if (mcopy) > - goto freecopy; > - } > - } > + if (mcopy != NULL) > + icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); > + m_freem(m); > + goto out; > > -#if NPF > 0 || defined(IPSEC) > senderr: > -#endif > if (mcopy == NULL) > goto out; > > @@ -357,6 +352,7 @@ senderr: > case 0: > if (type == ND_REDIRECT) { > icmp6_redirect_output(mcopy, rt); > + ip6stat_inc(ip6s_redirectsent); > goto out; > } > goto freecopy; >
Re: software tcp send offloading
On Tue, May 09, 2023 at 09:56:36AM +0200, Alexander Bluhm wrote: > On Sun, May 07, 2023 at 09:00:31PM +0200, Alexander Bluhm wrote: > > Not sure if I addressed all corner cases already. I think IPsec > > is missing. > > Updated diff: > - parts have been commited > - works with IPsec now Thanks for this solution. Looks much better to me, then an IPSec lookup in tcp_output() as its done in FreeBSD. > - some bugs fixed > - sysctl net.inet.tcp.tso > - netstat TSO counter > > If you test this, recompile sysctl and netstat with new kernel > headers. Then you can see, whether the diff has an effect on your > setup. > > # netstat -s -p tcp | grep TSO > 79 output TSO packets software chopped > 0 output TSO packets hardware processed > 840 output TSO packets generated > 0 output TSO packets dropped Good idea. > If you run into problems, disable the feature, and report if the > problem goes away. This helps to locate the bug. > > # sysctl net.inet.tcp.tso=0 > net.inet.tcp.tso: 1 -> 0 > > I would like to keep the sysctl for now. It makes performance > comparison easier. When we add hardware TSO it can be a quick > workaround for driver problems. > > When this has been tested a bit, I think it is ready for commit. > Remaining issues can be handled in tree. My tests pass, I am not > aware of TCP problems. I also did some testing in my setups. Everything works. > ok? Diff looks fine to me, too. ok jan@ > bluhm > > Index: sys/net/pf.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v > retrieving revision 1.1177 > diff -u -p -r1.1177 pf.c > --- sys/net/pf.c 8 May 2023 13:22:13 - 1.1177 > +++ sys/net/pf.c 8 May 2023 22:37:04 - > @@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_ > goto done; > } > > + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && > + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { > + if (tcp_chopper(m0, , ifp, m0->m_pkthdr.ph_mss) || > + if_output_ml(ifp, , sintosa(dst), rt)) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); > + > /* >* Too large for interface; fragment if possible. >* Must be able to put at least 8 bytes per fragment. > @@ -6594,6 +6604,7 @@ void > pf_route6(struct pf_pdesc *pd, struct pf_state *st) > { > struct mbuf *m0; > + struct mbuf_list ml; > struct sockaddr_in6 *dst, sin6; > struct rtentry *rt = NULL; > struct ip6_hdr *ip6; > @@ -6685,11 +6696,21 @@ pf_route6(struct pf_pdesc *pd, struct pf > goto done; > } > > - if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { > + if (m0->m_pkthdr.len <= ifp->if_mtu) { > in6_proto_cksum_out(m0, ifp); > ifp->if_output(ifp, m0, sin6tosa(dst), rt); > goto done; > } > + > + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && > + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { > + if (tcp_chopper(m0, , ifp, m0->m_pkthdr.ph_mss) || > + if_output_ml(ifp, , sin6tosa(dst), rt)) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); > > ip6stat_inc(ip6s_cantfrag); > if (st->rt != PF_DUPTO) > Index: sys/netinet/in.h > === > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v > retrieving revision 1.142 > diff -u -p -r1.142 in.h > --- sys/netinet/in.h 11 Apr 2023 00:45:09 - 1.142 > +++ sys/netinet/in.h 8 May 2023 13:47:48 - > @@ -780,6 +780,7 @@ int in_canforward(struct in_addr); > int in_cksum(struct mbuf *, int); > int in4_cksum(struct mbuf *, u_int8_t, int, int); > voidin_proto_cksum_out(struct mbuf *, struct ifnet *); > +int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); > voidin_ifdetach(struct ifnet *); > int in_mask2len(struct in_addr *); > voidin_len2mask(struct in_addr *, int); > Index: sys/netinet/ip_output.c > === > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v > retrieving revision 1.384 > diff -u -p -r1.384 ip_output.c > --- sys/netinet/ip_output.c 8 May 2023 13:22:13 - 1.384 > +++ sys/netinet/ip_output.c 8 May 2023 22:37:04 - > @@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, struct > static __inline u_int16_t __attribute__((__unused__)) > in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); > void in_delayed_cksum(struct mbuf *); > -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); > > int ip_output_ipsec_lookup(struct
seperate LRO/TSO flags
Hi, This diff introduces separate flags for TCP offloading. We split this into LRO (large receive offloading) and TSO (TCP segmentation offloading). Thus, we are able to turn it on/off separately. For ifconfig(8) we use "tcprecvoffload" and "tcpsendoffload". So, the user has a better insight of what this features are doing. ok? bye, Jan Index: sbin/ifconfig/ifconfig.8 === RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v retrieving revision 1.394 diff -u -p -r1.394 ifconfig.8 --- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 - 1.394 +++ sbin/ifconfig/ifconfig.810 May 2023 16:22:30 - @@ -282,8 +282,10 @@ tag. As CSUM_TCPv4, but supports IPv6 datagrams. .It Sy CSUM_UDPv6 As above, for UDP. +.It Sy LRO +The device supports TCP large receive offloading (LRO). .It Sy TSO -The device supports TCP segment offloading (TSO). +The device supports TCP segmentation offloading (TSO). .It Sy WOL The device supports Wake on LAN (WoL). .It Sy hardmtu @@ -491,10 +493,30 @@ Query and display information and diagno modules installed in an interface. It is only supported by drivers implementing the necessary functionality on hardware which supports it. -.It Cm tso +.It Cm tcprecvoffload +Enable TCP large receive offloading (LRO) if it's supported by the hardware; see +.Cm hwfeatures . +LRO enabled network interfaces modify received TCP/IP packets. +This will also affect traffic of upper layer interfaces, +such as +.Xr vlan 4 , +.Xr aggr 4 , +and +.Xr carp 4 . +It is not possible to use LRO with interfaces attached to a +.Xr bridge 4 , +.Xr veb 4 , +or +.Xr tpmr 4 . +Changing this option will re-initialize the network interface. +.It Cm -tcprecvoffload +Disable LRO. +LRO is disabled by default. +.It Cm tcpsendoffload Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see .Cm hwfeatures . -TSO enabled NICs modify received TCP/IP packets. +TSO enabled network interfaces are able to split large TCP segments into smaller +peaces that fits into MTU and MSS. This will also affect traffic of upper layer interfaces, such as .Xr vlan 4 , @@ -506,8 +528,7 @@ It is not possible to use TSO with inter .Xr veb 4 , or .Xr tpmr 4 . -Changing this option will re-initialize the network interface. -.It Cm -tso +.It Cm -tcpsendoffload Disable TSO. TSO is disabled by default. .It Cm up Index: sbin/ifconfig/ifconfig.c === RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v retrieving revision 1.462 diff -u -p -r1.462 ifconfig.c --- sbin/ifconfig/ifconfig.c8 Mar 2023 04:43:06 - 1.462 +++ sbin/ifconfig/ifconfig.c10 May 2023 15:40:17 - @@ -126,7 +126,7 @@ #define HWFEATURESBITS \ "\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4" \ "\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6" \ - "\11CSUM_UDPv6\17TSO\20WOL" + "\11CSUM_UDPv6\16LRO\17TSO\20WOL" struct ifencap { unsigned int ife_flags; @@ -469,8 +469,10 @@ const struct cmd { { "-soii", IFXF_INET6_NOSOII, 0, setifxflags }, { "monitor",IFXF_MONITOR, 0, setifxflags }, { "-monitor", -IFXF_MONITOR, 0, setifxflags }, - { "tso",IFXF_TSO, 0, setifxflags }, - { "-tso", -IFXF_TSO, 0, setifxflags }, + { "tcprecvoffload", IFXF_LRO, 0, setifxflags }, + { "-tcprecvoffload", -IFXF_LRO, 0, setifxflags }, + { "tcpsendoffload", IFXF_TSO, 0, setifxflags }, + { "-tcpsendoffload", -IFXF_TSO, 0, setifxflags }, #ifndef SMALL { "hwfeatures", NEXTARG0, 0, printifhwfeatures }, { "metric", NEXTARG,0, setifmetric }, @@ -674,7 +676,7 @@ const structcmd { "\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX"\ "\15LINK0\16LINK1\17LINK2\20MULTICAST" \ "\23AUTOCONF6TEMP\24MPLS\25WOL\26AUTOCONF6\27INET6_NOSOII" \ - "\30AUTOCONF4" "\31MONITOR" "\32TSO" + "\30AUTOCONF4" "\31MONITOR" "\32LRO" "\33TSO" intgetinfo(struct ifreq *, int); void getsock(int); Index: sys/dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.193 diff -u -p -r1.193 if_ix.c --- sys/dev/pci/if_ix.c 28 Apr 2023 10:18:57 - 1.193 +++ sys/dev/pci/if_ix.c 10 May 2023 16:32:44 - @@ -1925,7 +1925,7 @@ ixgbe_setup_interface(struct ix_softc *s ifp->if_capabilities |= IFCAP_CSUM_IPv4; if (sc->hw.mac.type != ixgbe_mac_82598EB) - ifp->if_capabilities |= IFCAP_TSO; + ifp->if_capabilities |= IFCAP_LRO; /* *
Re: seperate LRO/TSO flags
On Wed, May 10, 2023 at 11:13:04AM -0600, Todd C. Miller wrote: > On Wed, 10 May 2023 19:03:58 +0200, Jan Klemkow wrote: > > This diff introduces separate flags for TCP offloading. We split this > > into LRO (large receive offloading) and TSO (TCP segmentation > > offloading). Thus, we are able to turn it on/off separately. > > > > For ifconfig(8) we use "tcprecvoffload" and "tcpsendoffload". So, the > > user has a better insight of what this features are doing. > > Is it possible to control these at the address family level? In > other words, is it possible to enable "tcprecvoffload" and > "tcpsendoffload" for inet but not inet6 or vice versa? For tcprecvoffload and ix(4) it's not possible to enable/disable it per address family. Its just one flag for the hardware. For tcpsendoffload its possible, but I won't do that till its necessary. Why would you want to differentiate the address families here? bye, Jan
Re: seperate LRO/TSO flags
On Sat, May 13, 2023 at 04:44:18PM +0200, Christian Weisgerber wrote: > Jan Klemkow: > > > This diff introduces separate flags for TCP offloading. We split this > > into LRO (large receive offloading) and TSO (TCP segmentation > > offloading). Thus, we are able to turn it on/off separately. > > Wait, why do we even have a knob for TSO? > > We specifically decided not to have a knob for checksum offloading, > because it should just work out of the box, and if it doesn't, then > it should be disabled by the driver. It should not be the admin's > task to figure out if the implementation is broken and to fiddle > with the knobs (hi, FreeBSD!). > > I would assume that line of thinking extends to TSO. You are right. This is reflected in the current state of the diff below. We just need a knob for TCP Large Receive Offload (LRO) because it changes the TCP segments. You may want to avoid this on a forwarding router. ok? Thanks, Jan Index: sbin/ifconfig/ifconfig.8 === RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v retrieving revision 1.394 diff -u -p -r1.394 ifconfig.8 --- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 - 1.394 +++ sbin/ifconfig/ifconfig.812 May 2023 06:22:35 - @@ -282,8 +282,18 @@ tag. As CSUM_TCPv4, but supports IPv6 datagrams. .It Sy CSUM_UDPv6 As above, for UDP. -.It Sy TSO -The device supports TCP segment offloading (TSO). +.It Sy LRO +The device supports TCP large receive offload (LRO). +.It Sy TSOv4 +The device supports IPv4 TCP segmentation offload (TSO). +TSO is used by default. +Use the +.Xr sysctl 8 +variable +.Va net.inet.tcp.tso +to disable this feature. +.It Sy TSOv6 +As above, for IPv6. .It Sy WOL The device supports Wake on LAN (WoL). .It Sy hardmtu @@ -491,25 +501,25 @@ Query and display information and diagno modules installed in an interface. It is only supported by drivers implementing the necessary functionality on hardware which supports it. -.It Cm tso -Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see +.It Cm tcprecvoffload +Enable TCP large receive offload (LRO) if it's supported by the hardware; see .Cm hwfeatures . -TSO enabled NICs modify received TCP/IP packets. +LRO enabled network interfaces modify received TCP/IP packets. This will also affect traffic of upper layer interfaces, such as .Xr vlan 4 , .Xr aggr 4 , and .Xr carp 4 . -It is not possible to use TSO with interfaces attached to a +It is not possible to use LRO with interfaces attached to a .Xr bridge 4 , .Xr veb 4 , or .Xr tpmr 4 . Changing this option will re-initialize the network interface. -.It Cm -tso -Disable TSO. -TSO is disabled by default. +.It Cm -tcprecvoffload +Disable LRO. +LRO is disabled by default. .It Cm up Mark an interface .Dq up . Index: sbin/ifconfig/ifconfig.c === RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v retrieving revision 1.462 diff -u -p -r1.462 ifconfig.c --- sbin/ifconfig/ifconfig.c8 Mar 2023 04:43:06 - 1.462 +++ sbin/ifconfig/ifconfig.c11 May 2023 17:33:55 - @@ -126,7 +126,7 @@ #define HWFEATURESBITS \ "\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4" \ "\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6" \ - "\11CSUM_UDPv6\17TSO\20WOL" + "\11CSUM_UDPv6\15LRO\16TSOv4\17TSOv6\20WOL" struct ifencap { unsigned int ife_flags; @@ -469,8 +469,8 @@ const structcmd { { "-soii", IFXF_INET6_NOSOII, 0, setifxflags }, { "monitor",IFXF_MONITOR, 0, setifxflags }, { "-monitor", -IFXF_MONITOR, 0, setifxflags }, - { "tso",IFXF_TSO, 0, setifxflags }, - { "-tso", -IFXF_TSO, 0, setifxflags }, + { "tcprecvoffload", IFXF_LRO, 0, setifxflags }, + { "-tcprecvoffload", -IFXF_LRO, 0, setifxflags }, #ifndef SMALL { "hwfeatures", NEXTARG0, 0, printifhwfeatures }, { "metric", NEXTARG,0, setifmetric }, @@ -674,7 +674,7 @@ const structcmd { "\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX"\ "\15LINK0\16LINK1\17LINK2\20MULTICAST" \ "\23AUTOCONF6TEMP\24MPLS\25WOL\26AUTOCONF6\27INET6_NOSOII" \ - "\30AUTOCONF4" "\31MONITOR" "\32TSO" + "\30AUTOCONF4" "\31MONITOR" "\32LRO" intgetinfo(struct ifreq *, int); void getsock(int); Index: sys/dev/pci/if_ix.c
Re: seperate LRO/TSO flags
On Mon, May 15, 2023 at 11:40:20AM +0200, Alexander Bluhm wrote: > On Mon, May 15, 2023 at 09:34:21AM +0200, Jan Klemkow wrote: > > @@ -251,12 +251,16 @@ struct if_status_description { > > #defineIFCAP_VLAN_HWTAGGING0x0020 /* hardware VLAN tag > > support */ > > #defineIFCAP_CSUM_TCPv60x0080 /* can do IPv6/TCP > > checksums */ > > #defineIFCAP_CSUM_UDPv60x0100 /* can do IPv6/UDP > > checksums */ > > -#defineIFCAP_TSO 0x4000 /* TCP segment > > offloading */ > > +#defineIFCAP_LRO 0x1000 /* TCP large recv > > offload */ > > +#defineIFCAP_TSOv4 0x2000 /* TCP segmentation > > offload */ > > +#defineIFCAP_TSOv6 0x4000 /* TCP segmentation > > offload */ > > #defineIFCAP_WOL 0x8000 /* can do wake on lan */ > > I would prefer to keep the numbers of IFCAP_TSO/IFCAP_LRO as this > is just a naming error. Then we have less confusion during the > ifconfig transition phase. > > +#define IFCAP_TSOv4 0x1000 > +#define IFCAP_TSOv6 0x2000 > -#define IFCAP_TSO0x4000 > +#define IFCAP_LRO0x4000 > > > +#define IFCAP_TSO (IFCAP_TSOv4 | IFCAP_TSOv6) > > + > > Could you please remove this chunk and expand it, where is used? > This one more define does not make the code clearer. And this flag > IFCAP_TSO had a different meaning before renaming. When it is not > introduced again, the compiler makes sure that no renaming was > forgotten. done Also: - updated the diff to the current source state - improved the vlan(4) capability handling @dlg: Whats your opinion about this diff? ok? Thanks, Jan Index: sbin/ifconfig/ifconfig.8 === RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v retrieving revision 1.394 diff -u -p -r1.394 ifconfig.8 --- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 - 1.394 +++ sbin/ifconfig/ifconfig.815 May 2023 18:46:48 - @@ -282,8 +282,18 @@ tag. As CSUM_TCPv4, but supports IPv6 datagrams. .It Sy CSUM_UDPv6 As above, for UDP. -.It Sy TSO -The device supports TCP segment offloading (TSO). +.It Sy LRO +The device supports TCP large receive offload (LRO). +.It Sy TSOv4 +The device supports IPv4 TCP segmentation offload (TSO). +TSO is used by default. +Use the +.Xr sysctl 8 +variable +.Va net.inet.tcp.tso +to disable this feature. +.It Sy TSOv6 +As above, for IPv6. .It Sy WOL The device supports Wake on LAN (WoL). .It Sy hardmtu @@ -491,25 +501,25 @@ Query and display information and diagno modules installed in an interface. It is only supported by drivers implementing the necessary functionality on hardware which supports it. -.It Cm tso -Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see +.It Cm tcprecvoffload +Enable TCP large receive offload (LRO) if it's supported by the hardware; see .Cm hwfeatures . -TSO enabled NICs modify received TCP/IP packets. +LRO enabled network interfaces modify received TCP/IP packets. This will also affect traffic of upper layer interfaces, such as .Xr vlan 4 , .Xr aggr 4 , and .Xr carp 4 . -It is not possible to use TSO with interfaces attached to a +It is not possible to use LRO with interfaces attached to a .Xr bridge 4 , .Xr veb 4 , or .Xr tpmr 4 . Changing this option will re-initialize the network interface. -.It Cm -tso -Disable TSO. -TSO is disabled by default. +.It Cm -tcprecvoffload +Disable LRO. +LRO is disabled by default. .It Cm up Mark an interface .Dq up . Index: sbin/ifconfig/ifconfig.c === RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v retrieving revision 1.463 diff -u -p -r1.463 ifconfig.c --- sbin/ifconfig/ifconfig.c12 May 2023 18:24:13 - 1.463 +++ sbin/ifconfig/ifconfig.c15 May 2023 20:27:51 - @@ -126,7 +126,7 @@ #define HWFEATURESBITS \ "\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4" \ "\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6" \ - "\11CSUM_UDPv6\17TSO\20WOL" + "\11CSUM_UDPv6\15TSOv4\16TSOv6\17LSO\20WOL" struct ifencap { unsigned int ife_flags; @@ -469,8 +469,8 @@ const structcmd { { "-soii", IFXF_INET6_NOSOII, 0, setifxflags }, { "monitor",IFXF_MONITOR, 0, setifxflags }, { "-monitor", -IFXF_MONITOR, 0, setifxflags }, - { "tso",IFXF_TSO, 0, setifxflags }, - { "-tso", -IFXF_TSO, 0, setifxflags }, +
Add LRO counter in ix(4)
Hi, This diff introduces new counters for LRO packets, we get from the network interface. It shows, how many packets the network interface has coalesced into LRO packets. In followup diff, this packet counter will also be used to set the ph_mss variable to valid value. So, the stack is able to forward or redirect this kind of packets. ok? bye, Jan Index: usr.bin/netstat/inet.c === RCS file: /cvs/src/usr.bin/netstat/inet.c,v retrieving revision 1.175 diff -u -p -r1.175 inet.c --- usr.bin/netstat/inet.c 10 May 2023 12:07:17 - 1.175 +++ usr.bin/netstat/inet.c 16 May 2023 17:55:20 - @@ -412,6 +412,10 @@ tcp_stats(char *name) p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n"); p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n"); p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n"); + p(tcps_inhwlro, "\t\t%u input LRO generated packet%s from hardware\n"); + p(tcps_inpktlro, "\t\t%u input LRO coalesced packet%s from hardware\n"); + p(tcps_inbadlro, "\t\t%u input bad LRO packet%s from hardware\n"); + p(tcps_rcvtotal, "\t%u packet%s received\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu byte%s)\n"); p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n"); Index: sys/dev/pci/if_ix.c === RCS file: /cvs/src/sys/dev/pci/if_ix.c,v retrieving revision 1.194 diff -u -p -r1.194 if_ix.c --- sys/dev/pci/if_ix.c 16 May 2023 14:32:54 - 1.194 +++ sys/dev/pci/if_ix.c 16 May 2023 18:49:33 - @@ -3175,12 +3175,23 @@ ixgbe_rxeof(struct rx_ring *rxr) sendmp = rxbuf->fmp; rxbuf->buf = rxbuf->fmp = NULL; - if (sendmp != NULL) /* secondary frag */ + if (sendmp != NULL) { /* secondary frag */ sendmp->m_pkthdr.len += mp->m_len; - else { + + /* +* This function iterates over interleaved descriptors. +* Thus, we reuse ph_mss as global segment counter per +* TCP connection, insteat of introducing a new variable +* in m_pkthdr. +*/ + if (rsccnt) + sendmp->m_pkthdr.ph_mss += rsccnt - 1; + } else { /* first desc of a non-ps chain */ sendmp = mp; sendmp->m_pkthdr.len = mp->m_len; + if (rsccnt) + sendmp->m_pkthdr.ph_mss = rsccnt - 1; #if NVLAN > 0 if (sc->vlan_stripping && staterr & IXGBE_RXD_STAT_VP) { sendmp->m_pkthdr.ether_vtag = vtag; @@ -3200,6 +3211,21 @@ ixgbe_rxeof(struct rx_ring *rxr) if (hashtype != IXGBE_RXDADV_RSSTYPE_NONE) { sendmp->m_pkthdr.ph_flowid = hash; SET(sendmp->m_pkthdr.csum_flags, M_FLOWID); + } + + if (sendmp->m_pkthdr.ph_mss == 1) + sendmp->m_pkthdr.ph_mss = 0; + + if (sendmp->m_pkthdr.ph_mss > 0) { + struct ether_extracted ext; + uint16_t pkts = sendmp->m_pkthdr.ph_mss; + + ether_extract_headers(sendmp, ); + if (ext.tcp) + tcpstat_inc(tcps_inhwlro); + else + tcpstat_inc(tcps_inbadlro); + tcpstat_add(tcps_inpktlro, pkts); } ml_enqueue(, sendmp); Index: sys/dev/pci/ixgbe.h === RCS file: /cvs/src/sys/dev/pci/ixgbe.h,v retrieving revision 1.33 diff -u -p -r1.33 ixgbe.h --- sys/dev/pci/ixgbe.h 8 Feb 2022 03:38:00 - 1.33 +++ sys/dev/pci/ixgbe.h 16 May 2023 17:55:20 - @@ -60,12 +60,18 @@ #include #include +#include #include +struct tdb; + #include #include #include #include +#include +#include +#include #if NBPFILTER > 0 #include Index: sys/netinet/tcp_usrreq.c === RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.218 diff -u -p -r1.218 tcp_usrreq.c --- sys/netinet/tcp_usrreq.c10 May 2023 12:07:16 - 1.218 +++ sys/netinet/tcp_usrreq.c16 May 2023 17:55:20 - @@ -1340,6 +1340,9 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o ASSIGN(tcps_outhwtso); ASSIGN(tcps_outpkttso); ASSIGN(tcps_outbadtso); + ASSIGN(tcps_inhwlro); + ASSIGN(tcps_inpktlro); +
Re: ifconfig: SIOCSIFFLAGS: device not configured
On Thu, May 11, 2023 at 09:17:37PM +0200, Hrvoje Popovski wrote: > is it possible to change "ifconfig: SIOCSIFFLAGS: device not configured" > message that it has an interface name in it, something like: > ifconfig pfsync0: SIOCSIFFLAGS: device not configured <- in my case. > > I have many vlans and static routes in my setup and while testing some > diffs, it took me a long time to figure out which interface the message > was coming from. > > starting network > add host 10.11.2.69: gateway 10.12.253.225 > add host 10.250.184.36: gateway 10.12.253.225 > add host 9.9.9.9: gateway 10.12.253.225 > add host 10.11.1.234: gateway 10.12.253.225 > add host 10.11.1.235: gateway 10.12.253.225 > add host 10.11.255.123: gateway 10.12.253.225 > add net 10.101/16: gateway 10.12.253.225 > ifconfig: SIOCSIFFLAGS: Device not configured > add net 16/8: gateway 192.168.100.112 > add net a192:a168:a100:a100::/64: gateway 192:168:1000:1000::112 > add net 48/8: gateway 192.168.111.112 > add net a192:a168:a111:a111::/64: gateway 192:168::::112 > reordering: ld.so libc libcrypto sshd. > > or when I'm doing sh /etc/netstart and have aggr interface > > ifconfig: SIOCSTRUNKPORT: Device busy > ifconfig: SIOCSTRUNKPORT: Device busy > > to change > ifconfig ix0: SIOCSTRUNKPORT: Device busy > ifconfig ix1: SIOCSTRUNKPORT: Device busy I also run into this issue sometimes. So, here is diff that prints the interface name in front of most of these anonym error messages. ok? Jan Index: ifconfig.c === RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v retrieving revision 1.462 diff -u -p -r1.462 ifconfig.c --- ifconfig.c 8 Mar 2023 04:43:06 - 1.462 +++ ifconfig.c 12 May 2023 14:14:01 - @@ -1070,14 +1070,14 @@ printgroup(char *groupname, int ifaliase errno == ENOENT) return (-1); else - err(1, "SIOCGIFGMEMB"); + err(1, "%s: SIOCGIFGMEMB", ifgr.ifgr_name); } len = ifgr.ifgr_len; if ((ifgr.ifgr_groups = calloc(1, len)) == NULL) err(1, "printgroup"); if (ioctl(sock, SIOCGIFGMEMB, (caddr_t)) == -1) - err(1, "SIOCGIFGMEMB"); + err(1, "%s: SIOCGIFGMEMB", ifgr.ifgr_name); for (ifg = ifgr.ifgr_groups; ifg && len >= sizeof(struct ifg_req); ifg++) { @@ -1099,7 +1099,7 @@ printgroupattribs(char *groupname) bzero(, sizeof(ifgr)); strlcpy(ifgr.ifgr_name, groupname, sizeof(ifgr.ifgr_name)); if (ioctl(sock, SIOCGIFGATTR, (caddr_t)) == -1) - err(1, "SIOCGIFGATTR"); + err(1, "%s: SIOCGIFGATTR", ifgr.ifgr_name); printf("%s:", groupname); printf(" carp demote count %d", ifgr.ifgr_attrib.ifg_carp_demoted); @@ -1122,7 +1122,8 @@ setgroupattribs(char *groupname, int arg if (argc > 1) { neg = strtonum(argv[1], 0, 128, ); if (errstr) - errx(1, "invalid carp demotion: %s", errstr); + errx(1, "%s: invalid carp demotion: %s", ifgr.ifgr_name, + errstr); } if (p[0] == '-') { @@ -1135,7 +1136,7 @@ setgroupattribs(char *groupname, int arg usage(); if (ioctl(sock, SIOCSIFGATTR, (caddr_t)) == -1) - err(1, "SIOCSIFGATTR"); + err(1, "%s: SIOCSIFGATTR", ifgr.ifgr_name); } void @@ -1249,7 +1250,7 @@ clone_create(const char *addr, int param (void) strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); if (ioctl(sock, SIOCIFCREATE, ) == -1) - err(1, "SIOCIFCREATE"); + err(1, "%s: SIOCIFCREATE", ifr.ifr_name); } void @@ -1258,7 +1259,7 @@ clone_destroy(const char *addr, int para (void) strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); if (ioctl(sock, SIOCIFDESTROY, ) == -1) - err(1, "SIOCIFDESTROY"); + err(1, "%s: SIOCIFDESTROY", ifr.ifr_name); } struct if_clonereq * @@ -1422,7 +1423,7 @@ setifflags(const char *vname, int value) bcopy((char *), (char *)_ifr, sizeof(struct ifreq)); if (ioctl(sock, SIOCGIFFLAGS, (caddr_t)_ifr) == -1) - err(1, "SIOCGIFFLAGS"); + err(1, "%s: SIOCGIFFLAGS", my_ifr.ifr_name); (void) strlcpy(my_ifr.ifr_name, ifname, sizeof(my_ifr.ifr_name)); flags = my_ifr.ifr_flags; @@ -1433,7 +1434,7 @@ setifflags(const char *vname, int value) flags |= value; my_ifr.ifr_flags = flags; if (ioctl(sock, SIOCSIFFLAGS, (caddr_t)_ifr) == -1) - err(1, "SIOCSIFFLAGS"); + err(1, "%s: SIOCSIFFLAGS", my_ifr.ifr_name); } void @@ -1444,7 +1445,7 @@ setifxflags(const char *vname, int value bcopy((char *), (char *)_ifr, sizeof(struct ifreq)); if (ioctl(sock, SIOCGIFXFLAGS, (caddr_t)_ifr) == -1) -
Re: Add LRO counter in ix(4)
On Thu, May 18, 2023 at 12:01:44AM +0200, Alexander Bluhm wrote: > On Tue, May 16, 2023 at 09:11:48PM +0200, Jan Klemkow wrote: > > @@ -412,6 +412,10 @@ tcp_stats(char *name) > > p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n"); > > p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n"); > > p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n"); > > + p(tcps_inhwlro, "\t\t%u input LRO generated packet%s from hardware\n"); > > + p(tcps_inpktlro, "\t\t%u input LRO coalesced packet%s from hardware\n"); > > ... coalesced packet%s by hardware done > > + p(tcps_inbadlro, "\t\t%u input bad LRO packet%s from hardware\n"); > > + > > Move this down to the "packets received" section. You included it > in "packets sent". done > > + /* > > +* This function iterates over interleaved descriptors. > > +* Thus, we reuse ph_mss as global segment counter per > > +* TCP connection, insteat of introducing a new variable > > s/insteat/instead/ done ok? Thanks, Jan diff --git a/sys/dev/pci/if_ix.c b/sys/dev/pci/if_ix.c index 4119a2416dc..924a6d63236 100644 --- a/sys/dev/pci/if_ix.c +++ b/sys/dev/pci/if_ix.c @@ -3214,12 +3214,23 @@ ixgbe_rxeof(struct rx_ring *rxr) sendmp = rxbuf->fmp; rxbuf->buf = rxbuf->fmp = NULL; - if (sendmp != NULL) /* secondary frag */ + if (sendmp != NULL) { /* secondary frag */ sendmp->m_pkthdr.len += mp->m_len; - else { + + /* +* This function iterates over interleaved descriptors. +* Thus, we reuse ph_mss as global segment counter per +* TCP connection, instead of introducing a new variable +* in m_pkthdr. +*/ + if (rsccnt) + sendmp->m_pkthdr.ph_mss += rsccnt - 1; + } else { /* first desc of a non-ps chain */ sendmp = mp; sendmp->m_pkthdr.len = mp->m_len; + if (rsccnt) + sendmp->m_pkthdr.ph_mss = rsccnt - 1; #if NVLAN > 0 if (sc->vlan_stripping && staterr & IXGBE_RXD_STAT_VP) { sendmp->m_pkthdr.ether_vtag = vtag; @@ -3241,6 +3252,21 @@ ixgbe_rxeof(struct rx_ring *rxr) SET(sendmp->m_pkthdr.csum_flags, M_FLOWID); } + if (sendmp->m_pkthdr.ph_mss == 1) + sendmp->m_pkthdr.ph_mss = 0; + + if (sendmp->m_pkthdr.ph_mss > 0) { + struct ether_extracted ext; + uint16_t pkts = sendmp->m_pkthdr.ph_mss; + + ether_extract_headers(sendmp, ); + if (ext.tcp) + tcpstat_inc(tcps_inhwlro); + else + tcpstat_inc(tcps_inbadlro); + tcpstat_add(tcps_inpktlro, pkts); + } + ml_enqueue(, sendmp); } next_desc: diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 120e3cc5ea7..3970636cde1 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1340,6 +1340,9 @@ tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) ASSIGN(tcps_outhwtso); ASSIGN(tcps_outpkttso); ASSIGN(tcps_outbadtso); + ASSIGN(tcps_inhwlro); + ASSIGN(tcps_inpktlro); + ASSIGN(tcps_inbadlro); #undef ASSIGN diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 0a9630d719f..e706fedd0e7 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -447,6 +447,9 @@ struct tcpstat { u_int32_t tcps_outhwtso;/* output tso processed by hardware */ u_int32_t tcps_outpkttso; /* packets generated by tso */ u_int32_t tcps_outbadtso; /* output tso failed, packet dropped */ + u_int32_t tcps_inhwlro; /* input lro from hardware */ + u_int32_t tcps_inpktlro;/* packets coalessed by hardware lro */ + u_int32_t tcps_inbadlro;/* input bad lro packets from hardware */ }; /* @@ -625,6 +628,9 @@ enum tcpstat_counters { tcps_outhwtso, tcps_outpkttso, tcps_outbadtso, + tcps_inhwlro, + tcps_inpktlro, + tcps_inbadlro, tcps_ncounters, }; diff --git a/usr.bin/
Fix wrong interface mtu in tcp_mss
Hi, We use the wrong interface and mtu in tcp_mss() to calculate the mss if the destination address points is a local address. In ip_output() we use the correct interface and its mtu. This limits the mss to 1448 if the mtu of the interface it 1500, instead of using a local 32k mss. The bigger issue is: local bulk traffic with the current TSO implementation is broken. tcp_output() creates TSO packets with an mss smaller then 32k and ip_output() calls if_output instead of tcp_if_output_tso() because it fits into the mtu check of lo0. This diff takes the same logic to pick the interface in tcp_mss() as its done in ip_output() and fixes both issues. ok? bye, Jan Index: netinet/tcp_input.c === RCS file: /cvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.387 diff -u -p -r1.387 tcp_input.c --- netinet/tcp_input.c 14 Mar 2023 00:24:05 - 1.387 +++ netinet/tcp_input.c 19 May 2023 17:22:47 - @@ -2805,7 +2805,11 @@ tcp_mss(struct tcpcb *tp, int offer) if (rt == NULL) goto out; - ifp = if_get(rt->rt_ifidx); + if (ISSET(rt->rt_flags, RTF_LOCAL)) + ifp = if_get(rtable_loindex(inp->inp_rtableid)); + else + ifp = if_get(rt->rt_ifidx); + if (ifp == NULL) goto out;