if_parse_packet(): refactor packet parsing on driver level

2022-10-24 Thread Jan Klemkow
Hi,

We have a lot of redundant code on the network device driver layer, that
parses the content of mbufs for ethernet, ip and tcp header.  This diff
introduces a new function if_parse_packet() to centralize this feature.
It just refactors ix(4) and ixl(4) code because, I could test this cards
and won't blowup this diff.  But, igc(3), ale(4) and oce(4) could also
be improved with this.  Beside of refactoring, we'll need this kind of
code in ix(4) and other drivers for better checksum and TSO support.

I'm not sure about the correct naming or place for this helper function.
Thus, nitpicking and bike shading is welcome. :)

bye,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 24 Oct 2022 13:51:22 -
@@ -2477,25 +2477,18 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
int offload = 0;
-   uint32_t iphlen;
uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   struct if_hdr hdr;
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
+   if_parse_packet(mp, );
 
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   *vlan_macip_lens |= (hdr.l2len << IXGBE_ADVTXD_MACLEN_SHIFT);
 
-   iphlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   switch (ntohs(hdr.eth->ether_type)) {
+   case ETHERTYPE_IP: {
+   ipproto = hdr.ip4->ip_p;
 
if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
@@ -2508,15 +2501,7 @@ ixgbe_csum_offload(struct mbuf *mp, uint
 
 #ifdef INET6
case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = sizeof(*ip6);
-   ipproto = ip6->ip6_nxt;
-
+   ipproto = hdr.ip6->ip6_nxt;
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
break;
}
@@ -2526,7 +2511,7 @@ ixgbe_csum_offload(struct mbuf *mp, uint
return offload;
}
 
-   *vlan_macip_lens |= iphlen;
+   *vlan_macip_lens |= hdr.l3len;
 
switch (ipproto) {
case IPPROTO_TCP:
Index: dev/pci/if_ixgb.h
===
RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixgb.h,v
retrieving revision 1.19
diff -u -p -r1.19 if_ixgb.h
--- dev/pci/if_ixgb.h   24 Nov 2015 17:11:39 -  1.19
+++ dev/pci/if_ixgb.h   24 Oct 2022 13:27:43 -
@@ -54,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
Index: dev/pci/if_ixl.c
===
RCS file: /mount/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.84
diff -u -p -r1.84 if_ixl.c
--- dev/pci/if_ixl.c5 Aug 2022 13:57:16 -   1.84
+++ dev/pci/if_ixl.c24 Oct 2022 16:34:29 -
@@ -2784,11 +2784,12 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
-   uint64_t hlen;
uint8_t ipproto;
uint64_t offload = 0;
+   struct if_hdr hdr;
+
+   memset(, 0, sizeof(hdr));
+   if_parse_packet(m0, );
 
if (ISSET(m0->m_flags, M_VLANTAG)) {
uint64_t vtag = m0->m_pkthdr.ether_vtag;
@@ -2800,35 +2801,20 @@ ixl_tx_setup_offload(struct mbuf *m0)
M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
return (offload);
 
-   switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) {
+   switch (ntohs(hdr.eth->ether_type)) {
case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
-
offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ?
IXL_TX_DESC_CMD_IIPT_IPV4_CSUM :
IXL_TX_DESC_CMD_IIPT_IPV4;
  
-   hlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   ipproto = hdr.ip4->ip_p;
break;
}
 
 #ifdef INET6
case ETHERTYPE_IPV6: 

Fix kernel build without IPSEC option

2022-11-02 Thread Jan Klemkow
Hi,

if you build the kernel without IPSEC it will run into several compiler
and linker errors.  This diff add some missing #ifdefs to fix this.

ok?

bye,
jan

Index: net/if_pfsync.c
===
RCS file: /mount/openbsd/cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.305
diff -u -p -r1.305 if_pfsync.c
--- net/if_pfsync.c 21 Apr 2022 15:22:49 -  1.305
+++ net/if_pfsync.c 2 Nov 2022 10:20:38 -
@@ -1576,7 +1576,9 @@ pfsync_grab_snapshot(struct pfsync_snaps
int q;
struct pf_state *st;
struct pfsync_upd_req_item *ur;
+#if defined(IPSEC)
struct tdb *tdb;
+#endif
 
sn->sn_sc = sc;
 
@@ -1602,6 +1604,7 @@ pfsync_grab_snapshot(struct pfsync_snaps
}
 
TAILQ_INIT(>sn_tdb_q);
+#if defined(IPSEC)
while ((tdb = TAILQ_FIRST(>sc_tdb_q)) != NULL) {
TAILQ_REMOVE(>sc_tdb_q, tdb, tdb_sync_entry);
TAILQ_INSERT_TAIL(>sn_tdb_q, tdb, tdb_sync_snap);
@@ -1611,6 +1614,7 @@ pfsync_grab_snapshot(struct pfsync_snaps
SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
mtx_leave(>tdb_mtx);
}
+#endif
 
sn->sn_len = sc->sc_len;
sc->sc_len = PFSYNC_MINPKT;
@@ -1630,7 +1634,9 @@ pfsync_drop_snapshot(struct pfsync_snaps
 {
struct pf_state *st;
struct pfsync_upd_req_item *ur;
+#if defined(IPSEC)
struct tdb *t;
+#endif
int q;
 
for (q = 0; q < PFSYNC_S_COUNT; q++) {
@@ -1652,6 +1658,7 @@ pfsync_drop_snapshot(struct pfsync_snaps
pool_put(>sn_sc->sc_pool, ur);
}
 
+#if defined(IPSEC)
while ((t = TAILQ_FIRST(>sn_tdb_q)) != NULL) {
TAILQ_REMOVE(>sn_tdb_q, t, tdb_sync_snap);
mtx_enter(>tdb_mtx);
@@ -1660,6 +1667,7 @@ pfsync_drop_snapshot(struct pfsync_snaps
CLR(t->tdb_flags, TDBF_PFSYNC);
mtx_leave(>tdb_mtx);
}
+#endif
 }
 
 int
@@ -1748,7 +1756,6 @@ pfsync_sendout(void)
struct pfsync_subheader *subh;
struct pf_state *st;
struct pfsync_upd_req_item *ur;
-   struct tdb *t;
int offset;
int q, count = 0;
 
@@ -1842,7 +1849,10 @@ pfsync_sendout(void)
sn.sn_plus = NULL;  /* XXX memory leak ? */
}
 
+#if defined(IPSEC)
if (!TAILQ_EMPTY(_tdb_q)) {
+   struct tdb *t;
+
subh = (struct pfsync_subheader *)(m->m_data + offset);
offset += sizeof(*subh);
 
@@ -1865,6 +1875,7 @@ pfsync_sendout(void)
subh->len = sizeof(struct pfsync_tdb) >> 2;
subh->count = htons(count);
}
+#endif
 
/* walk the queues */
for (q = 0; q < PFSYNC_S_COUNT; q++) {
@@ -2486,6 +2497,7 @@ pfsync_q_del(struct pf_state *st)
pf_state_unref(st);
 }
 
+#if defined(IPSEC)
 void
 pfsync_update_tdb(struct tdb *t, int output)
 {
@@ -2540,7 +2552,9 @@ pfsync_update_tdb(struct tdb *t, int out
CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
mtx_leave(>tdb_mtx);
 }
+#endif
 
+#if defined(IPSEC)
 void
 pfsync_delete_tdb(struct tdb *t)
 {
@@ -2576,6 +2590,7 @@ pfsync_delete_tdb(struct tdb *t)
 
tdb_unref(t);
 }
+#endif
 
 void
 pfsync_out_tdb(struct tdb *t, void *buf)
Index: netinet/ip_ipsp.c
===
RCS file: /mount/openbsd/cvs/src/sys/netinet/ip_ipsp.c,v
retrieving revision 1.273
diff -u -p -r1.273 ip_ipsp.c
--- netinet/ip_ipsp.c   6 Aug 2022 15:57:59 -   1.273
+++ netinet/ip_ipsp.c   2 Nov 2022 12:09:22 -
@@ -1081,7 +1081,7 @@ tdb_free(struct tdb *tdbp)
tdbp->tdb_xform = NULL;
}
 
-#if NPFSYNC > 0
+#if NPFSYNC > 0 && defined(IPSEC)
/* Cleanup pfsync references */
pfsync_delete_tdb(tdbp);
 #endif



Re: refactor mbuf parsing on driver level

2023-01-19 Thread Jan Klemkow
On Thu, Jan 19, 2023 at 12:02:29PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Jan 19, 2023 at 01:55:57AM +0300, Vitaliy Makkoveev wrote:
> > > On 19 Jan 2023, at 01:39, Jan Klemkow  wrote:
> > > 
> > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote:
> > >> On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote:
> > >>> we have several drivers which have to parse the content of mbufs.  This
> > >>> diff suggest a central parsing function for this.  Thus, we can reduce
> > >>> redundant code.
> > >>> 
> > >>> I just start with ix(4) and ixl(4) because it was easy to test for me.
> > >>> But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > >>> 
> > >>> I'm not sure about the name, the api nor the place of this code.  So, if
> > >>> someone has a better idea: i'm open to anything.
> > >> 
> > >> I like code this deduplication.
> > >> 
> > >> This newly introduced function doesn't touch ifnet but only extracts
> > >> protocol headers from mbuf(9). I guess mbuf_extract_headers() or
> > >> something like is much better for name with the ern/uipc_mbuf2.c as
> > >> place.
> > > 
> > > Good Point.  Updates diff below.
> > > 
> > > +
> > > +/* Parse different TCP/IP protocol headers for a quick view inside an 
> > > mbuf. */
> > > +void
> > > +m_exract_headers(struct mbuf *mp, struct ether_header **eh, struct ip 
> > > **ip4,
> > > +struct ip6_hdr **ip6, struct tcphdr **tcp, struct udphdr **udp)
> > > +
> > 
> > Should be m_extract_headers(). The rest of the diff looks good to me.
> > 
> 
> Please wait.
> 
> The mandatory nullification of `ip4', `ip6' and other variables passed
> to m_exract_headers() is not obvious. It is much better to return
> the integer result of extraction like m_tag_copy_chain() does.

Yes, the mandatory nullification seems to be more errorprone.  In my
opinion is the number of results it not that useful. You have to check
the retuned pointers anyway.

I moved the nullification inside of m_exract_headers().

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 19 Jan 2023 09:29:10 -
@@ -2477,23 +2477,18 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh;
+   struct ip *ip;
+   struct ip6_hdr *ip6;
int offload = 0;
uint32_t iphlen;
uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   m_extract_headers(mp, , , , NULL, NULL);
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
+   if (ip) {
iphlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
 
@@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
+   } else if (ip6) {
iphlen = sizeof(*ip6);
ipproto = ip6->ip6_nxt;
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.84
diff -u -p -r1.84 if_ixl.c
--- dev/pci/if_ixl.c5 Aug 2022 13:57:16 -   1.84
+++ dev/pci/if_ixl.c19 Jan 2023 09:29:17 -
@@ -2784,8 +2784,10 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
+   struct ether_h

refactor mbuf parsing on driver level

2023-01-17 Thread Jan Klemkow
Hi,

we have several drivers which have to parse the content of mbufs.  This
diff suggest a central parsing function for this.  Thus, we can reduce
redundant code.

I just start with ix(4) and ixl(4) because it was easy to test for me.
But, this could also improve em(4), igc(4), ale(4) and oce(4).

I'm not sure about the name, the api nor the place of this code.  So, if
someone has a better idea: i'm open to anything.

bye,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 17 Jan 2023 16:31:19 -
@@ -2477,23 +2477,18 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh = NULL;
+   struct ip *ip = NULL;
+   struct ip6_hdr *ip6 = NULL;
int offload = 0;
uint32_t iphlen;
uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   if_parse(mp, , , , NULL, NULL);
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
+   if (ip) {
iphlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
 
@@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
+   } else if (ip6) {
iphlen = sizeof(*ip6);
ipproto = ip6->ip6_nxt;
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.84
diff -u -p -r1.84 if_ixl.c
--- dev/pci/if_ixl.c5 Aug 2022 13:57:16 -   1.84
+++ dev/pci/if_ixl.c16 Jan 2023 23:58:05 -
@@ -2784,12 +2784,15 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh = NULL;
+   struct ip *ip = NULL;
+   struct ip6_hdr *ip6 = NULL;
+   struct tcphdr *th = NULL;
uint64_t hlen;
uint8_t ipproto;
uint64_t offload = 0;
 
+
if (ISSET(m0->m_flags, M_VLANTAG)) {
uint64_t vtag = m0->m_pkthdr.ether_vtag;
offload |= IXL_TX_DESC_CMD_IL2TAG1;
@@ -2800,39 +2803,23 @@ ixl_tx_setup_offload(struct mbuf *m0)
M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
return (offload);
 
-   switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   if_parse(m0, , , , , NULL);
 
+   if (ip) {
offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ?
IXL_TX_DESC_CMD_IIPT_IPV4_CSUM :
IXL_TX_DESC_CMD_IIPT_IPV4;
  
hlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
- 
+   } else if (ip6) {
offload |= IXL_TX_DESC_CMD_IIPT_IPV6;
 
hlen = sizeof(*ip6);
ipproto = ip6->ip6_nxt;
-   break;
-   }
 #endif
-   default:
+   } else {
panic("CSUM_OUT set for non-IP packet");
/* NOTREACHED */
}
@@ -2842,15 +2829,12 @@ ixl_tx_setup_offload(struct mbuf *m0)
 
switch (ipproto) {
case IPPROTO_TCP: {
-   struct tcphdr *th;
-
if (!ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT))
  

Re: mem.4: be more accurate about securelevel

2023-01-17 Thread Jan Klemkow
On Tue, Jan 17, 2023 at 11:02:07PM +0100, Theo Buehler wrote:
> > at least this tool works for me:
> 
> Surely you have kern.allowkmem=1 set.

Yes, I do.



mem.4: be more accurate about securelevel

2023-01-17 Thread Jan Klemkow
Hi,

This diff adjust the manpage of mem(4) to be more accurate.  You can
open(2) mem(4) in securelevel 1 in readonly mode, but not writable.

kern/spec_vnops.c:

if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) {
...
/*
 * When running in secure mode, do not allow opens
 * for writing of /dev/mem, /dev/kmem, or character
 * devices whose corresponding block devices are
 * currently mounted.
 */
if (securelevel >= 1) {
...
if (iskmemdev(dev))
return (EPERM);
}
}

OK?

bye,
Jan

Index: man4.alpha/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.alpha/mem.4,v
retrieving revision 1.6
diff -u -p -r1.6 mem.4
--- man4.alpha/mem.412 Jan 2018 04:36:44 -  1.6
+++ man4.alpha/mem.417 Jan 2023 18:51:10 -
@@ -62,7 +62,7 @@ kernel virtual memory begins at
 .Li 0xfc23 .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.amd64/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.amd64/mem.4,v
retrieving revision 1.6
diff -u -p -r1.6 mem.4
--- man4.amd64/mem.412 Jan 2018 04:36:44 -  1.6
+++ man4.amd64/mem.417 Jan 2023 18:48:23 -
@@ -63,7 +63,7 @@ The kernel virtual memory begins at addr
 .Li 0x8000 .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.hppa/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.hppa/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.hppa/mem.4 12 Jan 2018 04:36:44 -  1.4
+++ man4.hppa/mem.4 17 Jan 2023 18:52:28 -
@@ -51,7 +51,7 @@ On hppa, the physical memory range is al
 address 0; kernel virtual memory begins at address 0 as well.
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.i386/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.i386/mem.4,v
retrieving revision 1.12
diff -u -p -r1.12 mem.4
--- man4.i386/mem.4 12 Jan 2018 04:36:44 -  1.12
+++ man4.i386/mem.4 17 Jan 2023 18:53:00 -
@@ -63,7 +63,7 @@ long, and ends at virtual address
 .Li 0xfe00 .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.landisk/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.landisk/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.landisk/mem.4  12 Jan 2018 04:36:44 -  1.4
+++ man4.landisk/mem.4  17 Jan 2023 18:53:54 -
@@ -58,7 +58,7 @@ The kernel virtual memory begins at addr
 .Li 0xc000 .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.loongson/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.loongson/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.loongson/mem.4 12 Jan 2018 04:36:44 -  1.4
+++ man4.loongson/mem.4 17 Jan 2023 18:54:33 -
@@ -88,7 +88,7 @@ The kernel virtual memory begins at addr
 .Ad 0xc000 .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.luna88k/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.luna88k/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.luna88k/mem.4  12 Jan 2018 04:36:44 -  1.4
+++ man4.luna88k/mem.4  17 Jan 2023 18:54:47 -
@@ -62,7 +62,7 @@ kernel virtual memory begins at
 .Ad 0x .
 .Pp
 Even with sufficient file system permissions,
-these devices can only be opened when the
+these devices can only be opened writable when the
 .Xr securelevel 7
 is insecure or when the
 .Va kern.allowkmem
Index: man4.macppc/mem.4

Re: mem.4: be more accurate about securelevel

2023-01-17 Thread Jan Klemkow
On Tue, Jan 17, 2023 at 04:23:48PM -0500, Bryan Steele wrote:
> On Tue, Jan 17, 2023 at 09:37:24PM +0100, Jan Klemkow wrote:
> > Hi,
> > 
> > This diff adjust the manpage of mem(4) to be more accurate.  You can
> > open(2) mem(4) in securelevel 1 in readonly mode, but not writable.
> > 
> > kern/spec_vnops.c:
> > 
> > if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) {
> > ...
> > /*
> >  * When running in secure mode, do not allow opens
> >  * for writing of /dev/mem, /dev/kmem, or character
> >  * devices whose corresponding block devices are
> >  * currently mounted.
> >  */
> > if (securelevel >= 1) {
> > ...
> > if (iskmemdev(dev))
> > return (EPERM);
> > }
> > }
> > 
> > OK?
> > 
> > bye,
> > Jan
> 
> Are you sure about that? Have you tested it?
> 
> https://github.com/openbsd/src/commit/19aedf236181e81baf170421900911c82671fae4

at least this tool works for me:

#include 
#include 
#include 
#include 
#include 
#include 

#include 

int
main(void)
{
kvm_t *kd;
int mem;
struct nlist nl[] = {
{"_ix_debug_ioctl"},
{NULL}
};

char errbuf[_POSIX2_LINE_MAX];

if ((kd = kvm_open(_PATH_KSYMS, NULL, NULL, O_RDWR, errbuf)) == NULL)
errx(EXIT_FAILURE, "%s", errbuf);

if (kvm_nlist(kd, nl) == -1)
errx(EXIT_SUCCESS, "%s", kvm_geterr(kd));

if (kvm_read(kd, nl[0].n_value, , sizeof mem) != sizeof(mem))
errx(EXIT_SUCCESS, "%s", kvm_geterr(kd));

printf("mem: %d\n", mem);

mem = 1;

if (kvm_write(kd, nl[0].n_value, , sizeof mem) != sizeof(mem))
errx(EXIT_SUCCESS, "%s", kvm_geterr(kd));

if (kvm_close(kd) == -1)
err(EXIT_FAILURE, "kvm_close");

return EXIT_SUCCESS;
}



Re: refactor mbuf parsing on driver level

2023-01-19 Thread Jan Klemkow
On Thu, Jan 19, 2023 at 02:55:29PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Jan 19, 2023 at 10:40:52AM +0100, Jan Klemkow wrote:
> > On Thu, Jan 19, 2023 at 12:02:29PM +0300, Vitaliy Makkoveev wrote:
> > > On Thu, Jan 19, 2023 at 01:55:57AM +0300, Vitaliy Makkoveev wrote:
> > > > > On 19 Jan 2023, at 01:39, Jan Klemkow  wrote:
> > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote:
> > > > >> On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote:
> > > > >>> we have several drivers which have to parse the content of mbufs.  
> > > > >>> This
> > > > >>> diff suggest a central parsing function for this.  Thus, we can 
> > > > >>> reduce
> > > > >>> redundant code.
> > > > >>> 
> > > > >>> I just start with ix(4) and ixl(4) because it was easy to test for 
> > > > >>> me.
> > > > >>> But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > > > >>> 
> > > > >>> I'm not sure about the name, the api nor the place of this code.  
> > > > >>> So, if
> > > > >>> someone has a better idea: i'm open to anything.
> > > > >> 
> > > > >> I like code this deduplication.
> > > > >> 
> > > > >> This newly introduced function doesn't touch ifnet but only extracts
> > > > >> protocol headers from mbuf(9). I guess mbuf_extract_headers() or
> > > > >> something like is much better for name with the ern/uipc_mbuf2.c as
> > > > >> place.
> > > > > 
> > > > > Good Point.  Updates diff below.
> > > > > 
> > > > > +
> > > > > +/* Parse different TCP/IP protocol headers for a quick view inside 
> > > > > an mbuf. */
> > > > > +void
> > > > > +m_exract_headers(struct mbuf *mp, struct ether_header **eh, struct 
> > > > > ip **ip4,
> > > > > +struct ip6_hdr **ip6, struct tcphdr **tcp, struct udphdr **udp)
> > > > > +
> > > > 
> > > > Should be m_extract_headers(). The rest of the diff looks good to me.
> > > 
> > > Please wait.
> > > 
> > > The mandatory nullification of `ip4', `ip6' and other variables passed
> > > to m_exract_headers() is not obvious. It is much better to return
> > > the integer result of extraction like m_tag_copy_chain() does.
> > 
> > Yes, the mandatory nullification seems to be more errorprone.  In my
> > opinion is the number of results it not that useful. You have to check
> > the retuned pointers anyway.
> > 
> > I moved the nullification inside of m_exract_headers().
> 
> This is better. I also like the last return statement be removed from
> m_extract_headers() before commit.

Fixed below.  Plus a suggestion from mpi to not pollute the namespace
with all the headers in mbuf.h.  Moved them to uipc_mbuf2.c.

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 19 Jan 2023 09:29:10 -
@@ -2477,23 +2477,18 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh;
+   struct ip *ip;
+   struct ip6_hdr *ip6;
int offload = 0;
uint32_t iphlen;
uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   m_extract_headers(mp, , , , NULL, NULL);
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
+   if (ip) {
iphlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
 
@@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp,

Re: refactor mbuf parsing on driver level

2023-01-24 Thread Jan Klemkow
On Tue, Jan 24, 2023 at 09:32:53PM +1000, David Gwynne wrote:
> On Mon, Jan 23, 2023 at 09:25:34AM +0100, Jan Klemkow wrote:
> > On Wed, Jan 18, 2023 at 03:49:25PM -0700, Theo de Raadt wrote:
> > > Jan Klemkow  wrote:
> > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote:
> > > > > On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote:
> > > > > > we have several drivers which have to parse the content of mbufs.  
> > > > > > This
> > > > > > diff suggest a central parsing function for this.  Thus, we can 
> > > > > > reduce
> > > > > > redundant code.
> > > > > > 
> > > > > > I just start with ix(4) and ixl(4) because it was easy to test for 
> > > > > > me.
> > > > > > But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > > > > > 
> > > > > > I'm not sure about the name, the api nor the place of this code.  
> > > > > > So, if
> > > > > > someone has a better idea: i'm open to anything.
> > > > > 
> > > > > I like code this deduplication.
> > > > > 
> > > > > This newly introduced function doesn't touch ifnet but only extracts
> > > > > protocol headers from mbuf(9). I guess mbuf_extract_headers() or
> > > > > something like is much better for name with the ern/uipc_mbuf2.c as
> > > > > place.
> > > > 
> > > > Good Point.  Updates diff below.
> > > 
> > > I agree, "extract" is a better name.  dlg, do you have a comment?
> > 
> > Whats you opinion about this diff?
> 
> it makes ix and ixl prettier, so that's a good enough reason to do
> it. it should go in net/if_ethersubr.c as ether_extract_headers()
> though.
> 
> could you try using a struct to carry the header pointers around and see
> what that looks like?
> 
> struct ether_extracted {
>   struct ether_header *eh;
>   struct ip   *ip4;
>   struct ip6_hdr  *ip6;
>   struct tcphdr   *tcp;
>   struct udphdr   *udp;
> };
> 
> void ether_extract_headers(struct mbuf *, struct ether_extracted *);
> 
> you can add a depth or flags argument if you want to be able to
> tell it to return before looking for the tcp/udp headers if you
> want.

OK?

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 24 Jan 2023 13:34:17 -
@@ -2477,25 +2477,16 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
int offload = 0;
uint32_t iphlen;
-   uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   ether_extract_headers(mp, );
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
+   *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   if (ext.ip4) {
+   iphlen = ext.ip4->ip_hl << 2;
 
if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
@@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = sizeof(*ip6);
-   ipproto = ip6->ip6_nxt;
+   } else if (ext.ip6) {
+   iphlen = sizeof(*ext.ip6);
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
*vlan_macip_lens |= iphlen;

ifconfig(8): fix output of missing ipv6 addresses

2023-01-23 Thread Jan Klemkow
Hi,

ifconfig doesn't print ipv6 addresses if its used with media option.

# ifconfig -A
vio0: flags=8843 mtu 1500
...
inet 10.0.1.65 netmask 0xff00 broadcast 10.0.1.255
inet6 fe80::5054:ff:fe6a:b6fd%vio0 prefixlen 64 scopeid 0x1
inet6 fc00:1::1 prefixlen 64
inet 192.168.0.1 netmask 0xff00 broadcast 192.168.0.255

# ifconfig -A media
vio0: flags=8843 mtu 1500
...
supported media:
media autoselect
inet 10.0.1.65 netmask 0xff00 broadcast 10.0.1.255
inet 192.168.0.1 netmask 0xff00 broadcast 192.168.0.255

As the diff below shows, afp is NULL by default, but set to inet if
there is an additional program parameter.  At the end, no specific
address family is assumed if afp is NULL.  Thus, the diff below
introduces a new variable to remember if a specific address family was
set by the user or not for printing all interface addresses.

The regression test of ifconfig(8) is passing with the diff below.

ok?

bye,
Jan

Index: ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.461
diff -u -p -r1.461 ifconfig.c
--- ifconfig.c  18 Jan 2023 21:57:10 -  1.461
+++ ifconfig.c  23 Jan 2023 12:11:30 -
@@ -746,6 +746,7 @@ const struct afswtch {
 };
 
 const struct afswtch *afp; /*the address family being set or asked about*/
+const struct afswtch *pafp;/*the address family being used for printing*/
 
 char joinname[IEEE80211_NWID_LEN];
 size_t joinlen;
@@ -840,7 +841,7 @@ main(int argc, char *argv[])
if (argc > 0) {
for (afp = rafp = afs; rafp->af_name; rafp++)
if (strcmp(rafp->af_name, *argv) == 0) {
-   afp = rafp;
+   pafp = afp = rafp;
argc--;
argv++;
break;
@@ -1216,7 +1217,7 @@ printif(char *name, int ifaliases)
(ifa->ifa_addr->sa_family == AF_INET &&
ifaliases == 0 && noinet == 0))
continue;
-   if ((p = afp) != NULL) {
+   if ((p = pafp) != NULL) {
if (ifa->ifa_addr->sa_family == p->af_af)
p->af_status(1);
} else {
@@ -3514,7 +3515,7 @@ status(int link, struct sockaddr_dl *sdl
 
  proto_status:
if (link == 0) {
-   if ((p = afp) != NULL) {
+   if ((p = pafp) != NULL) {
p->af_status(1);
} else for (p = afs; p->af_name; p++) {
ifr.ifr_addr.sa_family = p->af_af;



Re: refactor mbuf parsing on driver level

2023-01-24 Thread Jan Klemkow
On Tue, Jan 24, 2023 at 05:40:55PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jan 24, 2023 at 03:14:36PM +0100, Jan Klemkow wrote:
> > On Tue, Jan 24, 2023 at 09:32:53PM +1000, David Gwynne wrote:
> > > On Mon, Jan 23, 2023 at 09:25:34AM +0100, Jan Klemkow wrote:
> > > > On Wed, Jan 18, 2023 at 03:49:25PM -0700, Theo de Raadt wrote:
> > > > > Jan Klemkow  wrote:
> > > > > > On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote:
> > > > > > > On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote:
> > > > > > > > we have several drivers which have to parse the content of 
> > > > > > > > mbufs.  This
> > > > > > > > diff suggest a central parsing function for this.  Thus, we can 
> > > > > > > > reduce
> > > > > > > > redundant code.
> > > > > > > > 
> > > > > > > > I just start with ix(4) and ixl(4) because it was easy to test 
> > > > > > > > for me.
> > > > > > > > But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > > > > > > > 
> > > > > > > > I'm not sure about the name, the api nor the place of this 
> > > > > > > > code.  So, if
> > > > > > > > someone has a better idea: i'm open to anything.
> > > > > > > 
> > > > > > > I like code this deduplication.
> > > > > > > 
> > > > > > > This newly introduced function doesn't touch ifnet but only 
> > > > > > > extracts
> > > > > > > protocol headers from mbuf(9). I guess mbuf_extract_headers() or
> > > > > > > something like is much better for name with the ern/uipc_mbuf2.c 
> > > > > > > as
> > > > > > > place.
> > > > > > 
> > > > > > Good Point.  Updates diff below.
> > > > > 
> > > > > I agree, "extract" is a better name.  dlg, do you have a comment?
> > > > 
> > > > Whats you opinion about this diff?
> > > 
> > > it makes ix and ixl prettier, so that's a good enough reason to do
> > > it. it should go in net/if_ethersubr.c as ether_extract_headers()
> > > though.
> > > 
> > > could you try using a struct to carry the header pointers around and see
> > > what that looks like?
> > > 
> > > struct ether_extracted {
> > >   struct ether_header *eh;
> > >   struct ip   *ip4;
> > >   struct ip6_hdr  *ip6;
> > >   struct tcphdr   *tcp;
> > >   struct udphdr   *udp;
> > > };
> > > 
> > > void ether_extract_headers(struct mbuf *, struct ether_extracted *);
> > > 
> > > you can add a depth or flags argument if you want to be able to
> > > tell it to return before looking for the tcp/udp headers if you
> > > want.
> 
> Looks better then m_extract_headers(). Since ext->eh is always assigned
> to non NULL value below, the "ext->eh = NULL;" is not necessary. Also
> I'm not sure, but is memset() more reliable for `ext' zeroing? Anyway,
> feel free to commit without memset().

OK?

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 24 Jan 2023 13:34:17 -
@@ -2477,25 +2477,16 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
int offload = 0;
uint32_t iphlen;
-   uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   ether_extract_headers(mp, );
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
+   *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   if (ext.ip4) {
+   ip

Re: refactor mbuf parsing on driver level

2023-01-26 Thread Jan Klemkow
On Thu, Jan 26, 2023 at 02:06:28PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Jan 26, 2023 at 11:37:51AM +0100, Christian Weisgerber wrote:
> > Jan Klemkow:
> > 
> > > we have several drivers which have to parse the content of mbufs.  This
> > > diff suggest a central parsing function for this.  Thus, we can reduce
> > > redundant code.
> > > 
> > > I just start with ix(4) and ixl(4) because it was easy to test for me.
> > > But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > 
> > Here's the corresponding change for em(4).
> > This only affects 82575, 82576, i350, and i210.
> > Tested on i210.
> > 
> > ok?
> > 
> 
> The ether_extract_headers() diff was reverted, because is wrong for the
> cases other than tcp/udp/icmp. We need to fix it and recommit again
> before continue.

I'm already on the way, to fix this mess.  I'll send a new diff soon.

Sorry this inconvenience,
Jan



Re: refactor mbuf parsing on driver level

2023-01-30 Thread Jan Klemkow
On Fri, Jan 27, 2023 at 04:44:36PM +0100, Christian Weisgerber wrote:
> > The ether_extract_headers() diff was reverted, because is wrong for the
> > cases other than tcp/udp/icmp. We need to fix it and recommit again
> > before continue.
> 
> I think (TCP or) UDP fragments are the problem.  Fragments don't have
> the protocol header but will still end up here:
> 
> case IPPROTO_UDP:
> m = m_getptr(m, hoff + hlen, );
> KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ext->udp));
> ext->udp = (struct udphdr *)(mtod(m, caddr_t) + hoff);
> break;
> 
> If a tail fragment is too short, it will trigger the KASSERT().
> 
> Previously, this wasn't a problem, because if there was such a
> KASSERT() as in ixl(4), it was behind a M_*_CSUM_OUT check, and we
> never set those flags for fragments.

I changed the diff below to be more robust and reconstruct my test
equipment to build permanently over NFS.

 - I turned the KASSERTS to returns.
 - Check if the mbuf is large enough for an ether header.
 - additionally #ifdef'd INET6 around the ip6_hdr in the new struct

Tested the diff on NFS client and server with several kernel builds.

ok?

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.191
diff -u -p -r1.191 if_ix.c
--- dev/pci/if_ix.c 26 Jan 2023 07:32:39 -  1.191
+++ dev/pci/if_ix.c 27 Jan 2023 13:37:13 -
@@ -2477,25 +2477,16 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
int offload = 0;
uint32_t iphlen;
-   uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   ether_extract_headers(mp, );
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
+   *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   if (ext.ip4) {
+   iphlen = ext.ip4->ip_hl << 2;
 
if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
@@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = sizeof(*ip6);
-   ipproto = ip6->ip6_nxt;
+   } else if (ext.ip6) {
+   iphlen = sizeof(*ext.ip6);
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
*vlan_macip_lens |= iphlen;
 
-   switch (ipproto) {
-   case IPPROTO_TCP:
+   if (ext.tcp) {
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
offload = 1;
}
-   break;
-   case IPPROTO_UDP:
+   } else if (ext.udp) {
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP;
if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
offload = 1;
}
-   break;
}
 
return offload;
Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.86
diff -u -p -r1.86 if_ixl.c
--- dev/pci/if_ixl.c26 Jan 2023 07:32:39 -  1.86
+++ dev/pci/if_ixl.c27 Jan 2023 13:37:13 -
@@ -2784,10 +2784,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
uint64_t hlen;
-   uint8_t ipproto;
uint64_t offload = 0;
 
if (ISSET(m0->m_flags, M_VLANTAG)) {
@@ -2800,39 +2798,21 @@ ixl_tx_setup_offload(struct mbuf *m0)
M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))

Re: mem.4: be more accurate about securelevel

2023-01-18 Thread Jan Klemkow
On Tue, Jan 17, 2023 at 11:02:07PM +0100, Theo Buehler wrote:
> > at least this tool works for me:
> 
> Surely you have kern.allowkmem=1 set.

This diff should phrase it correctly.

ok?

Thanks,
Jan

Index: man4.alpha/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.alpha/mem.4,v
retrieving revision 1.6
diff -u -p -r1.6 mem.4
--- man4.alpha/mem.412 Jan 2018 04:36:44 -  1.6
+++ man4.alpha/mem.418 Jan 2023 19:25:27 -
@@ -63,11 +63,12 @@ kernel virtual memory begins at
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width /dev/kmem -compact
 .It /dev/mem
Index: man4.amd64/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.amd64/mem.4,v
retrieving revision 1.6
diff -u -p -r1.6 mem.4
--- man4.amd64/mem.412 Jan 2018 04:36:44 -  1.6
+++ man4.amd64/mem.418 Jan 2023 19:26:59 -
@@ -64,11 +64,12 @@ The kernel virtual memory begins at addr
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width Pa -compact
 .It Pa /dev/mem
Index: man4.hppa/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.hppa/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.hppa/mem.4 12 Jan 2018 04:36:44 -  1.4
+++ man4.hppa/mem.4 18 Jan 2023 19:29:07 -
@@ -52,11 +52,12 @@ address 0; kernel virtual memory begins 
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width /dev/kmem -compact
 .It Pa /dev/mem
Index: man4.i386/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.i386/mem.4,v
retrieving revision 1.12
diff -u -p -r1.12 mem.4
--- man4.i386/mem.4 12 Jan 2018 04:36:44 -  1.12
+++ man4.i386/mem.4 18 Jan 2023 19:30:18 -
@@ -64,11 +64,12 @@ long, and ends at virtual address
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width Pa -compact
 .It Pa /dev/mem
Index: man4.landisk/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.landisk/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.landisk/mem.4  12 Jan 2018 04:36:44 -  1.4
+++ man4.landisk/mem.4  18 Jan 2023 19:31:28 -
@@ -59,11 +59,12 @@ The kernel virtual memory begins at addr
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width Pa -compact
 .It Pa /dev/mem
Index: man4.loongson/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.loongson/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.loongson/mem.4 12 Jan 2018 04:36:44 -  1.4
+++ man4.loongson/mem.4 18 Jan 2023 19:32:44 -
@@ -89,11 +89,12 @@ The kernel virtual memory begins at addr
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl -tag -width Pa -compact
 .It Pa /dev/mem
Index: man4.luna88k/mem.4
===
RCS file: /cvs/src/share/man/man4/man4.luna88k/mem.4,v
retrieving revision 1.4
diff -u -p -r1.4 mem.4
--- man4.luna88k/mem.4  12 Jan 2018 04:36:44 -  1.4
+++ man4.luna88k/mem.4  18 Jan 2023 19:33:50 -
@@ -63,11 +63,12 @@ kernel virtual memory begins at
 .Pp
 Even with sufficient file system permissions,
 these devices can only be opened when the
-.Xr securelevel 7
-is insecure or when the
 .Va kern.allowkmem
 .Xr sysctl 2
 variable is set.
+Also the
+.Xr securelevel 7
+insecure is needed, to open the device writable.
 .Sh FILES
 .Bl 

Re: refactor mbuf parsing on driver level

2023-01-18 Thread Jan Klemkow
On Wed, Jan 18, 2023 at 10:50:25AM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jan 17, 2023 at 11:09:17PM +0100, Jan Klemkow wrote:
> > we have several drivers which have to parse the content of mbufs.  This
> > diff suggest a central parsing function for this.  Thus, we can reduce
> > redundant code.
> > 
> > I just start with ix(4) and ixl(4) because it was easy to test for me.
> > But, this could also improve em(4), igc(4), ale(4) and oce(4).
> > 
> > I'm not sure about the name, the api nor the place of this code.  So, if
> > someone has a better idea: i'm open to anything.
> 
> I like code this deduplication.
> 
> This newly introduced function doesn't touch ifnet but only extracts
> protocol headers from mbuf(9). I guess mbuf_extract_headers() or
> something like is much better for name with the ern/uipc_mbuf2.c as
> place.

Good Point.  Updates diff below.

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.189
diff -u -p -r1.189 if_ix.c
--- dev/pci/if_ix.c 2 Sep 2022 14:08:09 -   1.189
+++ dev/pci/if_ix.c 18 Jan 2023 21:06:58 -
@@ -2477,23 +2477,18 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh = NULL;
+   struct ip *ip = NULL;
+   struct ip6_hdr *ip6 = NULL;
int offload = 0;
uint32_t iphlen;
uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   m_exract_headers(mp, , , , NULL, NULL);
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
+   if (ip) {
iphlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
 
@@ -2503,26 +2498,14 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
+   } else if (ip6) {
iphlen = sizeof(*ip6);
ipproto = ip6->ip6_nxt;
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.84
diff -u -p -r1.84 if_ixl.c
--- dev/pci/if_ixl.c5 Aug 2022 13:57:16 -   1.84
+++ dev/pci/if_ixl.c18 Jan 2023 20:47:01 -
@@ -2784,12 +2784,15 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
+   struct ether_header *eh = NULL;
+   struct ip *ip = NULL;
+   struct ip6_hdr *ip6 = NULL;
+   struct tcphdr *th = NULL;
uint64_t hlen;
uint8_t ipproto;
uint64_t offload = 0;
 
+
if (ISSET(m0->m_flags, M_VLANTAG)) {
uint64_t vtag = m0->m_pkthdr.ether_vtag;
offload |= IXL_TX_DESC_CMD_IL2TAG1;
@@ -2800,39 +2803,23 @@ ixl_tx_setup_offload(struct mbuf *m0)
M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
return (offload);
 
-   switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   m_exract_headers(m0, , , , , NULL);
 
+   if (ip) {
offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ?
IXL_TX_DESC_CMD_IIPT_IPV4_CSUM :
IXL_TX_DESC_CMD_IIPT_IPV4;
  
hlen = ip->ip_hl << 2;
ipproto = ip->ip_p;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && 

libcrypto: Fix EINVAL in openssl/tls_init

2023-03-24 Thread Jan Klemkow
Hi,

after tls_init() and OPENSSL_init_ssl() errno is always set to EINVAL.
This is caused by a routine that tries to prefetch all error strings
up to 127 from strerror(3).  But, strerror(3) sets EINVAL for unknown
values of error.

Thus, I would suggest to set this constant to ELAST.  So, we will avoid
useless unknown error strings and a non-zero errno after tls_init().

I guess this is not serious enough for the current release.  But, we
might fix this after unlocking of the tree?

ok?

bye,
Jan

Index: lib/libcrypto//err/err.c
===
RCS file: /cvs/src/lib/libcrypto/err/err.c,v
retrieving revision 1.50
diff -u -p -r1.50 err.c
--- lib/libcrypto//err/err.c26 Dec 2022 07:18:52 -  1.50
+++ lib/libcrypto//err/err.c24 Mar 2023 20:07:18 -
@@ -560,7 +560,7 @@ int_err_get_next_lib(void)
 
 
 #ifndef OPENSSL_NO_ERR
-#define NUM_SYS_STR_REASONS 127
+#define NUM_SYS_STR_REASONS ELAST
 #define LEN_SYS_STR_REASON 32
 
 static ERR_STRING_DATA SYS_str_reasons[NUM_SYS_STR_REASONS + 1];



Re: em(4) multiqueue

2023-04-25 Thread Jan Klemkow
On Fri, Apr 14, 2023 at 10:26:14AM +0800, Kevin Lo wrote:
> On Thu, Apr 13, 2023 at 01:30:36PM -0500, Brian Conway wrote:
> > Reviving this thread, apologies for discontinuity in mail readers: 
> > https://marc.info/?t=16564219358
> > 
> > After rebasing on 7.3, my results have mirrored Hrvoje's testing at
> > the end of that thread. No issues with throughput, unusual latency,
> > or reliability. `vmstat -i` shows some level of balancing between
> > the queues. I've been testing on as many em(4) systems as I have
> > access to, some manually, some in a packet forwarder/firewall
> > scenarios:
> 
> Last time I tested (about a year go) on I211, rx locked up if I tried 
> something
> like iperf3 or tcpbench.  Don't know if you have a similar problem.

I rebased the rest to current and tested it with tcpbench between the
following interfaces:

em0 at pci7 dev 0 function 0 "Intel 82580" rev 0x01, msix, 4 queues, address 
90:e2:ba:df:d5:2c
em0 at pci5 dev 0 function 0 "Intel I350" rev 0x01, msix, 8 queues, address 
00:25:90:eb:b3:c2

After a second the connection stucked.  As far as I can see, the
sending side got a problem.

ot45# tcpbench 192.168.99.3
  elapsed_ms  bytes mbps   bwidth
1012   14574120  115.210  100.00%
Conn:   1 Mbps:  115.210 Peak Mbps:  115.210 Avg Mbps:  115.210
2022  00.000-nan%
...

ot46# tcpbench -s
  elapsed_ms  bytes mbps   bwidth
1017   14313480  112.594  100.00%
Conn:   1 Mbps:  112.594 Peak Mbps:  112.594 Avg Mbps:  112.594
2027  00.000-nan%
...

ot45# netstat  -nf inet -p tcp
Active Internet connections
Proto   Recv-Q Send-Q  Local Address  Foreign AddressTCP-State
tcp  0 260640  192.168.99.1.18530 192.168.99.3.12345 CLOSING

When I retried it, it sometimes work and most times not.

kstat tells me, that transmit queues 1 to 3 are oactive and just 0
works:

em0:0:txq:0
 packets: 4042648 packets
   bytes: 5310138322 bytes
  qdrops: 9 packets
  errors: 0 packets
qlen: 0 packets
 maxqlen: 511 packets
 oactive: false
em0:0:txq:1
 packets: 9812 packets
   bytes: 14846716 bytes
  qdrops: 0 packets
  errors: 0 packets
qlen: 184 packets
 maxqlen: 511 packets
 oactive: true
em0:0:txq:2
 packets: 690362 packets
   bytes: 60011484 bytes
  qdrops: 0 packets
  errors: 0 packets
qlen: 185 packets
 maxqlen: 511 packets
 oactive: true
em0:0:txq:3
 packets: 443181 packets
   bytes: 43829886 bytes
  qdrops: 0 packets
  errors: 0 packets
qlen: 198 packets
 maxqlen: 511 packets
 oactive: true

This is the rebased diff on current i tested:

Index: dev/pci/files.pci
===
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.361
diff -u -p -r1.361 files.pci
--- dev/pci/files.pci   23 Apr 2023 00:20:26 -  1.361
+++ dev/pci/files.pci   25 Apr 2023 11:25:47 -
@@ -334,7 +334,7 @@ attach  fxp at pci with fxp_pci
 file   dev/pci/if_fxp_pci.cfxp_pci
 
 # Intel Pro/1000
-device em: ether, ifnet, ifmedia
+device em: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach em at pci
 file   dev/pci/if_em.c em
 file   dev/pci/if_em_hw.c  em
Index: dev/pci/if_em.c
===
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.365
diff -u -p -r1.365 if_em.c
--- dev/pci/if_em.c 9 Feb 2023 21:21:27 -   1.365
+++ dev/pci/if_em.c 25 Apr 2023 11:25:47 -
@@ -247,6 +247,7 @@ int  em_intr(void *);
 int  em_allocate_legacy(struct em_softc *);
 void em_start(struct ifqueue *);
 int  em_ioctl(struct ifnet *, u_long, caddr_t);
+int  em_rxrinfo(struct em_softc *, struct if_rxrinfo *);
 void em_watchdog(struct ifnet *);
 void em_init(void *);
 void em_stop(void *, int);
@@ -309,8 +310,10 @@ int  em_setup_queues_msix(struct em_soft
 int  em_queue_intr_msix(void *);
 int  em_link_intr_msix(void *);
 void em_enable_queue_intr_msix(struct em_queue *);
+void em_setup_rss(struct em_softc *);
 #else
 #define em_allocate_msix(_sc)  (-1)
+#define em_setup_rss(_sc)  0
 #endif
 
 #if NKSTAT > 0
@@ -333,7 +336,6 @@ struct cfdriver em_cd = {
 };
 
 static int em_smart_pwr_down = FALSE;
-int em_enable_msix = 0;
 
 /*
  *  Device identification routine
@@ -629,12 +631,12 @@ err_pci:
 void
 em_start(struct ifqueue *ifq)
 {
+   struct em_queue *que = ifq->ifq_softc;
struct ifnet *ifp = ifq->ifq_if;
struct em_softc *sc = ifp->if_softc;
u_int head, free, used;
struct mbuf *m;
int post = 0;
-   struct 

Re: libcrypto: Fix EINVAL in openssl/tls_init

2023-03-27 Thread Jan Klemkow
On Fri, Mar 24, 2023 at 10:02:05PM +0100, Theo Buehler wrote:
> > Thus, I would suggest to set this constant to ELAST.  So, we will avoid
> > useless unknown error strings and a non-zero errno after tls_init().
> 
> ELAST isn't portable. It's under __BSD_VISIBLE in sys/errno.h.
> 
> It would seem better to use the save_errno idiom to store the errno
> at the start of the loop and restore it at the end.
> 
> And yes, we should fix this, after unluck.

ok?

Thanks,
Jan

Index: err/err.c
===
RCS file: /cvs/src/lib/libcrypto/err/err.c,v
retrieving revision 1.50
diff -u -p -r1.50 err.c
--- err/err.c   26 Dec 2022 07:18:52 -  1.50
+++ err/err.c   27 Mar 2023 07:58:25 -
@@ -580,6 +580,7 @@ build_SYS_str_reasons(void)
static char strerror_tab[NUM_SYS_STR_REASONS][LEN_SYS_STR_REASON];
int i;
static int init = 1;
+   int save_errno = errno;
 
CRYPTO_r_lock(CRYPTO_LOCK_ERR);
if (!init) {
@@ -594,6 +595,8 @@ build_SYS_str_reasons(void)
return;
}
 
+   /* strerror(3) will set errno to EINVAL when i is an unknown error. */
+   save_errno = errno;
for (i = 1; i <= NUM_SYS_STR_REASONS; i++) {
ERR_STRING_DATA *str = _str_reasons[i - 1];
 
@@ -610,6 +613,7 @@ build_SYS_str_reasons(void)
if (str->string == NULL)
str->string = "unknown";
}
+   errno = save_errno;
 
/* Now we still have SYS_str_reasons[NUM_SYS_STR_REASONS] = {0, NULL},
 * as required by ERR_load_strings. */



Re: refactor mbuf parsing on driver level

2023-02-06 Thread Jan Klemkow
On Mon, Feb 06, 2023 at 09:47:57PM +0100, Christian Weisgerber wrote:
> Christian Weisgerber:
> 
> > I also switched over em(4) to this and have successfully used it
> > for a full 30-hour package build on the four amd64 ports machines
> > with their I350 interfaces.  Additionally, I've done some IPv6
> > testing at home over an I210.
> 
> ok for this?

I tested it with I350.  Diff look fine.

ok jan@

> igc(4) has very similar code, but I don't have access to a machine
> with that hardware.

Send me an ssh-key and I give you access to this machine:
http://obsd-lab.genua.de/hw/ot34.html

Thanks,
Jan

> > diff f8646d27d4041e5f595c04e17a876f12600deea7 
> > f3f95d0cc0957a2f1e961cace4c3c9dd869e8c9e
> > commit - f8646d27d4041e5f595c04e17a876f12600deea7
> > commit + f3f95d0cc0957a2f1e961cace4c3c9dd869e8c9e
> > blob - c840377f0a3f1ef3c3e3072657698d8085ffd3a0
> > blob + 523ed5b0a18718c50bb30e2995d293fa1d2199a6
> > --- sys/dev/pci/if_em.c
> > +++ sys/dev/pci/if_em.c
> > @@ -2398,12 +2398,11 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf 
> > *mp,
> >  em_tx_ctx_setup(struct em_queue *que, struct mbuf *mp, u_int head,
> >  u_int32_t *olinfo_status, u_int32_t *cmd_type_len)
> >  {
> > +   struct ether_extracted ext;
> > struct e1000_adv_tx_context_desc *TD;
> > -   struct ether_header *eh = mtod(mp, struct ether_header *);
> > -   struct mbuf *m;
> > uint32_t vlan_macip_lens = 0, type_tucmd_mlhl = 0, mss_l4len_idx = 0;
> > -   int off = 0, hoff;
> > -   uint8_t ipproto, iphlen;
> > +   int off = 0;
> > +   uint8_t iphlen;
> >  
> > *olinfo_status = 0;
> > *cmd_type_len = 0;
> > @@ -2418,44 +2417,26 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf 
> > *mp,
> > }
> >  #endif
> >  
> > -   vlan_macip_lens |= (sizeof(*eh) << E1000_ADVTXD_MACLEN_SHIFT);
> > -   
> > -   switch (ntohs(eh->ether_type)) {
> > -   case ETHERTYPE_IP: {
> > -   struct ip *ip;
> > +   ether_extract_headers(mp, );
> >  
> > -   m = m_getptr(mp, sizeof(*eh), );
> > -   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
> > +   vlan_macip_lens |= (sizeof(*ext.eh) << E1000_ADVTXD_MACLEN_SHIFT);
> >  
> > -   iphlen = ip->ip_hl << 2;
> > -   ipproto = ip->ip_p;
> > +   if (ext.ip4) {
> > +   iphlen = ext.ip4->ip_hl << 2;
> >  
> > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
> > if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
> > *olinfo_status |= E1000_TXD_POPTS_IXSM << 8;
> > off = 1;
> > }
> > -
> > -   break;
> > -   }
> >  #ifdef INET6
> > -   case ETHERTYPE_IPV6: {
> > -   struct ip6_hdr *ip6;
> > +   } else if (ext.ip6) {
> > +   iphlen = sizeof(*ext.ip6);
> >  
> > -   m = m_getptr(mp, sizeof(*eh), );
> > -   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
> > -
> > -   iphlen = sizeof(*ip6);
> > -   ipproto = ip6->ip6_nxt;
> > -
> > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
> > -   break;
> > -   }
> >  #endif
> > -   default:
> > +   } else {
> > iphlen = 0;
> > -   ipproto = 0;
> > -   break;
> > }
> >  
> > *cmd_type_len |= E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_IFCS;
> > @@ -2464,21 +2445,18 @@ em_tx_ctx_setup(struct em_queue *que, struct mbuf 
> > *mp,
> > vlan_macip_lens |= iphlen;
> > type_tucmd_mlhl |= E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;
> >  
> > -   switch (ipproto) {
> > -   case IPPROTO_TCP:
> > +   if (ext.tcp) {
> > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
> > if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
> > *olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
> > off = 1;
> > }
> > -   break;
> > -   case IPPROTO_UDP:
> > +   } else if (ext.udp) {
> > type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP;
> > if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
> > *olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
> > off = 1;
> > }
> > -   break;
> > }
> >  
> > if (!off)
> > 
> 
> -- 
> Christian "naddy" Weisgerber  na...@mips.inka.de
> 



Re: refactor mbuf parsing on driver level

2023-01-31 Thread Jan Klemkow
On Tue, Jan 31, 2023 at 09:12:51PM +0100, Christian Weisgerber wrote:
> Jan Klemkow:
> 
> >  - I turned the KASSERTS to returns.
> >  - Check if the mbuf is large enough for an ether header.
> >  - additionally #ifdef'd INET6 around the ip6_hdr in the new struct
> 
> For non-initial fragments of TCP/UDP packets, ether_extract_headers()
> will create ext.tcp/ext.udp pointers that do not point to a protocol
> header.  Should there be a check to exclude fragments?

yes.  bluhm also suggested this solution to me.

ok?

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.191
diff -u -p -r1.191 if_ix.c
--- dev/pci/if_ix.c 26 Jan 2023 07:32:39 -  1.191
+++ dev/pci/if_ix.c 31 Jan 2023 21:05:40 -
@@ -2477,25 +2477,16 @@ static inline int
 ixgbe_csum_offload(struct mbuf *mp, uint32_t *vlan_macip_lens,
 uint32_t *type_tucmd_mlhl, uint32_t *olinfo_status)
 {
-   struct ether_header *eh = mtod(mp, struct ether_header *);
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
int offload = 0;
uint32_t iphlen;
-   uint8_t ipproto;
 
-   *vlan_macip_lens |= (sizeof(*eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
+   ether_extract_headers(mp, );
 
-   switch (ntohs(eh->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
+   *vlan_macip_lens |= (sizeof(*ext.eh) << IXGBE_ADVTXD_MACLEN_SHIFT);
 
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = ip->ip_hl << 2;
-   ipproto = ip->ip_p;
+   if (ext.ip4) {
+   iphlen = ext.ip4->ip_hl << 2;
 
if (ISSET(mp->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
@@ -2503,46 +2494,30 @@ ixgbe_csum_offload(struct mbuf *mp, uint
}
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
-   break;
-   }
-
 #ifdef INET6
-   case ETHERTYPE_IPV6: {
-   struct ip6_hdr *ip6;
-
-   m = m_getptr(mp, sizeof(*eh), );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip6));
-   ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
-
-   iphlen = sizeof(*ip6);
-   ipproto = ip6->ip6_nxt;
+   } else if (ext.ip6) {
+   iphlen = sizeof(*ext.ip6);
 
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
-   break;
-   }
 #endif
-
-   default:
+   } else {
return offload;
}
 
*vlan_macip_lens |= iphlen;
 
-   switch (ipproto) {
-   case IPPROTO_TCP:
+   if (ext.tcp) {
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
if (ISSET(mp->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
offload = 1;
}
-   break;
-   case IPPROTO_UDP:
+   } else if (ext.udp) {
*type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP;
if (ISSET(mp->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) {
*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
offload = 1;
}
-   break;
}
 
return offload;
Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.86
diff -u -p -r1.86 if_ixl.c
--- dev/pci/if_ixl.c26 Jan 2023 07:32:39 -  1.86
+++ dev/pci/if_ixl.c31 Jan 2023 21:05:40 -
@@ -2784,10 +2784,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 static uint64_t
 ixl_tx_setup_offload(struct mbuf *m0)
 {
-   struct mbuf *m;
-   int hoff;
+   struct ether_extracted ext;
uint64_t hlen;
-   uint8_t ipproto;
uint64_t offload = 0;
 
if (ISSET(m0->m_flags, M_VLANTAG)) {
@@ -2800,39 +2798,21 @@ ixl_tx_setup_offload(struct mbuf *m0)
M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
return (offload);
 
-   switch (ntohs(mtod(m0, struct ether_header *)->ether_type)) {
-   case ETHERTYPE_IP: {
-   struct ip *ip;
-
-   m = m_getptr(m0, ETHER_HDR_LEN, );
-   KASSERT(m != NULL && m->m_len - hoff >= sizeof(*ip));
-   ip = (struct ip *)(mtod(m, caddr_t) + hoff);
+   ether_extract_headers(m0, );
 
+   if (ext.ip4) {
offload |= ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT) ?
IXL_TX_DESC_CMD_IIPT_IPV4_

ix(4): allocate less memory for tx buffers

2023-06-09 Thread Jan Klemkow
Hi,

TSO packets are limited to MAXMCLBYTES (64k).  Thus, we don't need to
allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers.

This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports
into one machine.  Otherwise I run out of devbuf in malloc(9).

ok?

bye,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.197
diff -u -p -r1.197 if_ix.c
--- dev/pci/if_ix.c 1 Jun 2023 09:05:33 -   1.197
+++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 -
@@ -37,6 +37,12 @@
 #include 
 #include 
 
+/*
+ * Our TCP/IP Stack could not handle packets greater then MAXMCLBYTES.
+ * This interface could not handle packets greater then IXGBE_TSO_SIZE.
+ */
+CTASSERT(MAXMCLBYTES < IXGBE_TSO_SIZE);
+
 /*
  *  Driver version
  */
@@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t
/* Create the descriptor buffer dma maps */
for (i = 0; i < sc->num_tx_desc; i++) {
txbuf = >tx_buffers[i];
-   error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE,
+   error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES,
sc->num_segs, PAGE_SIZE, 0,
BUS_DMA_NOWAIT, >map);
 



Re: tcp lro tso path mtu

2023-07-06 Thread Jan Klemkow
On Thu, Jul 06, 2023 at 10:19:21PM +0300, Alexander Bluhm wrote:
> On Thu, Jul 06, 2023 at 08:49:03PM +0200, Jan Klemkow wrote:
> > > @@ -109,6 +109,9 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > 
> > I think is a merge bug, isn't it?
> > 
> > > +#include 
> > > +#include 
> > > +#include 
> 
> Right.
> 
> > > + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
> > > + if (error || *mp == NULL)
> > > + return error;
> > > +
> > > + if ((*mp)->m_pkthdr.len <= mtu) {
> > 
> > I may miss something but...
> > 
> > Couldn't you move the *_cksum_out() calls above the upper
> > tcp_if_output_tso() call?  And than remove the *_cksum_out() calls
> > inside of tcp_if_output_tso()?
> > 
> > Thus, there is just one place where we call them.
> > 
> > > + switch (dst->sa_family) {
> > > + case AF_INET:
> > > + in_hdr_cksum_out(*mp, ifp);
> > > + in_proto_cksum_out(*mp, ifp);
> > > + break;
> > > +#ifdef INET6
> > > + case AF_INET6:
> > > + in6_proto_cksum_out(*mp, ifp);
> > > + break;
> > > +#endif
> 
> There is the case in tcp_if_output_tso() where we call tcp_chopper().
> Then checksum has to be calcualted after chopping.  If I do it
> always before tcp_if_output_tso(), we may caluclate it twice.  Once
> for the large packet and once for the small ones.
> 
> New diff without duplicate includes.

tested with v4/v6, direct and forwarding.

ok jan@

> Index: net/if.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.704
> diff -u -p -r1.704 if.c
> --- net/if.c  6 Jul 2023 04:55:04 -   1.704
> +++ net/if.c  6 Jul 2023 19:15:00 -
> @@ -886,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m
>  }
>  
>  int
> +if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst,
> +struct rtentry *rt, u_int mtu)
> +{
> + uint32_t ifcap;
> + int error;
> +
> + switch (dst->sa_family) {
> + case AF_INET:
> + ifcap = IFCAP_TSOv4;
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + ifcap = IFCAP_TSOv6;
> + break;
> +#endif
> + default:
> + unhandled_af(dst->sa_family);
> + }
> +
> + /*
> +  * Try to send with TSO first.  When forwarding LRO may set
> +  * maximium segment size in mbuf header.  Chop TCP segment
> +  * even if it would fit interface MTU to preserve maximum
> +  * path MTU.
> +  */
> + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
> + if (error || *mp == NULL)
> + return error;
> +
> + if ((*mp)->m_pkthdr.len <= mtu) {
> + switch (dst->sa_family) {
> + case AF_INET:
> + in_hdr_cksum_out(*mp, ifp);
> + in_proto_cksum_out(*mp, ifp);
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + in6_proto_cksum_out(*mp, ifp);
> + break;
> +#endif
> + }
> + error = ifp->if_output(ifp, *mp, dst, rt);
> + *mp = NULL;
> + return error;
> + }
> +
> + /* mp still contains mbuf that has to be fragmented or dropped. */
> + return 0;
> +}
> +
> +int
>  if_output_mq(struct ifnet *ifp, struct mbuf_queue *mq, unsigned int *total,
>  struct sockaddr *dst, struct rtentry *rt)
>  {
> Index: net/if_var.h
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
> retrieving revision 1.128
> diff -u -p -r1.128 if_var.h
> --- net/if_var.h  28 Jun 2023 11:49:49 -  1.128
> +++ net/if_var.h  6 Jul 2023 19:12:39 -
> @@ -329,6 +329,8 @@ int   if_output_ml(struct ifnet *, struct 
>   struct sockaddr *, struct rtentry *);
>  int  if_output_mq(struct ifnet *, struct mbuf_queue *, unsigned int *,
>   struct sockaddr *, struct rtentry *);
> +int  if_output_tso(struct ifnet *, struct mbuf **, struct sockaddr *,
> + struct rtentry *, u_int);
>  int  if_output_local(struct ifnet *, struct mbuf *, sa_family_t);
>  void if_rtrequest_dummy(struct ifnet *, int, struct rtentry *);
>  void p2p_rtrequest(struct ifnet *, int, struct rtentry *);
> Index: net/pf.c
> ==

Re: tcp lro tso path mtu

2023-07-06 Thread Jan Klemkow
On Mon, Jul 03, 2023 at 08:04:11PM +0300, Alexander Bluhm wrote:
> As final step before making LRO (Large Receive Offload) the default,
> we have to fix path MTU discovery when forwarding.
> 
> The drivers, currently ix(4) and lo(4) only, record an upper bound
> of the size of the original packets in ph_mss.  When sending we
> must chop the packets with TSO (TCP Segmentation Offload) to that
> size.  That means we have to call tcp_if_output_tso() before
> ifp->if_output().  I have put that logic into if_output_tso() to
> avoid code duplication.
> 
> ok?

I like the idea of this commit.  Some comments below.

Thanks,
Jan

> Index: net/if.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.702
> diff -u -p -r1.702 if.c
> --- net/if.c  2 Jul 2023 19:59:15 -   1.702
> +++ net/if.c  3 Jul 2023 10:28:30 -
> @@ -109,6 +109,9 @@
>  #include 
>  #include 
>  #include 

I think is a merge bug, isn't it?

> +#include 
> +#include 
> +#include 

> @@ -883,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m
>   ml_purge(ml);
>  
>   return error;
> +}
> +
> +int
> +if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst,
> +struct rtentry *rt, u_int mtu)
> +{
> + uint32_t ifcap;
> + int error;
> +
> + switch (dst->sa_family) {
> + case AF_INET:
> + ifcap = IFCAP_TSOv4;
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + ifcap = IFCAP_TSOv6;
> + break;
> +#endif
> + default:
> + unhandled_af(dst->sa_family);
> + }
> +
> + /*
> +  * Try to send with TSO first.  When forwarding LRO may set
> +  * maximium segment size in mbuf header.  Chop TCP segment
> +  * even if it would fit interface MTU to preserve maximum
> +  * path MTU.
> +  */
> + error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
> + if (error || *mp == NULL)
> + return error;
> +
> + if ((*mp)->m_pkthdr.len <= mtu) {

I may miss something but...

Couldn't you move the *_cksum_out() calls above the upper
tcp_if_output_tso() call?  And than remove the *_cksum_out() calls
inside of tcp_if_output_tso()?

Thus, there is just one place where we call them.

> + switch (dst->sa_family) {
> + case AF_INET:
> + in_hdr_cksum_out(*mp, ifp);
> + in_proto_cksum_out(*mp, ifp);
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + in6_proto_cksum_out(*mp, ifp);
> + break;
> +#endif
> + }
> + error = ifp->if_output(ifp, *mp, dst, rt);
> + *mp = NULL;
> + return error;
> + }
> +
> + /* mp still contains mbuf that has to be fragmented or dropped. */
> + return 0;
>  }



ixl(4): protect admin queue with mutex

2023-07-19 Thread Jan Klemkow
Hi,

there is an issue with the admin queue of ixl(4) which leads into the
following panic when the link state changes:

uvm_fault(0x818005f8, 0x18, 0, 2) -> e
kernel: page fault trap, code=0
Stopped at  ixl_intr0+0xca: movq%rdx,0x18(%rax)
TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
 392823  13219  00x100040  02  ifstated
 444681  94950 90   0x1100010  06  ospf6d
 428704   9496 90   0x1100010  09  ospf6d
 106020  59273 85   0x1100010  01  ospfd
 420435  72114 85   0x1100010  05  ospfd
 295821  93368 73   0x1100010  03  syslogd
 367116  56598  0 0x14000  0x2007  zerothread
 275385  57815  0 0x14000  0x2004  softnet
ixl_intr0(84509000) at ixl_intr0+0xca
intr_handler(0,844b0b80) at intr_handler+0x5b
Xintr_ioapic_edge25_untramp() at Xintr_ioapic_edge25_untramp+0x18f
acpicpu_idle() at acpicpu_idle+0x1f6
sched_idle(0) at sched_idle+0x280
end trace frame: 0x0, count: 10
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{0}>

The queue is corrupted in a way, that slot->iaq_cookie is 0.  Which
causes the uvm fault when iatq is dereferenced.

The following diff uses a mutex to protect the admin queue and avoids
the issue above.

ok?

bye,
Jan

Index: dev/pci/if_ixl.c
===
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.87
diff -u -p -r1.87 if_ixl.c
--- dev/pci/if_ixl.c6 Feb 2023 20:27:45 -   1.87
+++ dev/pci/if_ixl.c19 Jul 2023 07:05:40 -
@@ -1274,6 +1274,7 @@ struct ixl_softc {
unsigned int sc_atq_prod;
unsigned int sc_atq_cons;
 
+   struct mutex sc_atq_mtx;
struct ixl_dmamemsc_arq;
struct task  sc_arq_task;
struct ixl_aq_bufs   sc_arq_idle;
@@ -1723,6 +1724,8 @@ ixl_attach(struct device *parent, struct
 
/* initialise the adminq */
 
+   mtx_init(>sc_atq_mtx, IPL_NET);
+
if (ixl_dmamem_alloc(sc, >sc_atq,
sizeof(struct ixl_aq_desc) * IXL_AQ_NUM, IXL_AQ_ALIGN) != 0) {
printf("\n" "%s: unable to allocate atq\n", DEVNAME(sc));
@@ -3599,6 +3602,8 @@ ixl_atq_post(struct ixl_softc *sc, struc
struct ixl_aq_desc *atq, *slot;
unsigned int prod;
 
+   mtx_enter(>sc_atq_mtx);
+
/* assert locked */
 
atq = IXL_DMA_KVA(>sc_atq);
@@ -3618,6 +3623,8 @@ ixl_atq_post(struct ixl_softc *sc, struc
prod &= IXL_AQ_MASK;
sc->sc_atq_prod = prod;
ixl_wr(sc, sc->sc_aq_regs->atq_tail, prod);
+
+   mtx_leave(>sc_atq_mtx);
 }
 
 static void
@@ -3628,11 +3635,15 @@ ixl_atq_done(struct ixl_softc *sc)
unsigned int cons;
unsigned int prod;
 
+   mtx_enter(>sc_atq_mtx);
+
prod = sc->sc_atq_prod;
cons = sc->sc_atq_cons;
 
-   if (prod == cons)
+   if (prod == cons) {
+   mtx_leave(>sc_atq_mtx);
return;
+   }
 
atq = IXL_DMA_KVA(>sc_atq);
 
@@ -3645,6 +3656,7 @@ ixl_atq_done(struct ixl_softc *sc)
if (!ISSET(slot->iaq_flags, htole16(IXL_AQ_DD)))
break;
 
+   KASSERT(slot->iaq_cookie != 0);
iatq = (struct ixl_atq *)slot->iaq_cookie;
iatq->iatq_desc = *slot;
 
@@ -3661,6 +3673,8 @@ ixl_atq_done(struct ixl_softc *sc)
BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE);
 
sc->sc_atq_cons = cons;
+
+   mtx_leave(>sc_atq_mtx);
 }
 
 static void
@@ -3691,6 +3705,8 @@ ixl_atq_poll(struct ixl_softc *sc, struc
unsigned int prod;
unsigned int t = 0;
 
+   mtx_enter(>sc_atq_mtx);
+
atq = IXL_DMA_KVA(>sc_atq);
prod = sc->sc_atq_prod;
slot = atq + prod;
@@ -3712,8 +3728,10 @@ ixl_atq_poll(struct ixl_softc *sc, struc
while (ixl_rd(sc, sc->sc_aq_regs->atq_head) != prod) {
delaymsec(1);
 
-   if (t++ > tm)
+   if (t++ > tm) {
+   mtx_leave(>sc_atq_mtx);
return (ETIMEDOUT);
+   }
}
 
bus_dmamap_sync(sc->sc_dmat, IXL_DMA_MAP(>sc_atq),
@@ -3724,6 +3742,7 @@ ixl_atq_poll(struct ixl_softc *sc, struc
 
sc->sc_atq_cons = prod;
 
+   mtx_leave(>sc_atq_mtx);
return (0);
 }
 



Re: tcp lro by default, call for testing

2023-07-10 Thread Jan Klemkow
On Sat, Jul 08, 2023 at 05:15:26PM +0300, Alexander Bluhm wrote:
> I am not aware of any more limitations when enabling LRO for TCP
> in the network drivers.  The feature allows to receive agregated
> packets larger than the MTU.  Receiving TCP streams becomes much
> faster.
> 
> As the network hardware is not aware whether a packet is received
> locally or forwarded, everything is aggregated.  In case of forwarding
> it is split on output to packets not larger than the original
> packets.  So path MTU discovery should still work.  If the outgoing
> interface supports TSO, the packet is chopped in hardware.
> 
> Currently only ix(4) and lo(4) support LRO, and ix(4) is limited
> to IPv4 and newer than the old 82598 model.  If the interface is
> added to a bridge(4) or aggr(4), LRO is automatically disabled.

I guess you mean veb(4) not aggr(4).  We just avoid the in heritage
of the LRO capability in aggr(4) but are using the feature.

> So in case you possess any ix(4) hardware or do funky pf routing
> on lo(4) please run this diff.  If you encounter problems, report
> and turn LRO off per interface with ifconfig -tcplro.

Diff looks fine to me.  I just would keep mentioning the default
behavior in the manpage like this:

ok jan@

Index: sbin/ifconfig/ifconfig.8
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.397
diff -u -p -r1.397 ifconfig.8
--- sbin/ifconfig/ifconfig.87 Jun 2023 18:42:40 -   1.397
+++ sbin/ifconfig/ifconfig.810 Jul 2023 11:54:47 -
@@ -517,9 +517,9 @@ It is not possible to use LRO with inter
 or
 .Xr tpmr 4 .
 Changing this option will re-initialize the network interface.
+LRO is enabled by default.
 .It Cm -tcplro
 Disable LRO.
-LRO is disabled by default.
 .It Cm up
 Mark an interface
 .Dq up .


> Index: sys/dev/pci/if_ix.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.198
> diff -u -p -r1.198 if_ix.c
> --- sys/dev/pci/if_ix.c   8 Jul 2023 09:01:30 -   1.198
> +++ sys/dev/pci/if_ix.c   8 Jul 2023 13:51:26 -
> @@ -1925,8 +1925,10 @@ ixgbe_setup_interface(struct ix_softc *s
>   ifp->if_capabilities |= IFCAP_CSUM_IPv4;
>  
>   ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
> - if (sc->hw.mac.type != ixgbe_mac_82598EB)
> + if (sc->hw.mac.type != ixgbe_mac_82598EB) {
> + ifp->if_xflags |= IFXF_LRO;
>   ifp->if_capabilities |= IFCAP_LRO;
> + }
>  
>   /*
>* Specify the media types supported by this sc and register
> Index: sys/net/if_loop.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
> retrieving revision 1.95
> diff -u -p -r1.95 if_loop.c
> --- sys/net/if_loop.c 2 Jul 2023 19:59:15 -   1.95
> +++ sys/net/if_loop.c 8 Jul 2023 13:51:26 -
> @@ -172,11 +172,11 @@ loop_clone_create(struct if_clone *ifc, 
>   ifp->if_softc = NULL;
>   ifp->if_mtu = LOMTU;
>   ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
> - ifp->if_xflags = IFXF_CLONED;
> + ifp->if_xflags = IFXF_CLONED | IFXF_LRO;
>   ifp->if_capabilities = IFCAP_CSUM_IPv4 |
>   IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
>   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
> - IFCAP_LRO;
> + IFCAP_LRO | IFCAP_TSOv4 | IFCAP_TSOv6;
>   ifp->if_rtrequest = lortrequest;
>   ifp->if_ioctl = loioctl;
>   ifp->if_input = loinput;
> Index: sbin/ifconfig/ifconfig.8
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v
> retrieving revision 1.397
> diff -u -p -r1.397 ifconfig.8
> --- sbin/ifconfig/ifconfig.8  7 Jun 2023 18:42:40 -   1.397
> +++ sbin/ifconfig/ifconfig.8  7 Jul 2023 19:57:09 -
> @@ -519,7 +519,6 @@ or
>  Changing this option will re-initialize the network interface.
>  .It Cm -tcplro
>  Disable LRO.
> -LRO is disabled by default.
>  .It Cm up
>  Mark an interface
>  .Dq up .
> 



Add ethernet type check in ifsetlro()

2023-07-03 Thread Jan Klemkow
Hi,

bluhm pointed out that the ether_brport_isset() check it just allowed on
ethernet devices.  Thus, I put an additional ethernet check in the
condition.  This also fixes EBUSY errors of "ifconfig lo0 tcplro" calls
in my setup.

ok?

bye,
Jan

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.702
diff -u -p -r1.702 if.c
--- net/if.c2 Jul 2023 19:59:15 -   1.702
+++ net/if.c3 Jul 2023 20:58:32 -
@@ -3206,7 +3206,7 @@ ifsetlro(struct ifnet *ifp, int on)
KERNEL_ASSERT_LOCKED(); /* for if_flags */
 
if (on && !ISSET(ifp->if_xflags, IFXF_LRO)) {
-   if (ether_brport_isset(ifp)) {
+   if (ifp->if_type == IFT_ETHER && ether_brport_isset(ifp)) {
error = EBUSY;
goto out;
}



Re: Virtio fix for testing

2023-05-26 Thread Jan Klemkow
On Wed, May 24, 2023 at 08:50:26PM +0200, Stefan Fritsch wrote:
> I forgot to mention that no stress test is necessary. If it boots and the
> virtio devices work at all, that should be enough.

Works for me on Linux/KVM with the following devices:

vga1 at pci0 dev 2 function 0 "Qumranet Virtio 1.x GPU" rev 0x01
virtio0 at pci0 dev 4 function 0 "Qumranet Virtio Storage" rev 0x00
virtio1 at pci0 dev 6 function 0 "Qumranet Virtio Console" rev 0x00
virtio2 at pci0 dev 7 function 0 "Qumranet Virtio Memory Balloon" rev 0x00
virtio3 at pci0 dev 8 function 0 "Qumranet Virtio Network" rev 0x00

and on OpenBSD/VMM with:

virtio0 at pci0 dev 1 function 0 "Qumranet Virtio RNG" rev 0x00
virtio1 at pci0 dev 2 function 0 "Qumranet Virtio Network" rev 0x00
virtio2 at pci0 dev 3 function 0 "Qumranet Virtio Storage" rev 0x00
virtio3 at pci0 dev 4 function 0 "Qumranet Virtio SCSI" rev 0x00

Thanks,
Jan



Re: ifconfig rename tcplro

2023-06-06 Thread Jan Klemkow
On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> > I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> > it's just because I had to type that long name too often.
> > 
> > With that we have consistent naming:
> > # ifconfig ix0 tcplro
> > # sysctl net.inet.tcp.tso=1
> > 
> > Also the coresponding flag are named LRO.
> > # ifconfig ix1 hwfeatures
> > ix1: flags=2008843 mtu 1500
> > 
> > hwfeatures=71b7
> >  hardmtu 9198
> > 
> > The feature is quite new, so I have no backward compatiblity concerns.
> > 
> > ok?
> 
> Could you name it "lro" like FreeBSD uses?

I also would prefer this one.



Re: ifconfig rename tcplro

2023-06-06 Thread Jan Klemkow
On Tue, Jun 06, 2023 at 09:37:22AM -0700, Chris Cappuccio wrote:
> Jan Klemkow [j.klem...@wemelug.de] wrote:
> > On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> > > On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> > > > I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> > > > it's just because I had to type that long name too often.
> > > > 
> > > > With that we have consistent naming:
> > > > # ifconfig ix0 tcplro
> > > > # sysctl net.inet.tcp.tso=1
> > > > 
> > > > Also the coresponding flag are named LRO.
> > > > # ifconfig ix1 hwfeatures
> > > > ix1: flags=2008843 mtu 1500
> > > > 
> > > > hwfeatures=71b7
> > > >  hardmtu 9198
> > > > 
> > > > The feature is quite new, so I have no backward compatiblity concerns.
> > > > 
> > > > ok?
> > > 
> > > Could you name it "lro" like FreeBSD uses?
> > 
> > I also would prefer this one.
> 
> and tcpsendoffload back to tso ?
> 
> was the reason for changing it from tso due to the initial conflation
> of TSO and LRO in the tree?

Yes.  At the start of this, I just want to keep it simple with one
ifconfig option "tso".  But, tso is now default in tcp_output() and
can be controlled globally via sysctl(2) net.inet.tcp.tso.  Thus, we
just need to control LRO per interface.



Re: ifconfig rename tcplro

2023-06-07 Thread Jan Klemkow
On Wed, Jun 07, 2023 at 02:49:07PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Jun 07, 2023 at 01:29:09PM +0200, Alexander Bluhm wrote:
> > On Wed, Jun 07, 2023 at 12:59:11PM +0300, Vitaliy Makkoveev wrote:
> > > On Wed, Jun 07, 2023 at 10:19:32AM +1000, David Gwynne wrote:
> > > > 
> > > > 
> > > > > On 7 Jun 2023, at 06:33, Vitaliy Makkoveev  wrote:
> > > > > 
> > > > >> On 6 Jun 2023, at 20:29, Alexander Bluhm  
> > > > >> wrote:
> > > > >> 
> > > > >> On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> > > > >>> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> > > >  Hi,
> > > >  
> > > >  I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> > > >  it's just because I had to type that long name too often.
> > > >  
> > > >  With that we have consistent naming:
> > > >  # ifconfig ix0 tcplro
> > > >  # sysctl net.inet.tcp.tso=1
> > > >  
> > > >  Also the coresponding flag are named LRO.
> > > >  # ifconfig ix1 hwfeatures
> > > >  ix1: flags=2008843 mtu 
> > > >  1500
> > > >    
> > > >  hwfeatures=71b7
> > > >   hardmtu 9198
> > > >  
> > > >  The feature is quite new, so I have no backward compatiblity 
> > > >  concerns.
> > > >  
> > > >  ok?
> > > >  
> > > > >>> 
> > > > >>> Could you name it "lro" like FreeBSD uses?
> > > > >> 
> > > > >> When I started with this, LRO and TSO were unknown to me.  So with
> > > > >> TCP prefix it may be clearer to users where the feature belongs.
> > > > >> 
> > > > >> Naming is hard.
> > > > > 
> > > > > Yeah, naming is definitely hard. I propose to use lro because it is
> > > > > already used for the same purpose by FreeBSD, so the same name helps
> > > > > to avoid confusion.
> > > > > 
> > > > >lro If the driver supports tcp(4) large receive offloading,
> > > > >enable LRO on the interface.
> > > > > 
> > > > > Also, we have used "tso" keyword for tcp segmentation offloading for
> > > > > the same reason, until it became global net.inet.tcp.tso.
> > > > 
> > > > Is it going to be used to enable lro for udp and other protocols as 
> > > > well?
> > > 
> > > Why not? We have tso feature system wide, so why don't have receive
> > > offloading feature global for all supported protocols? Especially since
> > > I suspect this control will be moved from ifconfig to global
> > > net.inet.tcp.lro like net.inet.tcp.tso.
> > 
> > Maybe we can make lro the default, and then move it to net.inet.tcp.lro.
> > But I like to see another driver to implement it first.
> > 
> > > However, I'm not the fan of original "tcprecvoffload" and like shorter
> > > naming.
> > 
> > Can we use ifconfig tcplro for now?
> > + it only affects TCP
> > + user see that it is related to TCP
> > + it is not a 3 letter abrevation claudio does not like
> > + it is shorter than tcprecvoffload
> > 
> > cons
> > - FreeBSD calls it lro
> > 
> 
> Feel free to use tcplro.

Do so.  OK jan@



ix(4): LRO forwarding

2023-05-23 Thread Jan Klemkow
Hi,

This diff sets needed offloading flags and the calculated mss to LRO
mbufs in ix(4).  Thus, we can forward this packets and process them via
tcp_if_output_tso().  This diff also uses tcp_if_output_tso() in
ip6_forward().

I tested the ip6_forward path via the address family transition in pf:

pass in inet from 192.168.1.1 to 192.168.13.2 af-to \
inet6 from fc00:13::1 to fc00:13::2

ok?

bye,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.196
diff -u -p -r1.196 if_ix.c
--- dev/pci/if_ix.c 23 May 2023 09:16:16 -  1.196
+++ dev/pci/if_ix.c 23 May 2023 11:02:52 -
@@ -3257,13 +3257,38 @@ ixgbe_rxeof(struct rx_ring *rxr)
 
if (sendmp->m_pkthdr.ph_mss > 0) {
struct ether_extracted ext;
+   uint64_t hlen;
uint16_t pkts = sendmp->m_pkthdr.ph_mss;
 
+   /* Calculate header size. */
ether_extract_headers(sendmp, );
-   if (ext.tcp)
+   hlen = sizeof(*ext.eh);
+   if (ext.ip4) {
+   hlen += ext.ip4->ip_hl << 2;
+   } else if (ext.ip6) {
+   if (ext.ip6->ip6_nxt == IPPROTO_TCP)
+   hlen += sizeof(*ext.ip6);
+   else
+   tcpstat_inc(tcps_inbadlro);
+   }
+   if (ext.tcp) {
tcpstat_inc(tcps_inhwlro);
-   else
+   hlen += ext.tcp->th_off << 2;
+   } else {
tcpstat_inc(tcps_inbadlro);
+   }
+
+   /*
+* If we gonna forward this packet, we have to
+* mark it as TSO, recalculate the TCP checksum
+* and set a correct mss.
+*/
+   SET(sendmp->m_pkthdr.csum_flags,
+   M_TCP_CSUM_OUT | M_TCP_TSO);
+
+   sendmp->m_pkthdr.ph_mss =
+   (sendmp->m_pkthdr.len - hlen) / pkts;
+
tcpstat_add(tcps_inpktlro, pkts);
}
 
Index: netinet6/ip6_forward.c
===
RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v
retrieving revision 1.109
diff -u -p -r1.109 ip6_forward.c
--- netinet6/ip6_forward.c  5 Apr 2023 13:56:31 -   1.109
+++ netinet6/ip6_forward.c  23 May 2023 11:59:19 -
@@ -63,8 +63,10 @@
 #include 
 #include 
 #include 
-#include 
 #endif
+#include 
+#include 
+#include 
 
 /*
  * Forward a packet.  If some error occurs return the sender
@@ -316,7 +318,11 @@ reroute:
goto reroute;
}
 #endif
-   in6_proto_cksum_out(m, ifp);
+
+   error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6,
+   ifp->if_mtu);
+   if (error || m == NULL)
+   goto freecopy;
 
/* Check the size after pf_test to give pf a chance to refragment. */
if (m->m_pkthdr.len > ifp->if_mtu) {
@@ -326,6 +332,8 @@ reroute:
m_freem(m);
goto out;
}
+
+   in6_proto_cksum_out(m, ifp);
 
error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
if (error) {



Re: ix(4): allocate less memory for tx buffers

2023-06-09 Thread Jan Klemkow
On Fri, Jun 09, 2023 at 06:59:57PM +0200, Jan Klemkow wrote:
> On Fri, Jun 09, 2023 at 06:11:38PM +0200, Jan Klemkow wrote:
> > TSO packets are limited to MAXMCLBYTES (64k).  Thus, we don't need to
> > allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers.
> > 
> > This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports
> > into one machine.  Otherwise I run out of devbuf in malloc(9).
> 
> fix typo in comment

Use a more precise compare in the CTASSERT condition.

ok?

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.197
diff -u -p -r1.197 if_ix.c
--- dev/pci/if_ix.c 1 Jun 2023 09:05:33 -   1.197
+++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 -
@@ -37,6 +37,12 @@
 #include 
 #include 
 
+/*
+ * Our TCP/IP Stack could not handle packets greater than MAXMCLBYTES.
+ * This interface could not handle packets greater than IXGBE_TSO_SIZE.
+ */
+CTASSERT(MAXMCLBYTES <= IXGBE_TSO_SIZE);
+
 /*
  *  Driver version
  */
@@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t
/* Create the descriptor buffer dma maps */
for (i = 0; i < sc->num_tx_desc; i++) {
txbuf = >tx_buffers[i];
-   error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE,
+   error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES,
sc->num_segs, PAGE_SIZE, 0,
BUS_DMA_NOWAIT, >map);
 



Re: ix(4): allocate less memory for tx buffers

2023-06-09 Thread Jan Klemkow
On Fri, Jun 09, 2023 at 06:11:38PM +0200, Jan Klemkow wrote:
> TSO packets are limited to MAXMCLBYTES (64k).  Thus, we don't need to
> allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers.
> 
> This saves 3/4 of the memory and allows me to pack over 8 ix(8) ports
> into one machine.  Otherwise I run out of devbuf in malloc(9).
> 
> ok?

fix typo in comment

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.197
diff -u -p -r1.197 if_ix.c
--- dev/pci/if_ix.c 1 Jun 2023 09:05:33 -   1.197
+++ dev/pci/if_ix.c 9 Jun 2023 16:01:18 -
@@ -37,6 +37,12 @@
 #include 
 #include 
 
+/*
+ * Our TCP/IP Stack could not handle packets greater than MAXMCLBYTES.
+ * This interface could not handle packets greater than IXGBE_TSO_SIZE.
+ */
+CTASSERT(MAXMCLBYTES < IXGBE_TSO_SIZE);
+
 /*
  *  Driver version
  */
@@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t
/* Create the descriptor buffer dma maps */
for (i = 0; i < sc->num_tx_desc; i++) {
txbuf = >tx_buffers[i];
-   error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE,
+   error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES,
sc->num_segs, PAGE_SIZE, 0,
BUS_DMA_NOWAIT, >map);
 



Re: ifconfig rename tcplro

2023-06-06 Thread Jan Klemkow
On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> it's just because I had to type that long name too often.
> 
> With that we have consistent naming:
> # ifconfig ix0 tcplro
> # sysctl net.inet.tcp.tso=1
> 
> Also the coresponding flag are named LRO.
> # ifconfig ix1 hwfeatures
> ix1: flags=2008843 mtu 1500
> 
> hwfeatures=71b7
>  hardmtu 9198
> 
> The feature is quite new, so I have no backward compatiblity concerns.
> 
> ok?

I like this shorter naming.
Its OK from my side.

> Index: sbin/ifconfig/ifconfig.8
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v
> retrieving revision 1.396
> diff -u -p -r1.396 ifconfig.8
> --- sbin/ifconfig/ifconfig.8  1 Jun 2023 18:57:53 -   1.396
> +++ sbin/ifconfig/ifconfig.8  6 Jun 2023 12:18:07 -
> @@ -501,7 +501,7 @@ Query and display information and diagno
>  modules installed in an interface.
>  It is only supported by drivers implementing the necessary functionality
>  on hardware which supports it.
> -.It Cm tcprecvoffload
> +.It Cm tcplro
>  Enable TCP large receive offload (LRO) if it's supported by the hardware; see
>  .Cm hwfeatures .
>  LRO enabled network interfaces modify received TCP/IP packets.
> @@ -517,7 +517,7 @@ It is not possible to use LRO with inter
>  or
>  .Xr tpmr 4 .
>  Changing this option will re-initialize the network interface.
> -.It Cm -tcprecvoffload
> +.It Cm -tcplro
>  Disable LRO.
>  LRO is disabled by default.
>  .It Cm up
> Index: sbin/ifconfig/ifconfig.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.c,v
> retrieving revision 1.465
> diff -u -p -r1.465 ifconfig.c
> --- sbin/ifconfig/ifconfig.c  1 Jun 2023 18:57:54 -   1.465
> +++ sbin/ifconfig/ifconfig.c  6 Jun 2023 12:18:59 -
> @@ -471,8 +471,8 @@ const struct  cmd {
>   { "-soii",  IFXF_INET6_NOSOII,  0,  setifxflags },
>   { "monitor",IFXF_MONITOR,   0,  setifxflags },
>   { "-monitor",   -IFXF_MONITOR,  0,  setifxflags },
> - { "tcprecvoffload", IFXF_LRO,   0,  setifxflags },
> - { "-tcprecvoffload", -IFXF_LRO, 0,  setifxflags },
> + { "tcplro", IFXF_LRO,   0,  setifxflags },
> + { "-tcplro",-IFXF_LRO,  0,  setifxflags },
>  #ifndef SMALL
>   { "hwfeatures", NEXTARG0,   0,  printifhwfeatures },
>   { "metric", NEXTARG,0,  setifmetric },
> 



Re: ix(4): LRO forwarding

2023-05-25 Thread Jan Klemkow
On Wed, May 24, 2023 at 05:28:58PM +0200, Alexander Bluhm wrote:
> On Tue, May 23, 2023 at 02:14:57PM +0200, Jan Klemkow wrote:
> > Hi,
> > 
> > This diff sets needed offloading flags and the calculated mss to LRO
> > mbufs in ix(4).  Thus, we can forward this packets and process them via
> > tcp_if_output_tso().  This diff also uses tcp_if_output_tso() in
> > ip6_forward().
> > 
> > I tested the ip6_forward path via the address family transition in pf:
> > 
> > pass in inet from 192.168.1.1 to 192.168.13.2 af-to \
> > inet6 from fc00:13::1 to fc00:13::2
> > 
> > ok?
> 
> crashes during my tests with lro turned on.  Looks like devision
> by zero.

I added a check, that avoids the TSO flags if mss it zero.  Thus, we
avoid a division by zero in later TSO processing.

ok?

Thanks,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.196
diff -u -p -r1.196 if_ix.c
--- dev/pci/if_ix.c 23 May 2023 09:16:16 -  1.196
+++ dev/pci/if_ix.c 25 May 2023 20:02:06 -
@@ -3257,13 +3257,40 @@ ixgbe_rxeof(struct rx_ring *rxr)
 
if (sendmp->m_pkthdr.ph_mss > 0) {
struct ether_extracted ext;
+   uint64_t hlen;
uint16_t pkts = sendmp->m_pkthdr.ph_mss;
 
+   /* Calculate header size. */
ether_extract_headers(sendmp, );
-   if (ext.tcp)
+   hlen = sizeof(*ext.eh);
+   if (ext.ip4) {
+   hlen += ext.ip4->ip_hl << 2;
+   } else if (ext.ip6) {
+   if (ext.ip6->ip6_nxt == IPPROTO_TCP)
+   hlen += sizeof(*ext.ip6);
+   else
+   tcpstat_inc(tcps_inbadlro);
+   }
+   if (ext.tcp) {
tcpstat_inc(tcps_inhwlro);
-   else
+   hlen += ext.tcp->th_off << 2;
+   } else {
tcpstat_inc(tcps_inbadlro);
+   }
+
+   /*
+* If we gonna forward this packet, we have to
+* mark it as TSO, recalculate the TCP checksum
+* and set a correct mss.
+*/
+   sendmp->m_pkthdr.ph_mss =
+   (sendmp->m_pkthdr.len - hlen) / pkts;
+
+   if (sendmp->m_pkthdr.ph_mss != 0) {
+   SET(sendmp->m_pkthdr.csum_flags,
+   M_TCP_CSUM_OUT | M_TCP_TSO);
+   }
+
tcpstat_add(tcps_inpktlro, pkts);
}
 
Index: netinet6/ip6_forward.c
===
RCS file: /cvs/src/sys/netinet6/ip6_forward.c,v
retrieving revision 1.109
diff -u -p -r1.109 ip6_forward.c
--- netinet6/ip6_forward.c  5 Apr 2023 13:56:31 -   1.109
+++ netinet6/ip6_forward.c  25 May 2023 20:03:06 -
@@ -63,8 +63,10 @@
 #include 
 #include 
 #include 
-#include 
 #endif
+#include 
+#include 
+#include 
 
 /*
  * Forward a packet.  If some error occurs return the sender
@@ -316,7 +318,11 @@ reroute:
goto reroute;
}
 #endif
-   in6_proto_cksum_out(m, ifp);
+
+   error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6,
+   ifp->if_mtu);
+   if (error || m == NULL)
+   goto freecopy;
 
/* Check the size after pf_test to give pf a chance to refragment. */
if (m->m_pkthdr.len > ifp->if_mtu) {
@@ -326,6 +332,8 @@ reroute:
m_freem(m);
goto out;
}
+
+   in6_proto_cksum_out(m, ifp);
 
error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
if (error) {



fix vlan handling with tcplro on ix(4)

2023-07-26 Thread Jan Klemkow
Hi,

I missed the vlan-tag size in the mss calculation of lro packets in
ix(4).  This diff add vlan-header detection in ether_extract_headers()
and uses this information to calculate the right mss.

This fixes forwarding of vlan tagged lro packets.

ok?

bye,
Jan

Index: dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.200
diff -u -p -r1.200 if_ix.c
--- dev/pci/if_ix.c 18 Jul 2023 16:01:20 -  1.200
+++ dev/pci/if_ix.c 26 Jul 2023 09:21:15 -
@@ -3275,6 +3275,10 @@ ixgbe_rxeof(struct rx_ring *rxr)
/* Calculate header size. */
ether_extract_headers(sendmp, );
hdrlen = sizeof(*ext.eh);
+#if NVLAN > 0
+   if (ext.evh)
+   hdrlen += ETHER_VLAN_ENCAP_LEN;
+#endif
if (ext.ip4)
hdrlen += ext.ip4->ip_hl << 2;
if (ext.ip6)
Index: net/if_ethersubr.c
===
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.290
diff -u -p -r1.290 if_ethersubr.c
--- net/if_ethersubr.c  6 Jul 2023 19:46:53 -   1.290
+++ net/if_ethersubr.c  26 Jul 2023 09:20:57 -
@@ -1040,6 +1040,7 @@ ether_extract_headers(struct mbuf *mp, s
uint64_t hlen;
int  hoff;
uint8_t  ipproto;
+   uint16_t ether_type;
 
/* Return NULL if header was not recognized. */
memset(ext, 0, sizeof(*ext));
@@ -1048,9 +1049,20 @@ ether_extract_headers(struct mbuf *mp, s
return;
 
ext->eh = mtod(mp, struct ether_header *);
-   switch (ntohs(ext->eh->ether_type)) {
+   ether_type = ntohs(ext->eh->ether_type);
+   hlen = sizeof(*ext->eh);
+
+#if NVLAN > 0
+   if (ether_type == ETHERTYPE_VLAN) {
+   ext->evh = mtod(mp, struct ether_vlan_header *);
+   ether_type = ntohs(ext->evh->evl_proto);
+   hlen = sizeof(*ext->evh);
+   }
+#endif
+
+   switch (ether_type) {
case ETHERTYPE_IP:
-   m = m_getptr(mp, sizeof(*ext->eh), );
+   m = m_getptr(mp, hlen, );
if (m == NULL || m->m_len - hoff < sizeof(*ext->ip4))
return;
ext->ip4 = (struct ip *)(mtod(m, caddr_t) + hoff);
@@ -1064,7 +1076,7 @@ ether_extract_headers(struct mbuf *mp, s
break;
 #ifdef INET6
case ETHERTYPE_IPV6:
-   m = m_getptr(mp, sizeof(*ext->eh), );
+   m = m_getptr(mp, hlen, );
if (m == NULL || m->m_len - hoff < sizeof(*ext->ip6))
return;
ext->ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
Index: netinet/if_ether.h
===
RCS file: /cvs/src/sys/netinet/if_ether.h,v
retrieving revision 1.89
diff -u -p -r1.89 if_ether.h
--- netinet/if_ether.h  6 Jul 2023 19:46:53 -   1.89
+++ netinet/if_ether.h  26 Jul 2023 09:20:22 -
@@ -301,11 +301,12 @@ uint64_t  ether_addr_to_e64(const struct 
 void   ether_e64_to_addr(struct ether_addr *, uint64_t);
 
 struct ether_extracted {
-   struct ether_header *eh;
-   struct ip   *ip4;
-   struct ip6_hdr  *ip6;
-   struct tcphdr   *tcp;
-   struct udphdr   *udp;
+   struct ether_header *eh;
+   struct ether_vlan_header*evh;
+   struct ip   *ip4;
+   struct ip6_hdr  *ip6;
+   struct tcphdr   *tcp;
+   struct udphdr   *udp;
 };
 
 void ether_extract_headers(struct mbuf *, struct ether_extracted *);



Re: lo(4) loopback LRO and TSO

2023-07-02 Thread Jan Klemkow



On July 2, 2023 2:33:41 PM GMT+02:00, Claudio Jeker  
wrote:
>On Sun, Jul 02, 2023 at 02:28:17PM +0200, Alexander Bluhm wrote:
>> anyone?
>
>Was not able to test yet but I like the diff.
>Right now this is a noop since LRO is not on by default for lo(4).
>Because of that OK claudio@

The diff works fine in my sparc64 setup.
ok jan@

>> On Fri, Jun 23, 2023 at 06:06:16PM +0200, Alexander Bluhm wrote:
>> > Hi,
>> > 
>> > Claudio@ mentioned the idea to use TSO and LRO on the loopback
>> > interface to transfer TCP faster.
>> > 
>> > I see a performance effect with this diff, but more importantly it
>> > gives us more test coverage.  Currently LRO on lo(4) is default
>> > off.
>> > 
>> > Future plan is:
>> > - Fix some corner cases for LRO/TSO with TCP path-MTU discovery
>> >   and IP forwarding when LRO is enabled.
>> > - Enable LRO/TSO for lo(4) and ix(4) per default.
>> > - Jan@ commits his ixl(4) TSO diff.
>> > 
>> > ok for lo(4) LRO/TSO with default off?
>> > 
>> > bluhm
>> > 
>> > Index: sys/net/if.c
>> > ===
>> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
>> > retrieving revision 1.700
>> > diff -u -p -r1.700 if.c
>> > --- sys/net/if.c   12 Jun 2023 21:19:54 -  1.700
>> > +++ sys/net/if.c   23 Jun 2023 15:48:27 -
>> > @@ -106,6 +106,9 @@
>> >  #ifdef MROUTING
>> >  #include 
>> >  #endif
>> > +#include 
>> > +#include 
>> > +#include 
>> >  
>> >  #ifdef INET6
>> >  #include 
>> > @@ -802,12 +805,29 @@ if_input_local(struct ifnet *ifp, struct
>> > * is now incorrect, will be calculated before sending.
>> > */
>> >keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
>> > -  M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT);
>> > +  M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT |
>> > +  M_TCP_TSO);
>> >m_resethdr(m);
>> >m->m_flags |= M_LOOP | keepflags;
>> >m->m_pkthdr.csum_flags = keepcksum;
>> >m->m_pkthdr.ph_ifidx = ifp->if_index;
>> >m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
>> > +
>> > +  if (ISSET(keepcksum, M_TCP_TSO) && m->m_pkthdr.len > ifp->if_mtu) {
>> > +  if (ifp->if_mtu > 0 &&
>> > +  ((af == AF_INET &&
>> > +  ISSET(ifp->if_capabilities, IFCAP_TSOv4)) ||
>> > +  (af == AF_INET6 &&
>> > +  ISSET(ifp->if_capabilities, IFCAP_TSOv6 {
>> > +  tcpstat_inc(tcps_inswlro);
>> > +  tcpstat_add(tcps_inpktlro,
>> > +  (m->m_pkthdr.len + ifp->if_mtu - 1) / ifp->if_mtu);
>> > +  } else {
>> > +  tcpstat_inc(tcps_inbadlro);
>> > +  m_freem(m);
>> > +  return (EPROTONOSUPPORT);
>> > +  }
>> > +  }
>> >  
>> >if (ISSET(keepcksum, M_TCP_CSUM_OUT))
>> >m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK;
>> > Index: sys/net/if_loop.c
>> > ===
>> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
>> > retrieving revision 1.94
>> > diff -u -p -r1.94 if_loop.c
>> > --- sys/net/if_loop.c  5 Jun 2023 11:35:46 -   1.94
>> > +++ sys/net/if_loop.c  23 Jun 2023 15:48:27 -
>> > @@ -175,7 +175,8 @@ loop_clone_create(struct if_clone *ifc, 
>> >ifp->if_xflags = IFXF_CLONED;
>> >ifp->if_capabilities = IFCAP_CSUM_IPv4 |
>> >IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
>> > -  IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
>> > +  IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
>> > +  IFCAP_LRO;
>> >ifp->if_rtrequest = lortrequest;
>> >ifp->if_ioctl = loioctl;
>> >ifp->if_input = loinput;
>> > @@ -281,6 +282,10 @@ loioctl(struct ifnet *ifp, u_long cmd, c
>> >  
>> >switch (cmd) {
>> >case SIOCSIFFLAGS:
>> > +  if (ISSET(ifp->if_xflags, IFXF_LRO))
>> > +  SET(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
>> > +  else
>> > +  CLR(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
>> >break;
>> >  
>> >case SIOCSIFADDR:
>> > Index: sys/netinet/tcp_usrreq.c
>> > ===
>> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
>> > retrieving revision 1.219
>> > diff -u -p -r1.219 tcp_usrreq.c
>> > --- sys/netinet/tcp_usrreq.c   23 May 2023 09:16:16 -  1.219
>> > +++ sys/netinet/tcp_usrreq.c   23 Jun 2023 15:48:27 -
>> > @@ -1340,6 +1340,7 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
>> >ASSIGN(tcps_outhwtso);
>> >ASSIGN(tcps_outpkttso);
>> >ASSIGN(tcps_outbadtso);
>> > +  ASSIGN(tcps_inswlro);
>> >ASSIGN(tcps_inhwlro);
>> >ASSIGN(tcps_inpktlro);
>> >ASSIGN(tcps_inbadlro);
>> > Index: sys/netinet/tcp_var.h
>> > ===
>> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
>> > retrieving revision 1.167
>> > diff -u -p -r1.167 tcp_var.h
>> > 

Re: tso ip6 forward

2023-06-16 Thread Jan Klemkow
On Fri, Jun 16, 2023 at 12:06:08PM +0200, Alexander Bluhm wrote:
> On Mon, Jun 12, 2023 at 03:46:28PM +0200, Alexander Bluhm wrote:
> > I found a little inconsistency in IPv6 forwarding with TSO.
> > 
> > Sending with TSO should only done if the large packet does not fit
> > in the interface MTU.  In case tcp_if_output_tso() does not process
> > the packet, we should send an ICMP6 error.  Rearrange the code that
> > it looks more like other calls to tcp_if_output_tso().
> > 
> > All these cases can only be reached when LRO is turned on for IPv6
> > which none of our drivers currently supports.
> 
> jan@ pointed out that reordering TSO in ip6 forward breaks path MTU
> discovery.  So lets only fix the forward counters, icmp6 packet too
> big and icmp6 redirect.
> 
> First try to send with TSO.  The goto senderr handles icmp6 redirect
> and other errors.
> 
> If TSO is not necessary and the interface MTU fits, just send the
> packet.  Again goto senderr handles icmp6.
> 
> Finally care about icmp6 packet too big.

Works fine in my setup.

> ok?

ok jan@

> Index: netinet6/ip6_forward.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
> retrieving revision 1.110
> diff -u -p -r1.110 ip6_forward.c
> --- netinet6/ip6_forward.c1 Jun 2023 09:05:33 -   1.110
> +++ netinet6/ip6_forward.c16 Jun 2023 08:55:43 -
> @@ -321,35 +321,30 @@ reroute:
>  
>   error = tcp_if_output_tso(ifp, , sin6tosa(sin6), rt, IFCAP_TSOv6,
>   ifp->if_mtu);
> + if (error)
> + ip6stat_inc(ip6s_cantforward);
> + else if (m == NULL)
> + ip6stat_inc(ip6s_forward);
>   if (error || m == NULL)
> - goto freecopy;
> + goto senderr;
>  
>   /* Check the size after pf_test to give pf a chance to refragment. */
> - if (m->m_pkthdr.len > ifp->if_mtu) {
> - if (mcopy)
> - icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0,
> - ifp->if_mtu);
> - m_freem(m);
> - goto out;
> + if (m->m_pkthdr.len <= ifp->if_mtu) {
> + in6_proto_cksum_out(m, ifp);
> + error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
> + if (error)
> + ip6stat_inc(ip6s_cantforward);
> + else
> + ip6stat_inc(ip6s_forward);
> + goto senderr;
>   }
>  
> - in6_proto_cksum_out(m, ifp);
> - error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
> - if (error) {
> - ip6stat_inc(ip6s_cantforward);
> - } else {
> - ip6stat_inc(ip6s_forward);
> - if (type)
> - ip6stat_inc(ip6s_redirectsent);
> - else {
> - if (mcopy)
> - goto freecopy;
> - }
> - }
> + if (mcopy != NULL)
> + icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
> + m_freem(m);
> + goto out;
>  
> -#if NPF > 0 || defined(IPSEC)
>  senderr:
> -#endif
>   if (mcopy == NULL)
>   goto out;
>  
> @@ -357,6 +352,7 @@ senderr:
>   case 0:
>   if (type == ND_REDIRECT) {
>   icmp6_redirect_output(mcopy, rt);
> + ip6stat_inc(ip6s_redirectsent);
>   goto out;
>   }
>   goto freecopy;
> 



Re: software tcp send offloading

2023-05-09 Thread Jan Klemkow
On Tue, May 09, 2023 at 09:56:36AM +0200, Alexander Bluhm wrote:
> On Sun, May 07, 2023 at 09:00:31PM +0200, Alexander Bluhm wrote:
> > Not sure if I addressed all corner cases already.  I think IPsec
> > is missing.
> 
> Updated diff:
> - parts have been commited
> - works with IPsec now

Thanks for this solution.  Looks much better to me, then an IPSec lookup
in tcp_output() as its done in FreeBSD.

> - some bugs fixed
> - sysctl net.inet.tcp.tso
> - netstat TSO counter
> 
> If you test this, recompile sysctl and netstat with new kernel
> headers.  Then you can see, whether the diff has an effect on your
> setup.
> 
> # netstat -s -p tcp | grep TSO
> 79 output TSO packets software chopped
> 0 output TSO packets hardware processed
> 840 output TSO packets generated
> 0 output TSO packets dropped

Good idea.

> If you run into problems, disable the feature, and report if the
> problem goes away.  This helps to locate the bug.
> 
> # sysctl net.inet.tcp.tso=0
> net.inet.tcp.tso: 1 -> 0
> 
> I would like to keep the sysctl for now.  It makes performance
> comparison easier.  When we add hardware TSO it can be a quick
> workaround for driver problems.
> 
> When this has been tested a bit, I think it is ready for commit.
> Remaining issues can be handled in tree.  My tests pass, I am not
> aware of TCP problems.

I also did some testing in my setups.  Everything works.

> ok?

Diff looks fine to me, too.

ok jan@

> bluhm
> 
> Index: sys/net/pf.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
> retrieving revision 1.1177
> diff -u -p -r1.1177 pf.c
> --- sys/net/pf.c  8 May 2023 13:22:13 -   1.1177
> +++ sys/net/pf.c  8 May 2023 22:37:04 -
> @@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_
>   goto done;
>   }
>  
> + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
> + m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
> + if (tcp_chopper(m0, , ifp, m0->m_pkthdr.ph_mss) ||
> + if_output_ml(ifp, , sintosa(dst), rt))
> + goto done;
> + tcpstat_inc(tcps_outswtso);
> + goto done;
> + }
> + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
> +
>   /*
>* Too large for interface; fragment if possible.
>* Must be able to put at least 8 bytes per fragment.
> @@ -6594,6 +6604,7 @@ void
>  pf_route6(struct pf_pdesc *pd, struct pf_state *st)
>  {
>   struct mbuf *m0;
> + struct mbuf_list ml;
>   struct sockaddr_in6 *dst, sin6;
>   struct rtentry  *rt = NULL;
>   struct ip6_hdr  *ip6;
> @@ -6685,11 +6696,21 @@ pf_route6(struct pf_pdesc *pd, struct pf
>   goto done;
>   }
>  
> - if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
> + if (m0->m_pkthdr.len <= ifp->if_mtu) {
>   in6_proto_cksum_out(m0, ifp);
>   ifp->if_output(ifp, m0, sin6tosa(dst), rt);
>   goto done;
>   }
> +
> + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
> + m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
> + if (tcp_chopper(m0, , ifp, m0->m_pkthdr.ph_mss) ||
> + if_output_ml(ifp, , sin6tosa(dst), rt))
> + goto done;
> + tcpstat_inc(tcps_outswtso);
> + goto done;
> + }
> + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
>  
>   ip6stat_inc(ip6s_cantfrag);
>   if (st->rt != PF_DUPTO)
> Index: sys/netinet/in.h
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v
> retrieving revision 1.142
> diff -u -p -r1.142 in.h
> --- sys/netinet/in.h  11 Apr 2023 00:45:09 -  1.142
> +++ sys/netinet/in.h  8 May 2023 13:47:48 -
> @@ -780,6 +780,7 @@ int  in_canforward(struct in_addr);
>  int in_cksum(struct mbuf *, int);
>  int in4_cksum(struct mbuf *, u_int8_t, int, int);
>  voidin_proto_cksum_out(struct mbuf *, struct ifnet *);
> +int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
>  voidin_ifdetach(struct ifnet *);
>  int in_mask2len(struct in_addr *);
>  voidin_len2mask(struct in_addr *, int);
> Index: sys/netinet/ip_output.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
> retrieving revision 1.384
> diff -u -p -r1.384 ip_output.c
> --- sys/netinet/ip_output.c   8 May 2023 13:22:13 -   1.384
> +++ sys/netinet/ip_output.c   8 May 2023 22:37:04 -
> @@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, struct
>  static __inline u_int16_t __attribute__((__unused__))
>  in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t);
>  void in_delayed_cksum(struct mbuf *);
> -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
>  
>  int ip_output_ipsec_lookup(struct 

seperate LRO/TSO flags

2023-05-10 Thread Jan Klemkow
Hi,

This diff introduces separate flags for TCP offloading.  We split this
into LRO (large receive offloading) and TSO (TCP segmentation
offloading).  Thus, we are able to turn it on/off separately.

For ifconfig(8) we use "tcprecvoffload" and "tcpsendoffload".  So, the
user has a better insight of what this features are doing.

ok?

bye,
Jan

Index: sbin/ifconfig/ifconfig.8
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.394
diff -u -p -r1.394 ifconfig.8
--- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 -  1.394
+++ sbin/ifconfig/ifconfig.810 May 2023 16:22:30 -
@@ -282,8 +282,10 @@ tag.
 As CSUM_TCPv4, but supports IPv6 datagrams.
 .It Sy CSUM_UDPv6
 As above, for UDP.
+.It Sy LRO
+The device supports TCP large receive offloading (LRO).
 .It Sy TSO
-The device supports TCP segment offloading (TSO).
+The device supports TCP segmentation offloading (TSO).
 .It Sy WOL
 The device supports Wake on LAN (WoL).
 .It Sy hardmtu
@@ -491,10 +493,30 @@ Query and display information and diagno
 modules installed in an interface.
 It is only supported by drivers implementing the necessary functionality
 on hardware which supports it.
-.It Cm tso
+.It Cm tcprecvoffload
+Enable TCP large receive offloading (LRO) if it's supported by the hardware; 
see
+.Cm hwfeatures .
+LRO enabled network interfaces modify received TCP/IP packets.
+This will also affect traffic of upper layer interfaces,
+such as
+.Xr vlan 4 ,
+.Xr aggr 4 ,
+and
+.Xr carp 4 .
+It is not possible to use LRO with interfaces attached to a
+.Xr bridge 4 ,
+.Xr veb 4 ,
+or
+.Xr tpmr 4 .
+Changing this option will re-initialize the network interface.
+.It Cm -tcprecvoffload
+Disable LRO.
+LRO is disabled by default.
+.It Cm tcpsendoffload
 Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see
 .Cm hwfeatures .
-TSO enabled NICs modify received TCP/IP packets.
+TSO enabled network interfaces are able to split large TCP segments into 
smaller
+peaces that fits into MTU and MSS.
 This will also affect traffic of upper layer interfaces,
 such as
 .Xr vlan 4 ,
@@ -506,8 +528,7 @@ It is not possible to use TSO with inter
 .Xr veb 4 ,
 or
 .Xr tpmr 4 .
-Changing this option will re-initialize the network interface.
-.It Cm -tso
+.It Cm -tcpsendoffload
 Disable TSO.
 TSO is disabled by default.
 .It Cm up
Index: sbin/ifconfig/ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.462
diff -u -p -r1.462 ifconfig.c
--- sbin/ifconfig/ifconfig.c8 Mar 2023 04:43:06 -   1.462
+++ sbin/ifconfig/ifconfig.c10 May 2023 15:40:17 -
@@ -126,7 +126,7 @@
 #define HWFEATURESBITS \
"\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4"   \
"\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6"   \
-   "\11CSUM_UDPv6\17TSO\20WOL"
+   "\11CSUM_UDPv6\16LRO\17TSO\20WOL"
 
 struct ifencap {
unsigned int ife_flags;
@@ -469,8 +469,10 @@ const struct   cmd {
{ "-soii",  IFXF_INET6_NOSOII,  0,  setifxflags },
{ "monitor",IFXF_MONITOR,   0,  setifxflags },
{ "-monitor",   -IFXF_MONITOR,  0,  setifxflags },
-   { "tso",IFXF_TSO,   0,  setifxflags },
-   { "-tso",   -IFXF_TSO,  0,  setifxflags },
+   { "tcprecvoffload", IFXF_LRO,   0,  setifxflags },
+   { "-tcprecvoffload", -IFXF_LRO, 0,  setifxflags },
+   { "tcpsendoffload", IFXF_TSO,   0,  setifxflags },
+   { "-tcpsendoffload", -IFXF_TSO, 0,  setifxflags },
 #ifndef SMALL
{ "hwfeatures", NEXTARG0,   0,  printifhwfeatures },
{ "metric", NEXTARG,0,  setifmetric },
@@ -674,7 +676,7 @@ const structcmd {
"\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX"\
"\15LINK0\16LINK1\17LINK2\20MULTICAST"  \
"\23AUTOCONF6TEMP\24MPLS\25WOL\26AUTOCONF6\27INET6_NOSOII"  \
-   "\30AUTOCONF4" "\31MONITOR" "\32TSO"
+   "\30AUTOCONF4" "\31MONITOR" "\32LRO" "\33TSO"
 
 intgetinfo(struct ifreq *, int);
 void   getsock(int);
Index: sys/dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.193
diff -u -p -r1.193 if_ix.c
--- sys/dev/pci/if_ix.c 28 Apr 2023 10:18:57 -  1.193
+++ sys/dev/pci/if_ix.c 10 May 2023 16:32:44 -
@@ -1925,7 +1925,7 @@ ixgbe_setup_interface(struct ix_softc *s
ifp->if_capabilities |= IFCAP_CSUM_IPv4;
 
if (sc->hw.mac.type != ixgbe_mac_82598EB)
-   ifp->if_capabilities |= IFCAP_TSO;
+   ifp->if_capabilities |= IFCAP_LRO;
 
/*
 * 

Re: seperate LRO/TSO flags

2023-05-10 Thread Jan Klemkow
On Wed, May 10, 2023 at 11:13:04AM -0600, Todd C. Miller wrote:
> On Wed, 10 May 2023 19:03:58 +0200, Jan Klemkow wrote:
> > This diff introduces separate flags for TCP offloading.  We split this
> > into LRO (large receive offloading) and TSO (TCP segmentation
> > offloading).  Thus, we are able to turn it on/off separately.
> >
> > For ifconfig(8) we use "tcprecvoffload" and "tcpsendoffload".  So, the
> > user has a better insight of what this features are doing.
> 
> Is it possible to control these at the address family level?  In
> other words, is it possible to enable "tcprecvoffload" and
> "tcpsendoffload" for inet but not inet6 or vice versa?

For tcprecvoffload and ix(4) it's not possible to enable/disable it per
address family.  Its just one flag for the hardware.

For tcpsendoffload its possible, but I won't do that till its necessary.

Why would you want to differentiate the address families here?

bye,
Jan



Re: seperate LRO/TSO flags

2023-05-15 Thread Jan Klemkow
On Sat, May 13, 2023 at 04:44:18PM +0200, Christian Weisgerber wrote:
> Jan Klemkow:
> 
> > This diff introduces separate flags for TCP offloading.  We split this
> > into LRO (large receive offloading) and TSO (TCP segmentation
> > offloading).  Thus, we are able to turn it on/off separately.
> 
> Wait, why do we even have a knob for TSO?
> 
> We specifically decided not to have a knob for checksum offloading,
> because it should just work out of the box, and if it doesn't, then
> it should be disabled by the driver.  It should not be the admin's
> task to figure out if the implementation is broken and to fiddle
> with the knobs (hi, FreeBSD!).
> 
> I would assume that line of thinking extends to TSO.

You are right.  This is reflected in the current state of the diff
below.

We just need a knob for TCP Large Receive Offload (LRO) because it
changes the TCP segments.  You may want to avoid this on a forwarding
router.

ok?

Thanks,
Jan

Index: sbin/ifconfig/ifconfig.8
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.394
diff -u -p -r1.394 ifconfig.8
--- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 -  1.394
+++ sbin/ifconfig/ifconfig.812 May 2023 06:22:35 -
@@ -282,8 +282,18 @@ tag.
 As CSUM_TCPv4, but supports IPv6 datagrams.
 .It Sy CSUM_UDPv6
 As above, for UDP.
-.It Sy TSO
-The device supports TCP segment offloading (TSO).
+.It Sy LRO
+The device supports TCP large receive offload (LRO).
+.It Sy TSOv4
+The device supports IPv4 TCP segmentation offload (TSO).
+TSO is used by default.
+Use the
+.Xr sysctl 8
+variable
+.Va net.inet.tcp.tso
+to disable this feature.
+.It Sy TSOv6
+As above, for IPv6.
 .It Sy WOL
 The device supports Wake on LAN (WoL).
 .It Sy hardmtu
@@ -491,25 +501,25 @@ Query and display information and diagno
 modules installed in an interface.
 It is only supported by drivers implementing the necessary functionality
 on hardware which supports it.
-.It Cm tso
-Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see
+.It Cm tcprecvoffload
+Enable TCP large receive offload (LRO) if it's supported by the hardware; see
 .Cm hwfeatures .
-TSO enabled NICs modify received TCP/IP packets.
+LRO enabled network interfaces modify received TCP/IP packets.
 This will also affect traffic of upper layer interfaces,
 such as
 .Xr vlan 4 ,
 .Xr aggr 4 ,
 and
 .Xr carp 4 .
-It is not possible to use TSO with interfaces attached to a
+It is not possible to use LRO with interfaces attached to a
 .Xr bridge 4 ,
 .Xr veb 4 ,
 or
 .Xr tpmr 4 .
 Changing this option will re-initialize the network interface.
-.It Cm -tso
-Disable TSO.
-TSO is disabled by default.
+.It Cm -tcprecvoffload
+Disable LRO.
+LRO is disabled by default.
 .It Cm up
 Mark an interface
 .Dq up .
Index: sbin/ifconfig/ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.462
diff -u -p -r1.462 ifconfig.c
--- sbin/ifconfig/ifconfig.c8 Mar 2023 04:43:06 -   1.462
+++ sbin/ifconfig/ifconfig.c11 May 2023 17:33:55 -
@@ -126,7 +126,7 @@
 #define HWFEATURESBITS \
"\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4"   \
"\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6"   \
-   "\11CSUM_UDPv6\17TSO\20WOL"
+   "\11CSUM_UDPv6\15LRO\16TSOv4\17TSOv6\20WOL"
 
 struct ifencap {
unsigned int ife_flags;
@@ -469,8 +469,8 @@ const structcmd {
{ "-soii",  IFXF_INET6_NOSOII,  0,  setifxflags },
{ "monitor",IFXF_MONITOR,   0,  setifxflags },
{ "-monitor",   -IFXF_MONITOR,  0,  setifxflags },
-   { "tso",IFXF_TSO,   0,  setifxflags },
-   { "-tso",   -IFXF_TSO,  0,  setifxflags },
+   { "tcprecvoffload", IFXF_LRO,   0,  setifxflags },
+   { "-tcprecvoffload", -IFXF_LRO, 0,  setifxflags },
 #ifndef SMALL
{ "hwfeatures", NEXTARG0,   0,  printifhwfeatures },
{ "metric", NEXTARG,0,  setifmetric },
@@ -674,7 +674,7 @@ const structcmd {
"\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX"\
"\15LINK0\16LINK1\17LINK2\20MULTICAST"  \
"\23AUTOCONF6TEMP\24MPLS\25WOL\26AUTOCONF6\27INET6_NOSOII"  \
-   "\30AUTOCONF4" "\31MONITOR" "\32TSO"
+   "\30AUTOCONF4" "\31MONITOR" "\32LRO"
 
 intgetinfo(struct ifreq *, int);
 void   getsock(int);
Index: sys/dev/pci/if_ix.c

Re: seperate LRO/TSO flags

2023-05-15 Thread Jan Klemkow
On Mon, May 15, 2023 at 11:40:20AM +0200, Alexander Bluhm wrote:
> On Mon, May 15, 2023 at 09:34:21AM +0200, Jan Klemkow wrote:
> > @@ -251,12 +251,16 @@ struct if_status_description {
> >  #defineIFCAP_VLAN_HWTAGGING0x0020  /* hardware VLAN tag 
> > support */
> >  #defineIFCAP_CSUM_TCPv60x0080  /* can do IPv6/TCP 
> > checksums */
> >  #defineIFCAP_CSUM_UDPv60x0100  /* can do IPv6/UDP 
> > checksums */
> > -#defineIFCAP_TSO   0x4000  /* TCP segment 
> > offloading */
> > +#defineIFCAP_LRO   0x1000  /* TCP large recv 
> > offload */
> > +#defineIFCAP_TSOv4 0x2000  /* TCP segmentation 
> > offload */
> > +#defineIFCAP_TSOv6 0x4000  /* TCP segmentation 
> > offload */
> >  #defineIFCAP_WOL   0x8000  /* can do wake on lan */
> 
> I would prefer to keep the numbers of IFCAP_TSO/IFCAP_LRO as this
> is just a naming error.  Then we have less confusion during the
> ifconfig transition phase.
> 
> +#define IFCAP_TSOv4  0x1000
> +#define IFCAP_TSOv6  0x2000
> -#define IFCAP_TSO0x4000
> +#define IFCAP_LRO0x4000
> 
> > +#define IFCAP_TSO  (IFCAP_TSOv4 | IFCAP_TSOv6)
> > +
> 
> Could you please remove this chunk and expand it, where is used?
> This one more define does not make the code clearer.  And this flag
> IFCAP_TSO had a different meaning before renaming.  When it is not
> introduced again, the compiler makes sure that no renaming was
> forgotten.

done

Also:

 - updated the diff to the current source state
 - improved the vlan(4) capability handling

@dlg: Whats your opinion about this diff?

ok?

Thanks,
Jan

Index: sbin/ifconfig/ifconfig.8
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.394
diff -u -p -r1.394 ifconfig.8
--- sbin/ifconfig/ifconfig.826 Apr 2023 02:38:08 -  1.394
+++ sbin/ifconfig/ifconfig.815 May 2023 18:46:48 -
@@ -282,8 +282,18 @@ tag.
 As CSUM_TCPv4, but supports IPv6 datagrams.
 .It Sy CSUM_UDPv6
 As above, for UDP.
-.It Sy TSO
-The device supports TCP segment offloading (TSO).
+.It Sy LRO
+The device supports TCP large receive offload (LRO).
+.It Sy TSOv4
+The device supports IPv4 TCP segmentation offload (TSO).
+TSO is used by default.
+Use the
+.Xr sysctl 8
+variable
+.Va net.inet.tcp.tso
+to disable this feature.
+.It Sy TSOv6
+As above, for IPv6.
 .It Sy WOL
 The device supports Wake on LAN (WoL).
 .It Sy hardmtu
@@ -491,25 +501,25 @@ Query and display information and diagno
 modules installed in an interface.
 It is only supported by drivers implementing the necessary functionality
 on hardware which supports it.
-.It Cm tso
-Enable TCP segmentation offloading (TSO) if it's supported by the hardware; see
+.It Cm tcprecvoffload
+Enable TCP large receive offload (LRO) if it's supported by the hardware; see
 .Cm hwfeatures .
-TSO enabled NICs modify received TCP/IP packets.
+LRO enabled network interfaces modify received TCP/IP packets.
 This will also affect traffic of upper layer interfaces,
 such as
 .Xr vlan 4 ,
 .Xr aggr 4 ,
 and
 .Xr carp 4 .
-It is not possible to use TSO with interfaces attached to a
+It is not possible to use LRO with interfaces attached to a
 .Xr bridge 4 ,
 .Xr veb 4 ,
 or
 .Xr tpmr 4 .
 Changing this option will re-initialize the network interface.
-.It Cm -tso
-Disable TSO.
-TSO is disabled by default.
+.It Cm -tcprecvoffload
+Disable LRO.
+LRO is disabled by default.
 .It Cm up
 Mark an interface
 .Dq up .
Index: sbin/ifconfig/ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.463
diff -u -p -r1.463 ifconfig.c
--- sbin/ifconfig/ifconfig.c12 May 2023 18:24:13 -  1.463
+++ sbin/ifconfig/ifconfig.c15 May 2023 20:27:51 -
@@ -126,7 +126,7 @@
 #define HWFEATURESBITS \
"\024\1CSUM_IPv4\2CSUM_TCPv4\3CSUM_UDPv4"   \
"\5VLAN_MTU\6VLAN_HWTAGGING\10CSUM_TCPv6"   \
-   "\11CSUM_UDPv6\17TSO\20WOL"
+   "\11CSUM_UDPv6\15TSOv4\16TSOv6\17LSO\20WOL"
 
 struct ifencap {
unsigned int ife_flags;
@@ -469,8 +469,8 @@ const structcmd {
{ "-soii",  IFXF_INET6_NOSOII,  0,  setifxflags },
{ "monitor",IFXF_MONITOR,   0,  setifxflags },
{ "-monitor",   -IFXF_MONITOR,  0,  setifxflags },
-   { "tso",IFXF_TSO,   0,  setifxflags },
-   { "-tso",   -IFXF_TSO,  0,  setifxflags },
+ 

Add LRO counter in ix(4)

2023-05-16 Thread Jan Klemkow
Hi,

This diff introduces new counters for LRO packets, we get from the
network interface.  It shows, how many packets the network interface has
coalesced into LRO packets.

In followup diff, this packet counter will also be used to set the
ph_mss variable to valid value.  So, the stack is able to forward or
redirect this kind of packets.

ok?

bye,
Jan

Index: usr.bin/netstat/inet.c
===
RCS file: /cvs/src/usr.bin/netstat/inet.c,v
retrieving revision 1.175
diff -u -p -r1.175 inet.c
--- usr.bin/netstat/inet.c  10 May 2023 12:07:17 -  1.175
+++ usr.bin/netstat/inet.c  16 May 2023 17:55:20 -
@@ -412,6 +412,10 @@ tcp_stats(char *name)
p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n");
p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n");
p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n");
+   p(tcps_inhwlro, "\t\t%u input LRO generated packet%s from hardware\n");
+   p(tcps_inpktlro, "\t\t%u input LRO coalesced packet%s from hardware\n");
+   p(tcps_inbadlro, "\t\t%u input bad LRO packet%s from hardware\n");
+
p(tcps_rcvtotal, "\t%u packet%s received\n");
p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu 
byte%s)\n");
p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n");
Index: sys/dev/pci/if_ix.c
===
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.194
diff -u -p -r1.194 if_ix.c
--- sys/dev/pci/if_ix.c 16 May 2023 14:32:54 -  1.194
+++ sys/dev/pci/if_ix.c 16 May 2023 18:49:33 -
@@ -3175,12 +3175,23 @@ ixgbe_rxeof(struct rx_ring *rxr)
sendmp = rxbuf->fmp;
rxbuf->buf = rxbuf->fmp = NULL;
 
-   if (sendmp != NULL) /* secondary frag */
+   if (sendmp != NULL) { /* secondary frag */
sendmp->m_pkthdr.len += mp->m_len;
-   else {
+
+   /*
+* This function iterates over interleaved descriptors.
+* Thus, we reuse ph_mss as global segment counter per
+* TCP connection, insteat of introducing a new variable
+* in m_pkthdr.
+*/
+   if (rsccnt)
+   sendmp->m_pkthdr.ph_mss += rsccnt - 1;
+   } else {
/* first desc of a non-ps chain */
sendmp = mp;
sendmp->m_pkthdr.len = mp->m_len;
+   if (rsccnt)
+   sendmp->m_pkthdr.ph_mss = rsccnt - 1;
 #if NVLAN > 0
if (sc->vlan_stripping && staterr & IXGBE_RXD_STAT_VP) {
sendmp->m_pkthdr.ether_vtag = vtag;
@@ -3200,6 +3211,21 @@ ixgbe_rxeof(struct rx_ring *rxr)
if (hashtype != IXGBE_RXDADV_RSSTYPE_NONE) {
sendmp->m_pkthdr.ph_flowid = hash;
SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
+   }
+
+   if (sendmp->m_pkthdr.ph_mss == 1)
+   sendmp->m_pkthdr.ph_mss = 0;
+
+   if (sendmp->m_pkthdr.ph_mss > 0) {
+   struct ether_extracted ext;
+   uint16_t pkts = sendmp->m_pkthdr.ph_mss;
+
+   ether_extract_headers(sendmp, );
+   if (ext.tcp)
+   tcpstat_inc(tcps_inhwlro);
+   else
+   tcpstat_inc(tcps_inbadlro);
+   tcpstat_add(tcps_inpktlro, pkts);
}
 
ml_enqueue(, sendmp);
Index: sys/dev/pci/ixgbe.h
===
RCS file: /cvs/src/sys/dev/pci/ixgbe.h,v
retrieving revision 1.33
diff -u -p -r1.33 ixgbe.h
--- sys/dev/pci/ixgbe.h 8 Feb 2022 03:38:00 -   1.33
+++ sys/dev/pci/ixgbe.h 16 May 2023 17:55:20 -
@@ -60,12 +60,18 @@
 
 #include 
 #include 
+#include 
 #include 
 
+struct tdb;
+
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #if NBPFILTER > 0
 #include 
Index: sys/netinet/tcp_usrreq.c
===
RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.218
diff -u -p -r1.218 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c10 May 2023 12:07:16 -  1.218
+++ sys/netinet/tcp_usrreq.c16 May 2023 17:55:20 -
@@ -1340,6 +1340,9 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
ASSIGN(tcps_outhwtso);
ASSIGN(tcps_outpkttso);
ASSIGN(tcps_outbadtso);
+   ASSIGN(tcps_inhwlro);
+   ASSIGN(tcps_inpktlro);
+   

Re: ifconfig: SIOCSIFFLAGS: device not configured

2023-05-12 Thread Jan Klemkow
On Thu, May 11, 2023 at 09:17:37PM +0200, Hrvoje Popovski wrote:
> is it possible to change "ifconfig: SIOCSIFFLAGS: device not configured"
> message that it has an interface name in it, something like:
> ifconfig pfsync0: SIOCSIFFLAGS: device not configured <- in my case.
> 
> I have many vlans and static routes in my setup and while testing some
> diffs, it took me a long time to figure out which interface the message
> was coming from.
> 
> starting network
> add host 10.11.2.69: gateway 10.12.253.225
> add host 10.250.184.36: gateway 10.12.253.225
> add host 9.9.9.9: gateway 10.12.253.225
> add host 10.11.1.234: gateway 10.12.253.225
> add host 10.11.1.235: gateway 10.12.253.225
> add host 10.11.255.123: gateway 10.12.253.225
> add net 10.101/16: gateway 10.12.253.225
> ifconfig: SIOCSIFFLAGS: Device not configured
> add net 16/8: gateway 192.168.100.112
> add net a192:a168:a100:a100::/64: gateway 192:168:1000:1000::112
> add net 48/8: gateway 192.168.111.112
> add net a192:a168:a111:a111::/64: gateway 192:168::::112
> reordering: ld.so libc libcrypto sshd.
> 
> or when I'm doing sh /etc/netstart and have aggr interface
> 
> ifconfig: SIOCSTRUNKPORT: Device busy
> ifconfig: SIOCSTRUNKPORT: Device busy
> 
> to change
> ifconfig ix0: SIOCSTRUNKPORT: Device busy
> ifconfig ix1: SIOCSTRUNKPORT: Device busy

I also run into this issue sometimes.  So, here is diff that prints the
interface name in front of most of these anonym error messages.

ok?

Jan

Index: ifconfig.c
===
RCS file: /cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.462
diff -u -p -r1.462 ifconfig.c
--- ifconfig.c  8 Mar 2023 04:43:06 -   1.462
+++ ifconfig.c  12 May 2023 14:14:01 -
@@ -1070,14 +1070,14 @@ printgroup(char *groupname, int ifaliase
errno == ENOENT)
return (-1);
else
-   err(1, "SIOCGIFGMEMB");
+   err(1, "%s: SIOCGIFGMEMB", ifgr.ifgr_name);
}
 
len = ifgr.ifgr_len;
if ((ifgr.ifgr_groups = calloc(1, len)) == NULL)
err(1, "printgroup");
if (ioctl(sock, SIOCGIFGMEMB, (caddr_t)) == -1)
-   err(1, "SIOCGIFGMEMB");
+   err(1, "%s: SIOCGIFGMEMB", ifgr.ifgr_name);
 
for (ifg = ifgr.ifgr_groups; ifg && len >= sizeof(struct ifg_req);
ifg++) {
@@ -1099,7 +1099,7 @@ printgroupattribs(char *groupname)
bzero(, sizeof(ifgr));
strlcpy(ifgr.ifgr_name, groupname, sizeof(ifgr.ifgr_name));
if (ioctl(sock, SIOCGIFGATTR, (caddr_t)) == -1)
-   err(1, "SIOCGIFGATTR");
+   err(1, "%s: SIOCGIFGATTR", ifgr.ifgr_name);
 
printf("%s:", groupname);
printf(" carp demote count %d", ifgr.ifgr_attrib.ifg_carp_demoted);
@@ -1122,7 +1122,8 @@ setgroupattribs(char *groupname, int arg
if (argc > 1) {
neg = strtonum(argv[1], 0, 128, );
if (errstr)
-   errx(1, "invalid carp demotion: %s", errstr);
+   errx(1, "%s: invalid carp demotion: %s", ifgr.ifgr_name,
+   errstr);
}
 
if (p[0] == '-') {
@@ -1135,7 +1136,7 @@ setgroupattribs(char *groupname, int arg
usage();
 
if (ioctl(sock, SIOCSIFGATTR, (caddr_t)) == -1)
-   err(1, "SIOCSIFGATTR");
+   err(1, "%s: SIOCSIFGATTR", ifgr.ifgr_name);
 }
 
 void
@@ -1249,7 +1250,7 @@ clone_create(const char *addr, int param
 
(void) strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
if (ioctl(sock, SIOCIFCREATE, ) == -1)
-   err(1, "SIOCIFCREATE");
+   err(1, "%s: SIOCIFCREATE", ifr.ifr_name);
 }
 
 void
@@ -1258,7 +1259,7 @@ clone_destroy(const char *addr, int para
 
(void) strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
if (ioctl(sock, SIOCIFDESTROY, ) == -1)
-   err(1, "SIOCIFDESTROY");
+   err(1, "%s: SIOCIFDESTROY", ifr.ifr_name);
 }
 
 struct if_clonereq *
@@ -1422,7 +1423,7 @@ setifflags(const char *vname, int value)
bcopy((char *), (char *)_ifr, sizeof(struct ifreq));
 
if (ioctl(sock, SIOCGIFFLAGS, (caddr_t)_ifr) == -1)
-   err(1, "SIOCGIFFLAGS");
+   err(1, "%s: SIOCGIFFLAGS", my_ifr.ifr_name);
(void) strlcpy(my_ifr.ifr_name, ifname, sizeof(my_ifr.ifr_name));
flags = my_ifr.ifr_flags;
 
@@ -1433,7 +1434,7 @@ setifflags(const char *vname, int value)
flags |= value;
my_ifr.ifr_flags = flags;
if (ioctl(sock, SIOCSIFFLAGS, (caddr_t)_ifr) == -1)
-   err(1, "SIOCSIFFLAGS");
+   err(1, "%s: SIOCSIFFLAGS", my_ifr.ifr_name);
 }
 
 void
@@ -1444,7 +1445,7 @@ setifxflags(const char *vname, int value
bcopy((char *), (char *)_ifr, sizeof(struct ifreq));
 
if (ioctl(sock, SIOCGIFXFLAGS, (caddr_t)_ifr) == -1)
-   

Re: Add LRO counter in ix(4)

2023-05-18 Thread Jan Klemkow
On Thu, May 18, 2023 at 12:01:44AM +0200, Alexander Bluhm wrote:
> On Tue, May 16, 2023 at 09:11:48PM +0200, Jan Klemkow wrote:
> > @@ -412,6 +412,10 @@ tcp_stats(char *name)
> > p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n");
> > p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n");
> > p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n");
> > +   p(tcps_inhwlro, "\t\t%u input LRO generated packet%s from hardware\n");
> > +   p(tcps_inpktlro, "\t\t%u input LRO coalesced packet%s from hardware\n");
> 
> ... coalesced packet%s by hardware

done

> > +   p(tcps_inbadlro, "\t\t%u input bad LRO packet%s from hardware\n");
> > +
> 
> Move this down to the "packets received" section.  You included it
> in "packets sent".

done

> > +   /*
> > +* This function iterates over interleaved descriptors.
> > +* Thus, we reuse ph_mss as global segment counter per
> > +* TCP connection, insteat of introducing a new variable
> 
> s/insteat/instead/

done

ok?

Thanks,
Jan

diff --git a/sys/dev/pci/if_ix.c b/sys/dev/pci/if_ix.c
index 4119a2416dc..924a6d63236 100644
--- a/sys/dev/pci/if_ix.c
+++ b/sys/dev/pci/if_ix.c
@@ -3214,12 +3214,23 @@ ixgbe_rxeof(struct rx_ring *rxr)
sendmp = rxbuf->fmp;
rxbuf->buf = rxbuf->fmp = NULL;
 
-   if (sendmp != NULL) /* secondary frag */
+   if (sendmp != NULL) { /* secondary frag */
sendmp->m_pkthdr.len += mp->m_len;
-   else {
+
+   /*
+* This function iterates over interleaved descriptors.
+* Thus, we reuse ph_mss as global segment counter per
+* TCP connection, instead of introducing a new variable
+* in m_pkthdr.
+*/
+   if (rsccnt)
+   sendmp->m_pkthdr.ph_mss += rsccnt - 1;
+   } else {
/* first desc of a non-ps chain */
sendmp = mp;
sendmp->m_pkthdr.len = mp->m_len;
+   if (rsccnt)
+   sendmp->m_pkthdr.ph_mss = rsccnt - 1;
 #if NVLAN > 0
if (sc->vlan_stripping && staterr & IXGBE_RXD_STAT_VP) {
sendmp->m_pkthdr.ether_vtag = vtag;
@@ -3241,6 +3252,21 @@ ixgbe_rxeof(struct rx_ring *rxr)
SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
}
 
+   if (sendmp->m_pkthdr.ph_mss == 1)
+   sendmp->m_pkthdr.ph_mss = 0;
+
+   if (sendmp->m_pkthdr.ph_mss > 0) {
+   struct ether_extracted ext;
+   uint16_t pkts = sendmp->m_pkthdr.ph_mss;
+
+   ether_extract_headers(sendmp, );
+   if (ext.tcp)
+   tcpstat_inc(tcps_inhwlro);
+   else
+   tcpstat_inc(tcps_inbadlro);
+   tcpstat_add(tcps_inpktlro, pkts);
+   }
+
ml_enqueue(, sendmp);
}
 next_desc:
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 120e3cc5ea7..3970636cde1 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1340,6 +1340,9 @@ tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void 
*newp)
ASSIGN(tcps_outhwtso);
ASSIGN(tcps_outpkttso);
ASSIGN(tcps_outbadtso);
+   ASSIGN(tcps_inhwlro);
+   ASSIGN(tcps_inpktlro);
+   ASSIGN(tcps_inbadlro);
 
 #undef ASSIGN
 
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 0a9630d719f..e706fedd0e7 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -447,6 +447,9 @@ struct  tcpstat {
u_int32_t tcps_outhwtso;/* output tso processed by hardware */
u_int32_t tcps_outpkttso;   /* packets generated by tso */
u_int32_t tcps_outbadtso;   /* output tso failed, packet dropped */
+   u_int32_t tcps_inhwlro; /* input lro from hardware */
+   u_int32_t tcps_inpktlro;/* packets coalessed by hardware lro */
+   u_int32_t tcps_inbadlro;/* input bad lro packets from hardware 
*/
 };
 
 /*
@@ -625,6 +628,9 @@ enum tcpstat_counters {
tcps_outhwtso,
tcps_outpkttso,
tcps_outbadtso,
+   tcps_inhwlro,
+   tcps_inpktlro,
+   tcps_inbadlro,
tcps_ncounters,
 };
 
diff --git a/usr.bin/

Fix wrong interface mtu in tcp_mss

2023-05-19 Thread Jan Klemkow
Hi,

We use the wrong interface and mtu in tcp_mss() to calculate the mss if
the destination address points is a local address.  In ip_output() we
use the correct interface and its mtu.

This limits the mss to 1448 if the mtu of the interface it 1500,
instead of using a local 32k mss.

The bigger issue is: local bulk traffic with the current TSO
implementation is broken.  tcp_output() creates TSO packets with an mss
smaller then 32k and ip_output() calls if_output instead of
tcp_if_output_tso() because it fits into the mtu check of lo0.

This diff takes the same logic to pick the interface in tcp_mss() as its
done in ip_output() and fixes both issues.

ok?

bye,
Jan

Index: netinet/tcp_input.c
===
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.387
diff -u -p -r1.387 tcp_input.c
--- netinet/tcp_input.c 14 Mar 2023 00:24:05 -  1.387
+++ netinet/tcp_input.c 19 May 2023 17:22:47 -
@@ -2805,7 +2805,11 @@ tcp_mss(struct tcpcb *tp, int offer)
if (rt == NULL)
goto out;
 
-   ifp = if_get(rt->rt_ifidx);
+   if (ISSET(rt->rt_flags, RTF_LOCAL))
+   ifp = if_get(rtable_loindex(inp->inp_rtableid));
+   else
+   ifp = if_get(rt->rt_ifidx);
+
if (ifp == NULL)
goto out;
 



<    1   2   3