Re: patch unveil fail

2023-10-25 Thread Alexander Bluhm
On Wed, Oct 25, 2023 at 07:00:28PM +0200, Omar Polo wrote:
> On 2023/10/25 13:38:37 +0200, Alexander Bluhm  wrote:
> > @@ -213,11 +214,27 @@ main(int argc, char *argv[])
> > perror("unveil");
> > my_exit(2);
> > }
> > -   if (filearg[0] != NULL)
> > +   if (filearg[0] != NULL) {
> > +   char *origdir;
> > +
> > if (unveil(filearg[0], "rwc") == -1) {
> > perror("unveil");
> > my_exit(2);
> > }
> > +   if ((origdir = dirname(filearg[0])) == NULL) {
> 
> Not sure if we're interested in it, but dirname(3) may theoretically
> alter the passed string.  Our dirname doesn't do it, but per POSIX it
> can, IIUC.  This could cause issues since filearg[0] is used later.
> 
> If we care about portability here, we should pass a copy to dirname.
> Don't know if we care though.

unveil(2) is not portable code anyway.  And dirname(3) is only used
for that.
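
For reference, a portable variant would copy the path first.  A
minimal sketch, not part of the diff (buffer size handling assumed):

	char tmp[PATH_MAX], *origdir;

	if (strlcpy(tmp, filearg[0], sizeof(tmp)) >= sizeof(tmp) ||
	    (origdir = dirname(tmp)) == NULL) {
		perror("dirname");
		my_exit(2);
	}
	/* filearg[0] stays untouched even if dirname(3) modifies tmp */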

> > +   perror("dirname");
> > +   my_exit(2);
> > +   }
> > +   if (unveil(origdir, "rwc") == -1) {
> > +   perror("unveil");
> > +   my_exit(2);
> > +   }
> > +   } else {
> > +   if (unveil(".", "rwc") == -1) {
> > +   perror("unveil");
> > +   my_exit(2);
> > +   }
> > +   }
> > if (filearg[1] != NULL)
> > if (unveil(filearg[1], "r") == -1) {
> > perror("unveil");



Re: IPv4 on ix(4) slow/nothing - 7.4

2023-10-25 Thread Alexander Bluhm
On Thu, Oct 19, 2023 at 04:04:26PM +0200, Jan Klemkow wrote:
> On Wed, Oct 18, 2023 at 08:53:44PM +0200, Alexander Bluhm wrote:
> > On Wed, Oct 18, 2023 at 08:19:29PM +0200, Mischa wrote:
> > > It's indeed something like that: ix -> vlan (tagged) -> veb
> > 
> > When vlan is added to veb, kernel should disable LRO on ix.
> > All testing before release did not find this code path :-(
> > 
> > Is it possible to add vlan to veb first, and then add or change the
> > vlan parent to ix?  If it works, that should also disable LRO.
> > 
> > Jan said he will have a look tomorrow.
> > 
> > trunk, carp, ... in veb or bridge might have the same issue.
> 
> First round of fixes for vlan(4), vxlan(4), nvgre(4) and bpe(4).

Don't know much about nvgre(4) and bpe(4).

Maybe we should call ifsetlro(ifp0, 0) unconditionally in
vxlan_set_parent().  Otherwise we may get large UDP packets carrying
large TCP frames.

For vlan(4) this diff is correct.  For the others it is at least
an improvement.

OK bluhm@

> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.708
> diff -u -p -r1.708 if.c
> --- net/if.c  16 Sep 2023 09:33:27 -  1.708
> +++ net/if.c  19 Oct 2023 13:03:33 -
> @@ -3243,6 +3243,17 @@ ifsetlro(struct ifnet *ifp, int on)
>   struct ifreq ifrq;
>   int error = 0;
>   int s = splnet();
> + struct if_parent parent;
> +
> + memset(&parent, 0, sizeof(parent));
> + if ((*ifp->if_ioctl)(ifp, SIOCGIFPARENT, (caddr_t)&parent) != -1) {
> + struct ifnet *ifp0 = if_unit(parent.ifp_parent);
> +
> + if (ifp0 != NULL) {
> + ifsetlro(ifp0, on);
> + if_put(ifp0);
> + }
> + }
>  
>   if (!ISSET(ifp->if_capabilities, IFCAP_LRO)) {
>   error = ENOTSUP;
> Index: net/if_bpe.c
> ===
> RCS file: /cvs/src/sys/net/if_bpe.c,v
> retrieving revision 1.19
> diff -u -p -r1.19 if_bpe.c
> --- net/if_bpe.c  8 Nov 2021 04:54:44 -   1.19
> +++ net/if_bpe.c  19 Oct 2023 13:20:18 -
> @@ -631,6 +631,9 @@ bpe_set_parent(struct bpe_softc *sc, con
>   goto put;
>   }
>  
> + if (ether_brport_isset(ifp))
> + ifsetlro(ifp0, 0);
> +
>   /* commit */
>   sc->sc_key.k_if = ifp0->if_index;
>   etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
> Index: net/if_gre.c
> ===
> RCS file: /cvs/src/sys/net/if_gre.c,v
> retrieving revision 1.174
> diff -u -p -r1.174 if_gre.c
> --- net/if_gre.c  13 May 2023 13:35:17 -  1.174
> +++ net/if_gre.c  19 Oct 2023 13:24:56 -
> @@ -3544,6 +3544,9 @@ nvgre_set_parent(struct nvgre_softc *sc,
>   return (EPROTONOSUPPORT);
>   }
>  
> + if (ether_brport_isset(&sc->sc_ac.ac_if))
> + ifsetlro(ifp0, 0);
> +
>   /* commit */
>   sc->sc_ifp0 = ifp0->if_index;
>   if_put(ifp0);
> Index: net/if_vlan.c
> ===
> RCS file: /cvs/src/sys/net/if_vlan.c,v
> retrieving revision 1.215
> diff -u -p -r1.215 if_vlan.c
> --- net/if_vlan.c 16 May 2023 14:32:54 -  1.215
> +++ net/if_vlan.c 19 Oct 2023 11:08:23 -
> @@ -937,6 +937,9 @@ vlan_set_parent(struct vlan_softc *sc, c
>   if (error != 0)
>   goto put;
>  
> + if (ether_brport_isset(ifp))
> + ifsetlro(ifp0, 0);
> +
>   /* commit */
>   sc->sc_ifidx0 = ifp0->if_index;
>   if (!ISSET(sc->sc_flags, IFVF_LLADDR))
> Index: net/if_vxlan.c
> ===
> RCS file: /cvs/src/sys/net/if_vxlan.c,v
> retrieving revision 1.93
> diff -u -p -r1.93 if_vxlan.c
> --- net/if_vxlan.c	3 Aug 2023 09:49:08 -	1.93
> +++ net/if_vxlan.c	19 Oct 2023 13:18:47 -
> @@ -1582,6 +1582,9 @@ vxlan_set_parent(struct vxlan_softc *sc,
>   goto put;
>   }
>  
> + if (ether_brport_isset(ifp))
> + ifsetlro(ifp0, 0);
> +
>   /* commit */
>   sc->sc_if_index0 = ifp0->if_index;
>   etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);



patch unveil fail

2023-10-25 Thread Alexander Bluhm
Hi,

Since 7.4, patch(1) does not work if an explicit patchfile is given
on the command line.

https://marc.info/?l=openbsd-cvs=168941770509379=2

root@ot14:.../~# patch /usr/src/usr.bin/patch/patch.c patch-unveil.diff
Hmm...  Looks like a unified diff to me...
The text leading up to this was:
--
|Index: patch.c
|===
|RCS file: /data/mirror/openbsd/cvs/src/usr.bin/patch/patch.c,v
|diff -u -p -r1.74 patch.c
|--- patch.c	19 Jul 2023 13:26:20 -	1.74
|+++ patch.c	24 Oct 2023 17:13:28 -
--
Patching file /usr/src/usr.bin/patch/patch.c using Plan A...
Hunk #1 succeeded at 32.
Hunk #2 succeeded at 214.
Hunk #3 succeeded at 245.
Can't backup /usr/src/usr.bin/patch/patch.c, output is in 
/tmp/patchoorjYymLKcM: No such file or directory
done

A backup file should be created in the directory of the original
file, but only the current directory is unveiled.  Then the patched
file is created in /tmp and does not replace the original file
in place.

Diff below fixes it.

ok?

bluhm

Index: patch.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/patch/patch.c,v
diff -u -p -r1.74 patch.c
--- patch.c 19 Jul 2023 13:26:20 -  1.74
+++ patch.c 24 Oct 2023 17:13:28 -
@@ -32,6 +32,7 @@
 
 #include 
 #include 
+#include <libgen.h>
 #include 
 #include 
 #include 
@@ -213,11 +214,27 @@ main(int argc, char *argv[])
perror("unveil");
my_exit(2);
}
-   if (filearg[0] != NULL)
+   if (filearg[0] != NULL) {
+   char *origdir;
+
if (unveil(filearg[0], "rwc") == -1) {
perror("unveil");
my_exit(2);
}
+   if ((origdir = dirname(filearg[0])) == NULL) {
+   perror("dirname");
+   my_exit(2);
+   }
+   if (unveil(origdir, "rwc") == -1) {
+   perror("unveil");
+   my_exit(2);
+   }
+   } else {
+   if (unveil(".", "rwc") == -1) {
+   perror("unveil");
+   my_exit(2);
+   }
+   }
if (filearg[1] != NULL)
if (unveil(filearg[1], "r") == -1) {
perror("unveil");
@@ -228,10 +245,6 @@ main(int argc, char *argv[])
perror("unveil");
my_exit(2);
}
-   if (unveil(".", "rwc") == -1) {
-   perror("unveil");
-   my_exit(2);
-   }
if (*rejname != '\0')
if (unveil(rejname, "rwc") == -1) {
perror("unveil");



OpenBSD Errata: October 25, 2023 (xserver msplit)

2023-10-25 Thread Alexander Bluhm
Errata patches for X11 server and kernel mbuf handling have been
released for OpenBSD 7.3 and 7.4.

Binary updates for the amd64, arm64 and i386 platforms are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata73.html
  https://www.openbsd.org/errata74.html



Re: nfsd: don't clear SB_NOINTR flag

2023-10-20 Thread Alexander Bluhm
On Mon, Oct 16, 2023 at 10:17:50PM +0300, Vitaliy Makkoveev wrote:
> This socket comes from userland, so this flag is never set. This makes
> SB_NOINTR flag immutable, because we only set this bit on NFS client
> socket buffers for all it's lifetime.
> 
> I want to do this because this flag modifies sblock() behaviour and it's
> not clear which lock should protect it for standalone sblock().
> 
> ok?

This code came here:
https://svnweb.freebsd.org/csrg/sys/nfs/nfs_syscalls.c?revision=41903=markup
"update from Rick Macklem adding TCP support to NFS"

I would guess that this flag must be cleared to allow killing the
NFS server if a client does not respond.  Otherwise it may hang in
sbwait() in soreceive() or sosend().
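
The relevant logic in sbwait() looks roughly like this (quoted from
memory, details may differ):

	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

Without PCATCH the sleep cannot be aborted by a signal, so an nfsd
that kept SB_NOINTR would be unkillable while waiting for a dead
client.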

As the flags are never set, your diff does not change behavior.

> I want to completely remove SB_NOINTR flag. Only NFS client sets it, but
> since the socket is never passed to userland, this flag is useless, because
> we can't send a signal to a kernel thread. So for this sblock()/sbwait()
> sleep is uninterruptible. But I want to not mix NFS server and NFS
> client diffs.

The NFS client does not run as a kernel thread, but as the process
that does file system access.  You can Ctrl-C it if mount uses the
"intr" option.  NFSMNT_INT sets some PCATCH and sb_timeo_nsecs, but
I think a signal should also abort sbwait().  That is why NFS client
sets SB_NOINTR.

As this flag is only set during mount or reconnect, is it a problem
for MP?  Is it modified in a non-initialization path?

bluhm

> Index: sys/nfs/nfs_syscalls.c
> ===
> RCS file: /cvs/src/sys/nfs/nfs_syscalls.c,v
> retrieving revision 1.119
> diff -u -p -r1.119 nfs_syscalls.c
> --- sys/nfs/nfs_syscalls.c	3 Aug 2023 09:49:09 -	1.119
> +++ sys/nfs/nfs_syscalls.c	16 Oct 2023 19:00:02 -
> @@ -276,9 +276,7 @@ nfssvc_addsock(struct file *fp, struct m
>   m_freem(m);
>   }
>   solock(so);
> - so->so_rcv.sb_flags &= ~SB_NOINTR;
>   so->so_rcv.sb_timeo_nsecs = INFSLP;
> - so->so_snd.sb_flags &= ~SB_NOINTR;
>   so->so_snd.sb_timeo_nsecs = INFSLP;
>   sounlock(so);
>   if (tslp)



Re: TSO for ixl(4)

2023-10-18 Thread Alexander Bluhm
On Wed, Oct 18, 2023 at 05:29:41PM +0200, Jan Klemkow wrote:
> This diff implements TCP Segmentation Offloading for ixl(4).  I tested
> it successfully on amd64 and sparc64 with Intel X710.  It should
> increase the TCP bulk performance to 10 Gbit/s.  On sparc64 I got an
> increase from 600 MBit/s to 2.000 Gbit/s.
> 
> Further testing is welcome.

tested on amd64
OK bluhm@

> Index: dev/pci/if_ixl.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> retrieving revision 1.89
> diff -u -p -r1.89 if_ixl.c
> --- dev/pci/if_ixl.c  29 Sep 2023 19:44:47 -  1.89
> +++ dev/pci/if_ixl.c  18 Oct 2023 15:15:30 -
> @@ -71,6 +71,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #if NBPFILTER > 0
> @@ -85,6 +86,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  
> @@ -827,6 +830,10 @@ struct ixl_tx_desc {
>  #define IXL_TX_DESC_BSIZE_MASK   \
>   (IXL_TX_DESC_BSIZE_MAX << IXL_TX_DESC_BSIZE_SHIFT)
>  
> +#define IXL_TX_CTX_DESC_CMD_TSO  0x10
> +#define IXL_TX_CTX_DESC_TLEN_SHIFT   30
> +#define IXL_TX_CTX_DESC_MSS_SHIFT50
> +
>  #define IXL_TX_DESC_L2TAG1_SHIFT 48
>  } __packed __aligned(16);
>  
> @@ -893,11 +900,19 @@ struct ixl_rx_wb_desc_32 {
>   uint64_t	qword3;
>  } __packed __aligned(16);
>  
> -#define IXL_TX_PKT_DESCS 8
> +#define IXL_TX_PKT_DESCS 32
>  #define IXL_TX_QUEUE_ALIGN   128
>  #define IXL_RX_QUEUE_ALIGN   128
>  
>  #define IXL_HARDMTU  9712 /* 9726 - ETHER_HDR_LEN */
> +#define IXL_TSO_SIZE ((255 * 1024) - 1)
> +#define IXL_MAX_DMA_SEG_SIZE ((16 * 1024) - 1)
> +
> +/*
> + * Our TCP/IP stack cannot handle packets greater than MAXMCLBYTES.
> + * This interface cannot handle packets greater than IXL_TSO_SIZE.
> + */
> +CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE);
>  
>  #define IXL_PCIREG   PCI_MAPREG_START
>  
> @@ -1958,6 +1973,7 @@ ixl_attach(struct device *parent, struct
>   ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
>   IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
>   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> + ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>  
>   ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>  
> @@ -2603,7 +2619,7 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
>   txm = &maps[i];
>  
>   if (bus_dmamap_create(sc->sc_dmat,
> - IXL_HARDMTU, IXL_TX_PKT_DESCS, IXL_HARDMTU, 0,
> + MAXMCLBYTES, IXL_TX_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
>   BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
>   &txm->txm_map) != 0)
>   goto uncreate;
> @@ -2787,7 +2803,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
>  }
>  
>  static uint64_t
> -ixl_tx_setup_offload(struct mbuf *m0)
> +ixl_tx_setup_offload(struct mbuf *m0, struct ixl_tx_ring *txr,
> +unsigned int prod)
>  {
>   struct ether_extracted ext;
>   uint64_t hlen;
> @@ -2800,7 +2817,7 @@ ixl_tx_setup_offload(struct mbuf *m0)
>   }
>  
>   if (!ISSET(m0->m_pkthdr.csum_flags,
> - M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
> + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
>   return (offload);
>  
>   ether_extract_headers(m0, &ext);
> @@ -2833,6 +2850,28 @@ ixl_tx_setup_offload(struct mbuf *m0)
>   offload |= (sizeof(*ext.udp) >> 2) << IXL_TX_DESC_L4LEN_SHIFT;
>   }
>  
> + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
> + if (ext.tcp) {
> + struct ixl_tx_desc *ring, *txd;
> + uint64_t cmd = 0;
> +
> + hlen += ext.tcp->th_off << 2;
> + ring = IXL_DMA_KVA(>txr_mem);
> + txd = [prod];
> +
> + cmd |= IXL_TX_DESC_DTYPE_CONTEXT;
> + cmd |= IXL_TX_CTX_DESC_CMD_TSO;
> + cmd |= (uint64_t)(m0->m_pkthdr.len - ETHER_HDR_LEN
> + - hlen) << IXL_TX_CTX_DESC_TLEN_SHIFT;
> + cmd |= (uint64_t)(m0->m_pkthdr.ph_mss)
> + << IXL_TX_CTX_DESC_MSS_SHIFT;
> +
> + htolem64(&txd->addr, 0);
> + htolem64(&txd->cmd, cmd);
> + } else
> + tcpstat_inc(tcps_outbadtso);
> + }
> +
>   return (offload);
>  }
>  
> @@ -2873,7 +2912,8 @@ ixl_start(struct ifqueue *ifq)
>   mask = sc->sc_tx_ring_ndescs - 1;
>  
>   for (;;) {
> - if (free <= IXL_TX_PKT_DESCS) {
> + /* We need one extra descriptor for TSO packets. */
> + if (free <= (IXL_TX_PKT_DESCS + 1)) {
>   ifq_set_oactive(ifq);
>   break;
>   }
> @@ -2882,10 +2922,16 @@ ixl_start(struct ifqueue *ifq)
>   

syslogd udp dropped counter

2023-10-17 Thread Alexander Bluhm
Hi,

Now that syslogd handles delayed DNS lookups, I think we should
count dropped packets to UDP loghosts.  Although not every dropped
UDP packet can be detected, the message makes the admin aware that
there may be a blind spot during startup.

syslogd[26493]: dropped 8 messages to udp loghost "@udp://loghost"

Debug and log messages were improved, especially if UDP logging is
shut down permanently.  Also do not print 'last message repeated'
if the message was dropped.

ok?

bluhm

Index: usr.sbin/syslogd/syslogd.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
retrieving revision 1.278
diff -u -p -r1.278 syslogd.c
--- usr.sbin/syslogd/syslogd.c  12 Oct 2023 22:36:54 -  1.278
+++ usr.sbin/syslogd/syslogd.c  17 Oct 2023 23:30:05 -
@@ -241,6 +241,7 @@ int NoVerify = 0;   /* do not verify TLS 
 const char *ClientCertfile = NULL;
 const char *ClientKeyfile = NULL;
 const char *ServerCAfile = NULL;
+int	udpsend_dropped = 0;	/* messages dropped due to UDP not ready */
 int	tcpbuf_dropped = 0;	/* count messages dropped from TCP or TLS */
 int	file_dropped = 0;	/* messages dropped due to file system full */
 int	init_dropped = 0;	/* messages dropped during initialization */
@@ -1820,6 +1821,7 @@ logmsg(struct msg *msg, int flags, char 
(f->f_type != F_PIPE && f->f_type != F_FORWUDP &&
f->f_type != F_FORWTCP && f->f_type != F_FORWTLS))) &&
(flags & MARK) == 0 && msglen == f->f_prevlen &&
+   f->f_dropped == 0 &&
!strcmp(msg->m_msg, f->f_prevline) &&
!strcmp(from, f->f_prevhost)) {
strlcpy(f->f_lasttime, msg->m_timestamp,
@@ -2005,14 +2007,14 @@ fprintlog(struct filed *f, int flags, ch
 
switch (f->f_type) {
case F_UNUSED:
-   log_debug("%s", "");
+   log_debug("");
break;
 
case F_FORWUDP:
-   log_debug(" %s", f->f_un.f_forw.f_loghost);
+   log_debugadd(" %s", f->f_un.f_forw.f_loghost);
if (f->f_un.f_forw.f_addr.ss_family == AF_UNSPEC) {
-   log_warnx("not resolved \"%s\"",
-   f->f_un.f_forw.f_loghost);
+   log_debug(" (dropped not resolved)");
+   f->f_dropped++;
break;
}
l = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len +
@@ -2040,14 +2042,30 @@ fprintlog(struct filed *f, int flags, ch
case ENETUNREACH:
case ENOBUFS:
case EWOULDBLOCK:
+   log_debug(" (dropped send error)");
+   f->f_dropped++;
/* silently dropped */
break;
default:
+   log_debug(" (dropped permanent send error)");
+   f->f_dropped++;
f->f_type = F_UNUSED;
-   log_warn("sendmsg to \"%s\"",
+   snprintf(ebuf, sizeof(ebuf),
+   "to udp loghost \"%s\"",
+   f->f_un.f_forw.f_loghost);
+   dropped_warn(&f->f_dropped, ebuf);
+   log_warn("loghost \"%s\" disabled, sendmsg",
f->f_un.f_forw.f_loghost);
break;
}
+   } else {
+   log_debug("");
+   if (f->f_dropped > 0) {
+   snprintf(ebuf, sizeof(ebuf),
+   "to udp loghost \"%s\"",
+   f->f_un.f_forw.f_loghost);
+   dropped_warn(&f->f_dropped, ebuf);
+   }
}
break;
 
@@ -2056,7 +2074,7 @@ fprintlog(struct filed *f, int flags, ch
log_debugadd(" %s", f->f_un.f_forw.f_loghost);
if (EVBUFFER_LENGTH(f->f_un.f_forw.f_bufev->output) >=
MAX_TCPBUF) {
-   log_debug(" (dropped)");
+   log_debug(" (dropped tcpbuf full)");
f->f_dropped++;
break;
}
@@ -2077,12 +2095,12 @@ fprintlog(struct filed *f, int flags, ch
(char *)iov[3].iov_base, (char *)iov[4].iov_base,
(char *)iov[5].iov_base, (char *)iov[6].iov_base);
if (l < 0) {
-   log_debug(" (dropped evbuffer_add_printf)");
+   log_debug(" (dropped evbuffer add)");
f->f_dropped++;

pf log drop default rule

2023-10-10 Thread Alexander Bluhm
Hi,

If a packet is malformed, it is dropped by pf(4).  The rule referenced
in pflog(4) is the default rule.  As the default rule is a pass
rule, tcpdump prints "pass" although the packet is actually dropped.
I have reports from genua and OPNsense users who are confused by
the output.

With the diff below we see "pass" or "block" when the packet is
matched or dropped due to a bad fragment, respectively.

19:29:17.314991 rule def/(match) [uid 0, pid 0] pass in on em1: 10.188.81.21 > 
10.188.81.22: (frag 43955:8@8+) (ttl 64, len 28)
19:29:31.321728 rule def/(fragment) [uid 0, pid 0] block in on em1: 
10.188.81.21 > 10.188.81.22: (frag 27096:64@4032+) (ttl 64, len 84)

ok?

bluhm

Index: net/if_pflog.c
===
RCS file: /cvs/src/sys/net/if_pflog.c,v
retrieving revision 1.97
diff -u -p -r1.97 if_pflog.c
--- net/if_pflog.c  20 Jan 2021 23:25:19 -  1.97
+++ net/if_pflog.c  10 Oct 2023 17:20:00 -
@@ -204,7 +204,9 @@ pflog_packet(struct pf_pdesc *pd, u_int8
 
	bzero(&hdr, sizeof(hdr));
hdr.length = PFLOG_REAL_HDRLEN;
-   hdr.action = rm->action;
+   /* Default rule does not pass packets dropped for other reasons. */
+   hdr.action = (rm->nr == (u_int32_t)-1 && reason != PFRES_MATCH) ?
+   PF_DROP : rm->action;
hdr.reason = reason;
memcpy(hdr.ifname, pd->kif->pfik_name, sizeof(hdr.ifname));
 



pf passes packet if limit reached

2023-10-10 Thread Alexander Bluhm
Hi,

The behaviour of the PFRULE_SRCTRACK and max_states check was
unintentionally changed by this commit.


revision 1.964
date: 2016/01/25 18:49:57;  author: sashan;  state: Exp;  lines: +18 -10;  
commitid: KeemoLxcm7FS1oYy;
- plugging massive pf_state_key leak

OK mpi@ dlg@ sthen@


If we do not create a state after some limit was reached, pf still
passes the packet.  We can restore the old behavior by setting
action later, after the checks.

ok?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1186
diff -u -p -r1.1186 pf.c
--- net/pf.c	8 Sep 2023 13:40:52 -	1.1186
+++ net/pf.c	9 Oct 2023 22:37:14 -
@@ -4467,8 +4467,6 @@ pf_test_rule(struct pf_pdesc *pd, struct
goto cleanup;
}
 
-   action = PF_PASS;
-
if (pd->virtual_proto != PF_VPROTO_FRAGMENT
&& !ctx.state_icmp && r->keep_state) {
 
@@ -4511,6 +4509,8 @@ pf_test_rule(struct pf_pdesc *pd, struct
 #endif /* INET6 */
 
} else {
+   action = PF_PASS;
+
while ((ctx.ri = SLIST_FIRST())) {
SLIST_REMOVE_HEAD(, entry);
pool_put(_rule_item_pl, ctx.ri);



pf_pull_hdr useless action pointer and fragment logic

2023-10-09 Thread Alexander Bluhm
Hi,

pf_pull_hdr() allows passing an action pointer parameter as an output
value.  This is never used, all callers pass a NULL argument.  Remove
ACTION_SET() entirely.

The logic if (fragoff >= len) in pf_pull_hdr() looks odd.  The former
is the offset in the IP packet, the latter the length of some header
within the fragment.  In revision 1.1 the logic was used to drop
short TCP or UDP fragments that contained only part of the header.
This does not work since pf_pull_hdr() supports offsets.
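
For context, the check in question looks roughly like this (cited
from memory, details may differ):

	case AF_INET: {
		struct ip	*h = mtod(m, struct ip *);
		u_int16_t	 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;

		if (fragoff) {
			if (fragoff >= len)
				ACTION_SET(actionp, PF_PASS);
			else {
				ACTION_SET(actionp, PF_DROP);
				REASON_SET(reasonp, PFRES_FRAG);
			}
			return (NULL);
		}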


revision 1.4
date: 2001/06/24 20:54:55;  author: itojun;  state: Exp;  lines: +18 -16;
pull_hdr() now takes header offset explicitly, to help header chain parsing
(v6, ipsec)


The code drops the packets anyway, so always set reason PFRES_FRAG.

ok?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1186
diff -u -p -r1.1186 pf.c
--- net/pf.c	8 Sep 2023 13:40:52 -	1.1186
+++ net/pf.c	9 Oct 2023 17:01:04 -
@@ -3201,7 +3201,7 @@ pf_modulate_sack(struct pf_pdesc *pd, st
optsoff = pd->off + sizeof(struct tcphdr);
#define TCPOLEN_MINSACK	(TCPOLEN_SACK + 2)
if (olen < TCPOLEN_MINSACK ||
-   !pf_pull_hdr(pd->m, optsoff, opts, olen, NULL, NULL, pd->af))
+   !pf_pull_hdr(pd->m, optsoff, opts, olen, NULL, pd->af))
return (0);
 
eoh = opts + olen;
@@ -3871,7 +3871,7 @@ pf_get_wscale(struct pf_pdesc *pd)
 
olen = (pd->hdr.tcp.th_off << 2) - sizeof(struct tcphdr);
if (olen < TCPOLEN_WINDOW || !pf_pull_hdr(pd->m,
-   pd->off + sizeof(struct tcphdr), opts, olen, NULL, NULL, pd->af))
+   pd->off + sizeof(struct tcphdr), opts, olen, NULL, pd->af))
return (0);
 
opt = opts;
@@ -3896,7 +3896,7 @@ pf_get_mss(struct pf_pdesc *pd)
 
olen = (pd->hdr.tcp.th_off << 2) - sizeof(struct tcphdr);
if (olen < TCPOLEN_MAXSEG || !pf_pull_hdr(pd->m,
-   pd->off + sizeof(struct tcphdr), opts, olen, NULL, NULL, pd->af))
+   pd->off + sizeof(struct tcphdr), opts, olen, NULL, pd->af))
return (0);
 
opt = opts;
@@ -5691,7 +5691,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
ipoff2 = pd->off + ICMP_MINLEN;
 
	if (!pf_pull_hdr(pd2.m, ipoff2, &h2, sizeof(h2),
-   NULL, reason, pd2.af)) {
+   reason, pd2.af)) {
DPFPRINTF(LOG_NOTICE,
"ICMP error message too short (ip)");
return (PF_DROP);
@@ -5719,7 +5719,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
ipoff2 = pd->off + sizeof(struct icmp6_hdr);
 
	if (!pf_pull_hdr(pd2.m, ipoff2, &h2_6, sizeof(h2_6),
-   NULL, reason, pd2.af)) {
+   reason, pd2.af)) {
DPFPRINTF(LOG_NOTICE,
"ICMP error message too short (ip6)");
return (PF_DROP);
@@ -5770,7 +5770,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
 * expected. Don't access any TCP header fields after
 * th_seq, an ackskew test is not possible.
 */
-   if (!pf_pull_hdr(pd2.m, pd2.off, th, 8, NULL, reason,
+   if (!pf_pull_hdr(pd2.m, pd2.off, th, 8, reason,
pd2.af)) {
DPFPRINTF(LOG_NOTICE,
"ICMP error message too short (tcp)");
@@ -5951,7 +5951,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
int  action;
 
if (!pf_pull_hdr(pd2.m, pd2.off, uh, sizeof(*uh),
-   NULL, reason, pd2.af)) {
+   reason, pd2.af)) {
DPFPRINTF(LOG_NOTICE,
"ICMP error message too short (udp)");
return (PF_DROP);
@@ -6079,7 +6079,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
}
 
if (!pf_pull_hdr(pd2.m, pd2.off, iih, ICMP_MINLEN,
-   NULL, reason, pd2.af)) {
+   reason, pd2.af)) {
DPFPRINTF(LOG_NOTICE,
"ICMP error message too short (icmp)");
return (PF_DROP);
@@ -6185,7 +6185,7 @@ pf_test_state_icmp(struct pf_pdesc *pd, 
}
 
if (!pf_pull_hdr(pd2.m, pd2.off, iih,
-   sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
+   sizeof(struct icmp6_hdr), reason, pd2.af)) {
 

Re: ifq_start_task(): purge queue before exit when IFF_RUNNING flag is not set

2023-10-06 Thread Alexander Bluhm
On Fri, Oct 06, 2023 at 04:17:34PM +0300, Vitaliy Makkoveev wrote:
> On Fri, Oct 06, 2023 at 02:50:21PM +0200, Alexander Bluhm wrote:
> > On Fri, Oct 06, 2023 at 12:45:54PM +0300, Vitaliy Makkoveev wrote:
> > > On Thu, Oct 05, 2023 at 10:42:25PM +1000, David Gwynne wrote:
> > > > > On 5 Oct 2023, at 21:50, Vitaliy Makkoveev  wrote:
> > > > > 
> > > > > I don't like this unlocked if_flags check we have in ifq_start_task().
> > > > > Guess READ_ONCE() is much better to load value.
> > > > 
> > > > there are no inconsistent intermediate values you can read from 
> > > > if_flags. are you worried the compiler will load the value again and 
> > > > maybe we'll get a different state the second time?
> > > 
> > > To enforce loading it from memory instead of cache. But maybe I'm
> > > wrong.
> > 
> > READ_ONCE() does not address cache coherency problems.
> > 
> > Just reading an int value that another processor can modify is
> > wrong.  The compiler can optimize it into multiple reads with
> > inconsistent values, written by the other CPU.  This is prevented
> > by READ_ONCE().
> > 
> > When using READ_ONCE() you must be sure that cache coherency is not
> > a problem or that you have some memory barriers in place.  I added
> > READ_ONCE() to some queue macros recently to fix obvious missuse.
> > That does not mean it is sufficient.
> > 
> 
> Thanks for the explanation.
> 
> > Accessing ifp->if_flags is a different story.  As it is marked as
> > [N] you need exclusive netlock to write and shared netlock to read
> > to be on the safe side.  A lot of your code just reads it without
> > caring about locks.  This works most of the time.
> > 
> 
> Not my code. However these IFF_UP and IFF_RUNNING reads and
> modifications are within kernel locked chunks of drivers and kernel
> locked ifioctl() chunk. Mostly, this is configuration path. Under normal
> conditions these flags should be modified very few times, mostly
> once. So this works...

During configuration there is usually some kind of lock, it happens
rarely, and bugs are hard to trigger, especially on amd64 which
guarantees some consistency.  That's why it works.

> Some time ago, I privately pointed this out and proposed modifying
> if_flags atomically. Maybe it's time to rethink this.

Atomic operations are not the answer to everything.  They are still
slow and don't guarantee consistency with the logic around them.
If writes to if_flags happen rarely and are protected by exclusive
netlock anyway, we can leave them as they are.

More problematic is reading.  Many people believe that reading an
integer that another CPU can modify is fine.  To make the integer
value consistent for the compiler, you need READ_ONCE() or
atomic_load_int().  And then consider whether reordering it with
all the instructions around it is a problem.  Getting that right
with memory barriers is very hard.  It should not be done in regular
code, better use primitives that have better semantics.

The easiest way to get MP safety is to identify critical sections
and put a lock around them.  For reading, a shared lock is enough,
it has all the barriers.  But it is slow and should be avoided in
the hot path.  By adding a lock per single network packet, throughput
reduction is well visible in my measurements.
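
As a minimal sketch, a read-side critical section for a field
annotated [N] would look like this (assuming the shared netlock
interface):

	NET_LOCK_SHARED();
	running = ISSET(ifp->if_flags, IFF_RUNNING);
	NET_UNLOCK_SHARED();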

So we ended with the current code that works although it does not
follow these principles.  Usually it does not really matter where
the read actually occurs within the context.

We also have lockless primitives like SMR and SRP, which are
hard to understand and have their drawbacks.  I never understood
how they work, but I can trigger bugs in code that uses them.

> No. They all simply load, compare and OR the result, no matter
> which condition was the reason to return.

That's why it works with multiple CPUs.

bluhm



Re: tcp syn cache unlock

2023-10-06 Thread Alexander Bluhm
On Fri, Oct 06, 2023 at 03:47:31PM +0300, Vitaliy Makkoveev wrote:
> On Fri, Oct 06, 2023 at 02:14:52PM +0200, Alexander Bluhm wrote:
> > > @@ -718,11 +743,13 @@ softclock(void *arg)
> > >   softclock_process_tick_timeout(to, new);
> > >   }
> > >   tostat.tos_softclocks++;
> > > - needsproc = !CIRCQ_EMPTY(&timeout_proc);
> > > - mtx_leave(&timeout_mutex);
> > > -
> > > - if (needsproc)
> > > + if (!CIRCQ_EMPTY(&timeout_proc))
> > >   wakeup(&timeout_proc);
> > > +#ifdef MULTIPROCESSOR
> > > + if(!CIRCQ_EMPTY(&timeout_proc_mpsafe))
> > > + wakeup(&timeout_proc_mpsafe);
> > > +#endif
> > > + mtx_leave(&timeout_mutex);
> > >  }
> > >  
> > >  void
> > 
> > Was there a good reason that wakeup() did run without mutex?
> > Do we really want to change this?
> > 
> 
> I dont understand you. Original code does wakeup() outside mutex. I
> moved wakeup() under mutex. You want to move it back?

I just wanted to know why you moved it.

Now I see.  You use msleep_nsec() with timeout_mutex.  Putting
wakeup in mutex ensures that you don't miss it.
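
The sleeping side then looks roughly like this (sketch only, wmesg
and priority assumed):

	mtx_enter(&timeout_mutex);
	while (CIRCQ_EMPTY(&timeout_proc_mpsafe)) {
		/*
		 * msleep_nsec() releases the mutex only while asleep,
		 * so a wakeup() done under the mutex cannot be lost
		 * between the emptiness check and the sleep.
		 */
		msleep_nsec(&timeout_proc_mpsafe, &timeout_mutex,
		    PSWP, "bored", INFSLP);
	}
	mtx_leave(&timeout_mutex);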

Nitpick: timeoutmp_proc should be timeout_procmp.  timeout_ is the
prefix in this file.  mp suffix is easier to see at the end.

>+   if (kthread_create(softclockmp_thread, NULL, NULL, "softclockm"))
"softclockm" -> "softclockmp"

OK bluhm@, but let's wait for cheloha@ and see what he thinks



Re: ifq_start_task(): purge queue before exit when IFF_RUNNING flag is not set

2023-10-06 Thread Alexander Bluhm
On Fri, Oct 06, 2023 at 12:45:54PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Oct 05, 2023 at 10:42:25PM +1000, David Gwynne wrote:
> > > On 5 Oct 2023, at 21:50, Vitaliy Makkoveev  wrote:
> > > 
> > > I don't like this unlocked if_flags check we have in ifq_start_task().
> > > Guess READ_ONCE() is much better to load value.
> > 
> > there are no inconsistent intermediate values you can read from if_flags. 
> > are you worried the compiler will load the value again and maybe we'll get 
> > a different state the second time?
> 
> To enforce loading it from memory instead of cache. But may be I'm
> wrong.

READ_ONCE() does not address cache coherency problems.

Just reading an int value that another processor can modify is
wrong.  The compiler can optimize it into multiple reads with
inconsistent values, written by the other CPU.  This is prevented
by READ_ONCE().

When using READ_ONCE() you must be sure that cache coherency is not
a problem or that you have some memory barriers in place.  I added
READ_ONCE() to some queue macros recently to fix obvious misuse.
That does not mean it is sufficient.
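
To illustrate with a made-up example, not taken from the diff:

	/* the compiler may turn this single read into several loads */
	flags = ifp->if_flags;

	/*
	 * READ_ONCE() forces exactly one load, but still gives no
	 * memory barrier and no ordering with the code around it.
	 */
	flags = READ_ONCE(ifp->if_flags);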

Accessing ifp->if_flags is a different story.  As it is marked as
[N] you need exclusive netlock to write and shared netlock to read
to be on the safe side.  A lot of your code just reads it without
caring about locks.  This works most of the time.

In this case, I don't know.  Would it be a problem if ISSET(ifp->if_flags),
ifq_empty(ifq), and ifq_is_oactive(ifq) were executed in a different
order?

bluhm



Re: tcp syn cache unlock

2023-10-06 Thread Alexander Bluhm
On Fri, Oct 06, 2023 at 02:12:31PM +0300, Vitaliy Makkoveev wrote:
> I reworked your diff for a little. At firts I use separate
> softclock_thread_mpsafe() for mpsafe timers. I don't think we need to
> bind this thread to the primary CPU.

I also think a separate thread is better.  Makes no sense to grab
and release kernel lock for each timer.  Task uses systq and systqmp
the same way.

Pinning threads to cpu should only be done when we know that it has
a performance benefit in a particular situation.  At the moment I
would prefer a thread that runs anywhere.

> Also, on a uniprocessor machine we don't need dedicated mpsafe timer
> processing, so this specific code is separated with the preprocessor.

Makes sense.

> Man page is included to this diff.
> 
> I'm using this with pipex(4) and PF_ROUTE sockets.

I guess there are a bunch of network timeouts we can run without
the kernel lock as soon as this feature gets in.

> @@ -718,11 +743,13 @@ softclock(void *arg)
>   softclock_process_tick_timeout(to, new);
>   }
>   tostat.tos_softclocks++;
> - needsproc = !CIRCQ_EMPTY(&timeout_proc);
> - mtx_leave(&timeout_mutex);
> -
> - if (needsproc)
> + if (!CIRCQ_EMPTY(&timeout_proc))
>   wakeup(&timeout_proc);
> +#ifdef MULTIPROCESSOR
> + if(!CIRCQ_EMPTY(&timeout_proc_mpsafe))
> + wakeup(&timeout_proc_mpsafe);
> +#endif
> + mtx_leave(&timeout_mutex);
>  }
>  
>  void

Was there a good reason that wakeup() did run without mutex?
Do we really want to change this?

> @@ -730,6 +757,11 @@ softclock_create_thread(void *arg)
>  {
>   if (kthread_create(softclock_thread, NULL, NULL, "softclock"))
>   panic("fork softclock");
> +#ifdef MULTIPROCESSOR
> + if (kthread_create(softclock_thread_mpsafe, NULL, NULL,
> + "softclock_mpsafe"))
> + panic("fork softclock_mpsafe");
> +#endif
>  }
>  
>  void

Could you make the visible name of the thread shorter?  e.g.
"softclock_mpsafe" -> "softclockmp", it should fit in ps and top.

bluhm



Re: wg destroy hangs

2023-10-05 Thread Alexander Bluhm
On Thu, Oct 05, 2023 at 07:15:23AM +0200, Kirill Miazine wrote:
> > This diff checks IFF_RUNNING flag within while (!ifq_empty()) loop of
> > wg_peer_destroy(). If the flag is not set, the queue will be purged and
> > the check performed again. I intentionally keep netlock to prevent ifconfig
> > manipulations on the interface.
> 
> I confirm that just the diff below solved the issue

> > +* XXX: `if_snd' of stopped interface could still packets

This sentence is missing a verb.  ... could still contain packets?
Or: `if_snd' of stopped interface does not consume packets

OK bluhm@

> > Index: sys/net/if_wg.c
> > ===
> > RCS file: /cvs/src/sys/net/if_wg.c,v
> > retrieving revision 1.31
> > diff -u -p -r1.31 if_wg.c
> > --- sys/net/if_wg.c 26 Sep 2023 15:16:44 -  1.31
> > +++ sys/net/if_wg.c 4 Oct 2023 23:09:14 -
> > @@ -509,6 +509,13 @@ wg_peer_destroy(struct wg_peer *peer)
> >   
> > NET_LOCK();
> > while (!ifq_empty(&sc->sc_if.if_snd)) {
> > +   /*
> > +* XXX: `if_snd' of stopped interface could still packets
> > +*/
> > +   if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
> > +   ifq_purge(&sc->sc_if.if_snd);
> > +   continue;
> > +   }
> > NET_UNLOCK();
> > tsleep_nsec(sc, PWAIT, "wg_ifq", 1000);
> > NET_LOCK();
> > 



Re: syslogd retry dns lookup

2023-10-04 Thread Alexander Bluhm
On Sun, Sep 03, 2023 at 02:00:46AM +0200, Alexander Bluhm wrote:
> When DNS lookup for a remote loghost in an @ line in syslog.conf does
> not work at startup, retry at intervals.

Thanks to Paul de Weerd for testing my diff.  Together we improved the
debug output to make clear what is going on.  I also added generic
loghost_resolve() and loghost_retry() functions for UDP and TCP.

I am looking for ok now.

bluhm

Index: usr.sbin/syslogd/privsep.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/privsep.c,v
retrieving revision 1.76
diff -u -p -r1.76 privsep.c
--- usr.sbin/syslogd/privsep.c  11 Aug 2023 04:45:06 -  1.76
+++ usr.sbin/syslogd/privsep.c  2 Oct 2023 16:48:55 -
@@ -742,8 +742,8 @@ priv_config_parse_done(void)
 /* Name/service to address translation.  Response is placed into addr.
  * Return 0 for success or < 0 for error like getaddrinfo(3) */
 int
-priv_getaddrinfo(char *proto, char *host, char *serv, struct sockaddr *addr,
-size_t addr_len)
+priv_getaddrinfo(const char *proto, const char *host, const char *serv,
+struct sockaddr *addr, size_t addr_len)
 {
char protocpy[5], hostcpy[NI_MAXHOST], servcpy[NI_MAXSERV];
int cmd, ret_len;
Index: usr.sbin/syslogd/syslogd.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
retrieving revision 1.277
diff -u -p -r1.277 syslogd.c
--- usr.sbin/syslogd/syslogd.c  16 Mar 2023 18:22:08 -  1.277
+++ usr.sbin/syslogd/syslogd.c  2 Oct 2023 18:46:27 -
@@ -156,9 +156,12 @@ struct filed {
struct sockaddr_storage  f_addr;
struct buffertls f_buftls;
struct bufferevent  *f_bufev;
+   struct event f_ev;
struct tls  *f_ctx;
+	char	*f_ipproto;
	char	*f_host;
-	int	 f_reconnectwait;
+	char	*f_port;
+	int	 f_retrywait;
	} f_forw;	/* forwarding address */
	char	f_fname[PATH_MAX];
struct {
@@ -319,7 +322,9 @@ void tcp_dropcb(struct bufferevent *, v
 void	tcp_writecb(struct bufferevent *, void *);
 void	tcp_errorcb(struct bufferevent *, short, void *);
 void	tcp_connectcb(int, short, void *);
-void	tcp_connect_retry(struct bufferevent *, struct filed *);
+int	loghost_resolve(struct filed *);
+void	loghost_retry(struct filed *);
+void	udp_resolvecb(int, short, void *);
 int	tcpbuf_countmsg(struct bufferevent *bufev);
 void	die_signalcb(int, short, void *);
 void	mark_timercb(int, short, void *);
@@ -962,12 +967,15 @@ socket_bind(const char *proto, const cha
res->ai_socktype | SOCK_NONBLOCK, res->ai_protocol)) == -1)
continue;
 
-   if (getnameinfo(res->ai_addr, res->ai_addrlen, hostname,
+   error = getnameinfo(res->ai_addr, res->ai_addrlen, hostname,
sizeof(hostname), servname, sizeof(servname),
NI_NUMERICHOST | NI_NUMERICSERV |
-   (res->ai_socktype == SOCK_DGRAM ? NI_DGRAM : 0)) != 0) {
-   log_debug("Malformed bind address");
-   hostname[0] = servname[0] = '\0';
+   (res->ai_socktype == SOCK_DGRAM ? NI_DGRAM : 0));
+   if (error) {
+   log_warnx("malformed bind address host \"%s\": %s",
+   host, gai_strerror(error));
+   strlcpy(hostname, hostname_unknown, sizeof(hostname));
+   strlcpy(servname, hostname_unknown, sizeof(servname));
}
if (shutread && shutdown(*fdp, SHUT_RD) == -1) {
log_warn("shutdown SHUT_RD "
@@ -1130,7 +1138,7 @@ acceptcb(int lfd, short event, void *arg
	socklen_t	 sslen;
char hostname[NI_MAXHOST], servname[NI_MAXSERV];
	char		*peername;
-   int  fd;
+   int  fd, error;
 
sslen = sizeof(ss);
if ((fd = reserve_accept4(lfd, event, ev, tcp_acceptcb,
@@ -1143,17 +1151,21 @@ acceptcb(int lfd, short event, void *arg
}
log_debug("Accepting tcp connection");
 
-	if (getnameinfo((struct sockaddr *)&ss, sslen, hostname,
+	error = getnameinfo((struct sockaddr *)&ss, sslen, hostname,
	    sizeof(hostname), servname, sizeof(servname),
-	    NI_NUMERICHOST | NI_NUMERICSERV) != 0 ||
-	    asprintf(&peername,

tcp syn cache unlock

2023-10-04 Thread Alexander Bluhm
Hi,

This is a first step to unlock TCP syn cache.  The timer function
is independent of the socket code.  That makes it easy to start
there.

Introduce a tcp syn cache mutex.  Document fields protected by net
lock and mutex.  Divide the timer function into parts protected by
the mutex and sending with the netlock.  I had to split the flags
field into dynamic flags protected by the mutex and fixed flags set
during initialization.  Note that sc_dynflags is u_int to prevent
the compiler from putting both flags into the same 32-bit access.
Needed on alpha and maybe elsewhere.
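
In struct syn_cache the split looks like this (sketch; only
sc_dynflags shows up in the diff below, the name of the fixed part
is my assumption):

	u_int	sc_dynflags;	/* [S] modified at runtime, e.g. SCF_DEAD */
	u_int	sc_fixflags;	/* set once during initialization */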

Still missing:
- Use shared net lock for sending.  Basically route cache prevents
  that.  But I have to solve that for shared sending in general.
- Allow shared net lock for calls from tcp_input().
- Run timer without kernel lock.  I am not aware of such a feature.
  There is already some network code that could benefit from that.
  Can we get timer without kernel lock like TASKQ_MPSAFE implements
  it for tasks?

ok?

bluhm

Index: netinet/tcp_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.391
diff -u -p -r1.391 tcp_input.c
--- netinet/tcp_input.c 3 Sep 2023 21:37:17 -   1.391
+++ netinet/tcp_input.c 4 Oct 2023 22:40:05 -
@@ -3084,15 +3084,24 @@ tcp_mss_adv(struct mbuf *m, int af)
  * state for SYN_RECEIVED.
  */
 
+/*
+ * Locks used to protect global data and struct members:
+ * N   net lock
+ * S   syn_cache_mtx   tcp syn cache global mutex
+ */
+
 /* syn hash parameters */
-int	tcp_syn_hash_size = TCP_SYN_HASH_SIZE;
-int	tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
-int	tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
-int	tcp_syn_use_limit = 10;
+int	tcp_syn_hash_size = TCP_SYN_HASH_SIZE;	/* [N] size of hash table */
+int	tcp_syn_cache_limit =	/* [N] global entry limit */
+	TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE;
+int	tcp_syn_bucket_limit =	/* [N] per bucket limit */
+	3 * TCP_SYN_BUCKET_SIZE;
+int	tcp_syn_use_limit = 10;	/* [N] reseed after uses */
 
 struct pool syn_cache_pool;
 struct syn_cache_set tcp_syn_cache[2];
 int tcp_syn_cache_active;
+struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 #define SYN_HASH(sa, sp, dp, rand) \
(((sa)->s_addr ^ (rand)[0]) *   \
@@ -3134,7 +3143,10 @@ do {	\
 void
 syn_cache_rm(struct syn_cache *sc)
 {
-   sc->sc_flags |= SCF_DEAD;
+	MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
+
+	KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
+	SET(sc->sc_dynflags, SCF_DEAD);
	TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
@@ -3151,11 +3163,10 @@ syn_cache_put(struct syn_cache *sc)
	if (refcnt_rele(&sc->sc_refcnt) == 0)
return;
 
+   /* Dealing with last reference, no lock needed. */
m_free(sc->sc_ipopts);
-   if (sc->sc_route4.ro_rt != NULL) {
-   rtfree(sc->sc_route4.ro_rt);
-   sc->sc_route4.ro_rt = NULL;
-   }
+   rtfree(sc->sc_route4.ro_rt);
+
	pool_put(&syn_cache_pool, sc);
 }
 
@@ -3190,6 +3201,7 @@ syn_cache_insert(struct syn_cache *sc, s
int i;
 
NET_ASSERT_LOCKED();
+	MUTEX_ASSERT_LOCKED(&syn_cache_mtx);
 
/*
 * If there are no entries in the hash table, reinitialize
@@ -3333,12 +3345,10 @@ syn_cache_timer(void *arg)
uint64_t now;
int lastref;
 
-   NET_LOCK();
-   if (sc->sc_flags & SCF_DEAD)
+	mtx_enter(&syn_cache_mtx);
+   if (ISSET(sc->sc_dynflags, SCF_DEAD))
goto freeit;
 
-   now = tcp_now();
-
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
@@ -3353,18 +3363,22 @@ syn_cache_timer(void *arg)
if (sc->sc_rxttot >= tcptv_keep_init)
goto dropit;
 
-   tcpstat_inc(tcps_sc_retransmitted);
-   (void) syn_cache_respond(sc, NULL, now);
-
/* Advance the timer back-off. */
sc->sc_rxtshift++;
TCPT_RANGESET(sc->sc_rxtcur,
TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
TCPTV_REXMTMAX);
-	if (!timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
-		syn_cache_put(sc);
+	if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
+		refcnt_take(&sc->sc_refcnt);
+	mtx_leave(&syn_cache_mtx);
 
+   NET_LOCK();
+   now = tcp_now();
+   (void) syn_cache_respond(sc, NULL, now);
+   tcpstat_inc(tcps_sc_retransmitted);
NET_UNLOCK();
+
+   syn_cache_put(sc);
return;
 
  dropit:
@@ -3375,8 +3389,8 @@ syn_cache_timer(void *arg)
KASSERT(lastref == 0);
(void)lastref;
  freeit:
+   

Re: wg destroy hangs

2023-10-04 Thread Alexander Bluhm
On Wed, Oct 04, 2023 at 11:03:27PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Oct 04, 2023 at 09:13:59PM +0200, Alexander Bluhm wrote:
> > On Wed, Oct 04, 2023 at 08:42:48PM +0200, Kirill Miazine wrote:
> > > > If it happens again, could you send a 'ps axlww | grep ifconfig'
> > > > output?  Then we see the wait channel where it hangs in the kernel.
> > > > 
> > > > $ ps axlww
> > > >UID   PID  PPID CPU PRI  NI   VSZ   RSS WCHAN   STAT   TT   TIME 
> > > > COMMAND
> > > 
> > > Here it happened again:
> > > 
> > >  0 75339 23922   0  10   0   360   296 wg_ifq  D+Up00:00.00 
> > > ifconfig wg1 destroy
> > 
> > wg_peer_destroy()
> > ...
> > NET_LOCK();
> > while (!ifq_empty(&sc->sc_if.if_snd)) {
> > NET_UNLOCK();
> > tsleep_nsec(sc, PWAIT, "wg_ifq", 1000);
> > NET_LOCK();
> > }
> > NET_UNLOCK();
> > 
> > This net lock dance looks fishy.  And the sleep has a timeout of 1
> > millisecond.  But that may be per packet.  So if you have a
> > long queue or the queue refills somehow, it will take forever.
> > 
> > I think the difference in the usage is constant traffic that keeps
> > the send queue full.  The timeout hides the problem when there are
> > only a few packets.
> > 
> 
> This should ensure wg_qstart() will not dereference dying `peer'. Looks
> crappy and potentially could block forever, but should work. However
> netlock is unnecessary here. Netlocked wg_output() could fill `if_snd'
> while the netlock is released before tsleep(), so it serializes nothing
> but stops packet processing.
> 
> Kirill, does this diff help? 

I doubt that it changes much.  When netlock is not taken, the queue
can still be filled with packets.

Removing this ugly netlock makes sense anyway.  But without any
synchronisation just reading a variable feels wrong.  Can we add a
read once like for mq_len in sys/mbuf.h?  And the ifq_set_maxlen()
also looks very unsafe.  For mbuf queues I added a mutex, interface
queues should do the same.

ok?

bluhm

Index: net/ifq.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
retrieving revision 1.50
diff -u -p -r1.50 ifq.c
--- net/ifq.c   30 Jul 2023 05:39:52 -  1.50
+++ net/ifq.c   4 Oct 2023 21:04:20 -
@@ -529,6 +529,14 @@ ifq_hdatalen(struct ifqueue *ifq)
return (len);
 }
 
+void
+ifq_set_maxlen(struct ifqueue *ifq, unsigned int maxlen)
+{
+	mtx_enter(&ifq->ifq_mtx);
+	ifq->ifq_maxlen = maxlen;
+	mtx_leave(&ifq->ifq_mtx);
+}
+
 unsigned int
 ifq_purge(struct ifqueue *ifq)
 {
Index: net/ifq.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.h,v
retrieving revision 1.38
diff -u -p -r1.38 ifq.h
--- net/ifq.h   30 Jul 2023 05:39:52 -  1.38
+++ net/ifq.h   4 Oct 2023 21:09:04 -
@@ -435,6 +435,7 @@ void ifq_deq_commit(struct ifqueue *, 
 void		 ifq_deq_rollback(struct ifqueue *, struct mbuf *);
 struct mbuf	*ifq_dequeue(struct ifqueue *);
 int		 ifq_hdatalen(struct ifqueue *);
+void		 ifq_set_maxlen(struct ifqueue *, unsigned int);
 void		 ifq_mfreem(struct ifqueue *, struct mbuf *);
 void		 ifq_mfreeml(struct ifqueue *, struct mbuf_list *);
 unsigned int	 ifq_purge(struct ifqueue *);
@@ -448,9 +449,8 @@ int  ifq_deq_sleep(struct ifqueue *, st
 const char *, volatile unsigned int *,
 volatile unsigned int *);
 
-#define	ifq_len(_ifq)		((_ifq)->ifq_len)
-#define	ifq_empty(_ifq)		(ifq_len(_ifq) == 0)
-#define	ifq_set_maxlen(_ifq, _l)	((_ifq)->ifq_maxlen = (_l))
+#define ifq_len(_ifq)		READ_ONCE((_ifq)->ifq_len)
+#define ifq_empty(_ifq)		(ifq_len(_ifq) == 0)
 
 static inline int
 ifq_is_priq(struct ifqueue *ifq)
@@ -490,8 +490,8 @@ int  ifiq_input(struct ifiqueue *, stru
 int ifiq_enqueue(struct ifiqueue *, struct mbuf *);
 voidifiq_add_data(struct ifiqueue *, struct if_data *);
 
-#define	ifiq_len(_ifiq)		ml_len(&(_ifiq)->ifiq_ml)
-#define	ifiq_empty(_ifiq)	ml_empty(&(_ifiq)->ifiq_ml)
+#define ifiq_len(_ifiq)		READ_ONCE(ml_len(&(_ifiq)->ifiq_ml))
+#define ifiq_empty(_ifiq)	(ifiq_len(_ifiq) == 0)
 
 #endif /* _KERNEL */
 



Re: wg destroy hangs

2023-10-04 Thread Alexander Bluhm
On Wed, Oct 04, 2023 at 08:42:48PM +0200, Kirill Miazine wrote:
> > If it happens again, could you send a 'ps axlww | grep ifconfig'
> > output?  Then we see the wait channel where it hangs in the kernel.
> > 
> > $ ps axlww
> >UID   PID  PPID CPU PRI  NI   VSZ   RSS WCHAN   STAT   TT   TIME 
> > COMMAND
> 
> Here it happened again:
> 
>  0 75339 23922   0  10   0   360   296 wg_ifq  D+Up00:00.00 
> ifconfig wg1 destroy

wg_peer_destroy()
...
NET_LOCK();
while (!ifq_empty(&sc->sc_if.if_snd)) {
NET_UNLOCK();
tsleep_nsec(sc, PWAIT, "wg_ifq", 1000);
NET_LOCK();
}
NET_UNLOCK();

This net lock dance looks fishy.  And the sleep has a timeout of 1
millisecond.  But that may be per packet.  So if you have a
long queue or the queue refills somehow, it will take forever.

I think the difference in the usage is constant traffic that keeps
the send queue full.  The timeout hides the problem when there are
only a few packets.

bluhm



Re: wg destroy hangs

2023-10-04 Thread Alexander Bluhm
On Wed, Oct 04, 2023 at 10:53:30AM -0400, Sonic wrote:
> When it happened to me back then (2022) I'm pretty sure I did a "down"
> followed by a "delete" and then the "destroy".

root@ot6:.../~# cd /usr/src/regress/sys/net/wg
root@ot6:.../wg# make ifconfig
...
root@ot6:.../wg# ifconfig wg11
wg11: flags=80c3<UP,BROADCAST,RUNNING,NOARP,MULTICAST> rdomain 11 mtu 1420
index 42 priority 0 llprio 3
wgport 211
wgpubkey uQP9F5afOHni9RObVahSPxeJgbsrqGw/P4t5Balpmkc=
wgpeer beT/atjwFPBo3Pv8IvFO5Wf/uVXfgZ5QLSSQIGm/sSc=
wgendpoint 127.0.0.1 212
tx: 0, rx: 0
wgaip fdd7:e83e:66bc:46::2/128
wgaip 10.188.44.2/32
groups: wg
inet 10.188.44.1 netmask 0xffffff00 broadcast 10.188.44.255
inet6 fdd7:e83e:66bc:46::1 prefixlen 64
root@ot6:.../wg# ifconfig wg11 down
root@ot6:.../wg# ifconfig wg11 delete
root@ot6:.../wg# ifconfig wg11 destroy
root@ot6:.../wg#

For me it works.  Tested on i386 and amd64.

> Have not tried to recreate since then.

Can you try it again?  What is different in your setup?

bluhm



Re: wg destroy hangs

2023-10-04 Thread Alexander Bluhm
On Wed, Oct 04, 2023 at 10:08:01AM -0400, Sonic wrote:
> See the post:
> "Uninterruptible D State after ifconfig wg0 destroy" Oct. 31, 2022 in the
> Bugs archive.

I have a test regress/sys/net/wg that configures a wg(4), sends
some traffic, and destroys it.  I have never seen this bug.  There
must be something special to trigger it.

If it happens again, could you send a 'ps axlww | grep ifconfig'
output?  Then we see the wait channel where it hangs in the kernel.

$ ps axlww
  UID   PID  PPID CPU PRI  NI   VSZ   RSS WCHAN   STAT   TT   TIME COMMAND

The WCHAN string can be found in the kernel sources and gives hints.
More sophisticated would be to break into ddb and show the kernel
stack trace of the ifconfig process.  If you want to do that, I can
give some advice.  But I recommend a serial console for that.
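
Roughly like this on the serial console (from memory, the exact ddb
commands may differ slightly):

	ddb> ps
	ddb> trace /t 0t<TID of the hung ifconfig process>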

Any idea what you did specially to trigger the problem?

bluhm



OpenBSD Errata: October 3, 2023 (xlibs)

2023-10-03 Thread Alexander Bluhm
Errata patches for X11 libraries libX11 and libXpm have been released
for OpenBSD 7.2 and 7.3.

Binary updates for the amd64, arm64 and i386 platforms are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



Re: link mbufs/inpcbs to pf_states, not pf_state_keys

2023-09-29 Thread Alexander Bluhm
On Tue, Aug 22, 2023 at 06:30:31AM +0200, Alexandr Nedvedicky wrote:
> Currently we have something like this:
> 
>   { mbuf, pcb } <-> state key <-> { state, state ... }
> 
> with this diff we get to:
> 
>   { mbuf, pcb } <-> state <-> state key
> 
> Basically when we do process packet we are interested in state
> not state key itself.

The PCB holds the 4-tuple IP/port/src/dst, and the state key uses
that also.  So we have a 1:1 relation.  That's why the linking is
that way.

The mbuf linking is just to transport the information up and down
the stack to get the link with the first packet.  There are corner
cases that need this.  Especially port reuse and connection abort
is tricky.

I always forget why state key to states is a 1:n relation.  But I
expect trouble with connectionless divert rules when we change the
PCB with state linking to 1:n.  The idea is to keep one PCB and one
state key in sync.

For the same reason we have sk_reverse.  It links both state keys
1:1.  Like sk_inp links PCB and state keys 1:1.  I expect subtle
breakage in our firewall product if we change that.

bluhm



Re: pfkey: forward after validation

2023-09-29 Thread Alexander Bluhm
On Fri, Sep 29, 2023 at 01:46:40AM +0200, Tobias Heider wrote:
> Like with route messages we should really only forward pfkey messages
> that made it past the validation step. This fixes a lot of possible
> crashes in ipsecctl -m.
> 
> ok?

OK bluhm@

> diff /home/user/got/co/src
> commit - 1ce2bc211dba4164679169b9248650fd1d6ba9d2
> path + /home/user/got/co/src
> blob - e750ae8bdbe6819473884a8c37a518171c63ad60
> file + sys/net/pfkeyv2.c
> --- sys/net/pfkeyv2.c
> +++ sys/net/pfkeyv2.c
> @@ -1162,6 +1162,10 @@ pfkeyv2_dosend(struct socket *so, void *message, int l
>  
>   rdomain = kp->kcb_rdomain;
>  
> + /* Validate message format */
> + if ((rval = pfkeyv2_parsemessage(message, len, headers)) != 0)
> + goto ret;
> +
>   /* If we have any promiscuous listeners, send them a copy of the 
> message */
>   if (promisc) {
>   struct mbuf *packet;
> @@ -1208,10 +1212,6 @@ pfkeyv2_dosend(struct socket *so, void *message, int l
>   freeme_sz = 0;
>   }
>  
> - /* Validate message format */
> - if ((rval = pfkeyv2_parsemessage(message, len, headers)) != 0)
> - goto ret;
> -
>   /* use specified rdomain */
>   srdomain = (struct sadb_x_rdomain *) headers[SADB_X_EXT_RDOMAIN];
>   if (srdomain) {



ixl witness lock order

2023-09-28 Thread Alexander Bluhm
Hi,

There is a lock order problem in ixl(4) shown by witness.

Here I replaced a wrong net lock with kernel lock.

revision 1.84
date: 2022/08/05 13:57:16;  author: bluhm;  state: Exp;  lines: +6 -3;  
commitid: sAcV6NsO35L03mLS;
The netlock for SIOCSIFMEDIA and SIOCGIFMEDIA ioctl is not necessary.
Legacy drivers run with kernel lock, interface media is MP safe or
has kernel lock.  Assert kernel lock in ix(4) and ixl(4).
OK kettenis@


And then jan@ added a mutex to fix memory corruption in the queue.

revision 1.88
date: 2023/07/19 20:22:05;  author: jan;  state: Exp;  lines: +21 -4;  
commitid: NuYpolqO3BOrbWpt;
Protect ixl(4) admin queue with mutex(9).
with tweaks from bluhm
tested by bluhm
ok bluhm@


The consequence is that we acquire the kernel lock while a mutex is held.
This is illegal.  Witness output below.

The kernel lock can be replaced with a mutex.  sc_media_status and
sc_media_active have to be protected.  I use sc_link_state_mtx as
it is there anyway and the path is not performance critical.

ifm->ifm_status and ifm->ifm_active are protected by kernel lock,
so I keep the assert in ixl_media_status().

ok?

bluhm

Index: dev/pci/if_ixl.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.88
diff -u -p -r1.88 if_ixl.c
--- dev/pci/if_ixl.c	19 Jul 2023 20:22:05 -	1.88
+++ dev/pci/if_ixl.c	28 Sep 2023 13:18:39 -
@@ -2074,8 +2074,10 @@ ixl_media_status(struct ifnet *ifp, stru
 
KERNEL_ASSERT_LOCKED();
 
+	mtx_enter(&sc->sc_link_state_mtx);
	ifm->ifm_status = sc->sc_media_status;
	ifm->ifm_active = sc->sc_media_active;
+	mtx_leave(&sc->sc_link_state_mtx);
 }
 
 static void
@@ -3482,9 +3484,7 @@ ixl_link_state_update_iaq(struct ixl_sof
return;
}
 
-   KERNEL_LOCK();
link_state = ixl_set_link_status(sc, iaq);
-   KERNEL_UNLOCK();
	mtx_enter(&sc->sc_link_state_mtx);
if (ifp->if_link_state != link_state) {
ifp->if_link_state = link_state;
@@ -4468,9 +4468,6 @@ ixl_set_link_status(struct ixl_softc *sc
 {
const struct ixl_aq_link_status *status;
const struct ixl_phy_type *itype;
-
-   KERNEL_ASSERT_LOCKED();
-
uint64_t ifm_active = IFM_ETHER;
uint64_t ifm_status = IFM_AVALID;
int link_state = LINK_STATE_DOWN;
@@ -4496,9 +4493,11 @@ ixl_set_link_status(struct ixl_softc *sc
baudrate = ixl_search_link_speed(status->link_speed);
 
 done:
+	mtx_enter(&sc->sc_link_state_mtx);
	sc->sc_media_active = ifm_active;
	sc->sc_media_status = ifm_status;
	sc->sc_ac.ac_if.if_baudrate = baudrate;
+	mtx_leave(&sc->sc_link_state_mtx);
 
return (link_state);
 }

ixl1 at pci5 dev 0 function 1 "Intel X710 SFP+" rev 0x02: port 1, FW 6.0.48442 
API 1.7, msix, 8 queues, address 40:a6:b7:6e:ad:a1
witness: lock_object uninitialized: 0x802df808
Starting stack trace...
witness_checkorder(802df808,9,0,802df808,e2333bf47b0dd339,8247aff0)
 at witness_checkorder+0xb1
mtx_enter(802df7f8,802df7f8,526c9cbeb61c34a2,802df000,0,0)
 at mtx_enter+0x38
ixl_set_link_status(802df000,8299c5e0,391863f80fb351e5,38700083,c6904,c6804)
 at ixl_set_link_status+0x15c
ixl_attach(800c1900,802df000,8299c700,800c1900,c84159f5901cc70f,800c1900)
 at ixl_attach+0x1f9d
config_attach(800c1900,824c6150,8299c700,81d75c20,112fd13f28adcf4b,80030100)
 at config_attach+0x1f4
pci_probe_device(800c1900,80030100,0,0,e75e9768b7491c85,0) at 
pci_probe_device+0x4f0
pci_enumerate_bus(800c1900,0,0,800c1900,bd3c6014bc8a4290,802d8400)
 at pci_enumerate_bus+0x189
config_attach(802d8400,824c54d8,8299c920,81368aa0,112fd13f284cd40e,8299ca90)
 at config_attach+0x1f4
ppbattach(800c1200,802d8400,8299ca90,800c1200,20134e0abec38d14,800c1200)
 at ppbattach+0x755
config_attach(800c1200,824c5dd0,8299ca90,81d75c20,112fd13f28adcf4b,80002800)
 at config_attach+0x1f4
pci_probe_device(800c1200,80002800,0,0,e75e9768b7491c85,0) at 
pci_probe_device+0x4f0
pci_enumerate_bus(800c1200,0,0,800c1200,bd3c6014bc8a4290,8002e280)
 at pci_enumerate_bus+0x189
config_attach(8002e280,824c54d8,8299cca8,81abbfc0,112fd13f28d1128b,80086c00)
 at config_attach+0x1f4
acpipci_attach_bus(8002e280,80086c00,55efbcc160c143c3,2,8002e280,824cc708)
 at acpipci_attach_bus+0x1b0
acpipci_attach_busses(8002e280,8002e280,d29eed02be6a9f22,8299cdc8,8002e280,0)
 at acpipci_attach_busses+0x5d

Re: hotplug(4): introduce `hotplug_mtx' mutex(9) and make `hotplugread_filterops' mp safe

2023-09-22 Thread Alexander Bluhm
On Mon, Sep 18, 2023 at 02:12:21PM +0300, Vitaliy Makkoveev wrote:
> On Mon, Sep 18, 2023 at 02:03:08PM +0300, Vitaliy Makkoveev wrote:
> > Also use this mutex to protect `evqueue_head', `evqueue_tail' and
> > `evqueue_count'.
> > 
> 
> Sorry, the right diff:

OK bluhm@

> Index: sys/dev/hotplug.c
> ===
> RCS file: /cvs/src/sys/dev/hotplug.c,v
> retrieving revision 1.23
> diff -u -p -r1.23 hotplug.c
> --- sys/dev/hotplug.c 8 Sep 2023 20:00:27 -   1.23
> +++ sys/dev/hotplug.c 18 Sep 2023 11:09:12 -
> @@ -22,27 +22,39 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  #include 
>  
>  #define HOTPLUG_MAXEVENTS	64
>  
> +/*
> + * Locks used to protect struct members and global data
> + *	M	hotplug_mtx
> + */
> +
> +static struct mutex hotplug_mtx = MUTEX_INITIALIZER(IPL_MPFLOOR);
> +
>  static int opened;
>  static struct hotplug_event evqueue[HOTPLUG_MAXEVENTS];
> -static int evqueue_head, evqueue_tail, evqueue_count;
> -static struct selinfo hotplug_sel;
> +static int evqueue_head, evqueue_tail, evqueue_count;	/* [M] */
> +static struct klist hotplug_klist;   /* [M] */
>  
>  void filt_hotplugrdetach(struct knote *);
>  int  filt_hotplugread(struct knote *, long);
> +int  filt_hotplugmodify(struct kevent *, struct knote *);
> +int  filt_hotplugprocess(struct knote *, struct kevent *);
>  
>  const struct filterops hotplugread_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_hotplugrdetach,
>   .f_event= filt_hotplugread,
> + .f_modify   = filt_hotplugmodify,
> + .f_process  = filt_hotplugprocess,
>  };
>  
>  #define EVQUEUE_NEXT(p) (p == HOTPLUG_MAXEVENTS - 1 ? 0 : p + 1)
> @@ -60,6 +72,8 @@ hotplugattach(int count)
>   evqueue_head = 0;
>   evqueue_tail = 0;
>   evqueue_count = 0;
> +
> + klist_init_mutex(&hotplug_klist, &hotplug_mtx);
>  }
>  
>  void
> @@ -87,7 +101,9 @@ hotplug_device_detach(enum devclass clas
>  int
>  hotplug_put_event(struct hotplug_event *he)
>  {
> + mtx_enter(&hotplug_mtx);
>   if (evqueue_count == HOTPLUG_MAXEVENTS && opened) {
> + mtx_leave(&hotplug_mtx);
>   printf("hotplug: event lost, queue full\n");
>   return (1);
>   }
> @@ -98,24 +114,21 @@ hotplug_put_event(struct hotplug_event *
>   evqueue_tail = EVQUEUE_NEXT(evqueue_tail);
>   else 
>   evqueue_count++;
> + knote_locked(&hotplug_klist, 0);
>   wakeup(&evqueue);
> - selwakeup(&hotplug_sel);
> + mtx_leave(&hotplug_mtx);
>   return (0);
>  }
>  
>  int
>  hotplug_get_event(struct hotplug_event *he)
>  {
> - int s;
> -
>   if (evqueue_count == 0)
>   return (1);
>  
> - s = splbio();
>   *he = evqueue[evqueue_tail];
>   evqueue_tail = EVQUEUE_NEXT(evqueue_tail);
>   evqueue_count--;
> - splx(s);
>   return (0);
>  }
>  
> @@ -137,8 +150,11 @@ hotplugclose(dev_t dev, int flag, int mo
>  {
>   struct hotplug_event he;
>  
> + mtx_enter(&hotplug_mtx);
>   while (hotplug_get_event(&he) == 0)
>   continue;
> + mtx_leave(&hotplug_mtx);
> + klist_invalidate(&hotplug_klist);
>   opened = 0;
>   return (0);
>  }
> @@ -152,16 +168,23 @@ hotplugread(dev_t dev, struct uio *uio, 
>   if (uio->uio_resid != sizeof(he))
>   return (EINVAL);
>  
> -again:
> - if (hotplug_get_event(&he) == 0)
> - return (uiomove(&he, sizeof(he), uio));
> - if (flags & IO_NDELAY)
> - return (EAGAIN);
> -
> - error = tsleep_nsec(&evqueue, PRIBIO | PCATCH, "htplev", INFSLP);
> - if (error)
> - return (error);
> - goto again;
> + mtx_enter(&hotplug_mtx);
> + while (hotplug_get_event(&he)) {
> + if (flags & IO_NDELAY) {
> + mtx_leave(&hotplug_mtx);
> + return (EAGAIN);
> + }
> + 
> + error = msleep_nsec(&evqueue, &hotplug_mtx, PRIBIO | PCATCH,
> + "htplev", INFSLP);
> + if (error) {
> + mtx_leave(&hotplug_mtx);
> + return (error);
> + }
> + }
> + mtx_leave(&hotplug_mtx);
> +
> + return (uiomove(&he, sizeof(he), uio));
>  }
>  
>  int
> @@ -183,32 +206,22 @@ hotplugioctl(dev_t dev, u_long cmd, cadd
>  int
>  hotplugkqfilter(dev_t dev, struct knote *kn)
>  {
> - struct klist *klist;
> - int s;
> -
>   switch (kn->kn_filter) {
>   case EVFILT_READ:
> - klist = &hotplug_sel.si_note;
>   kn->kn_fop = _filtops;
>   break;
>   default:
>   return (EINVAL);
>   }
>  
> - s = splbio();
> - klist_insert_locked(klist, kn);
> - splx(s);
> + klist_insert(&hotplug_klist, kn);
>   return (0);
>  }
>  
>  void
>  filt_hotplugrdetach(struct knote *kn)
>  {
> - int s;
> -
> - s = 
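
The diff is cut off here by the archive.  The remaining new filter
callbacks presumably mirror the subr_log.c pattern further down in
this digest; roughly (sketch with the same mutex discipline, not the
committed diff):

	int
	filt_hotplugmodify(struct kevent *kev, struct knote *kn)
	{
		int active;

		mtx_enter(&hotplug_mtx);
		active = knote_modify(kev, kn);
		mtx_leave(&hotplug_mtx);

		return (active);
	}

	int
	filt_hotplugprocess(struct knote *kn, struct kevent *kev)
	{
		int active;

		mtx_enter(&hotplug_mtx);
		active = knote_process(kn, kev);
		mtx_leave(&hotplug_mtx);

		return (active);
	}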

Re: hyperv(4): use shared netlock to protect if_list and ifa_list walkthrough and data

2023-09-22 Thread Alexander Bluhm
On Mon, Sep 18, 2023 at 11:40:28AM +0300, Vitaliy Makkoveev wrote:
> Context switch looks fine here.

OK bluhm@

> Index: sys/dev/pv/hypervic.c
> ===
> RCS file: /cvs/src/sys/dev/pv/hypervic.c,v
> retrieving revision 1.19
> diff -u -p -r1.19 hypervic.c
> --- sys/dev/pv/hypervic.c 11 Apr 2023 00:45:08 -  1.19
> +++ sys/dev/pv/hypervic.c 18 Sep 2023 08:35:02 -
> @@ -846,7 +846,7 @@ kvp_get_ip_info(struct hv_kvp *kvp, cons
>   struct sockaddr_in6 *sin6, sa6;
>   uint8_t enaddr[ETHER_ADDR_LEN];
>   uint8_t ipaddr[INET6_ADDRSTRLEN];
> - int i, j, lo, hi, s, af;
> + int i, j, lo, hi, af;
>  
>   /* Convert from the UTF-16LE string format to binary */
>   for (i = 0, j = 0; j < ETHER_ADDR_LEN; i += 6) {
> @@ -870,16 +870,14 @@ kvp_get_ip_info(struct hv_kvp *kvp, cons
>   return (-1);
>   }
>  
> - KERNEL_LOCK();
> - s = splnet();
> + NET_LOCK_SHARED();
>  
>   TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
>   if (!memcmp(LLADDR(ifp->if_sadl), enaddr, ETHER_ADDR_LEN))
>   break;
>   }
>   if (ifp == NULL) {
> - splx(s);
> - KERNEL_UNLOCK();
> + NET_UNLOCK_SHARED();
>   return (-1);
>   }
>  
> @@ -919,8 +917,7 @@ kvp_get_ip_info(struct hv_kvp *kvp, cons
>   else if (ifa6ll != NULL)
>   ifa = ifa6ll;
>   else {
> - splx(s);
> - KERNEL_UNLOCK();
> + NET_UNLOCK_SHARED();
>   return (-1);
>   }
>   }
> @@ -956,8 +953,7 @@ kvp_get_ip_info(struct hv_kvp *kvp, cons
>   break;
>   }
>  
> - splx(s);
> - KERNEL_UNLOCK();
> + NET_UNLOCK_SHARED();
>  
>   return (0);
>  }



Re: Make `logread_filtops' mpsafe

2023-09-22 Thread Alexander Bluhm
On Thu, Sep 14, 2023 at 01:12:22PM +0300, Vitaliy Makkoveev wrote:
> `log_mtx' mutex(9) already used for message buffer protection, so use it
> to protect `logread_filtops' too.
> 
> ok?

OK bluhm@

> Index: sys/kern/subr_log.c
> ===
> RCS file: /cvs/src/sys/kern/subr_log.c,v
> retrieving revision 1.77
> diff -u -p -r1.77 subr_log.c
> --- sys/kern/subr_log.c   14 Jul 2023 07:07:08 -  1.77
> +++ sys/kern/subr_log.c   14 Sep 2023 10:04:41 -
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -75,7 +76,7 @@
>   */
>  struct logsoftc {
>   int sc_state;   /* [L] see above for possibilities */
> - struct  selinfo sc_selp;/* process waiting on select call */
> + struct  klist sc_klist; /* process waiting on kevent call */
>   struct  sigio_ref sc_sigio; /* async I/O registration */
>   int sc_need_wakeup; /* if set, wake up waiters */
>   struct timeout sc_tick; /* wakeup poll timeout */
> @@ -99,12 +100,16 @@ struct   mutex log_mtx =
>  
>  void filt_logrdetach(struct knote *kn);
>  int filt_logread(struct knote *kn, long hint);
> +int filt_logmodify(struct kevent *, struct knote *);
> +int filt_logprocess(struct knote *, struct kevent *);
>  
>  const struct filterops logread_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_logrdetach,
>   .f_event= filt_logread,
> + .f_modify   = filt_logmodify,
> + .f_process  = filt_logprocess,
>  };
>  
>  int dosendsyslog(struct proc *, const char *, size_t, int, enum uio_seg);
> @@ -191,11 +196,9 @@ msgbuf_getlen(struct msgbuf *mbp)
>  {
>   long len;
>  
> - mtx_enter(&log_mtx);
>   len = mbp->msg_bufx - mbp->msg_bufr;
>   if (len < 0)
>   len += mbp->msg_bufs;
> - mtx_leave(&log_mtx);
>   return (len);
>  }
>  
> @@ -205,6 +208,7 @@ logopen(dev_t dev, int flags, int mode, 
>   if (log_open)
>   return (EBUSY);
>   log_open = 1;
> + klist_init_mutex(&logsoftc.sc_klist, &log_mtx);
>   sigio_init(&logsoftc.sc_sigio);
>   timeout_set(&logsoftc.sc_tick, logtick, NULL);
>   timeout_add_msec(&logsoftc.sc_tick, LOG_TICK);
> @@ -225,6 +229,10 @@ logclose(dev_t dev, int flag, int mode, 
>   FRELE(fp, p);
>   log_open = 0;
>   timeout_del(&logsoftc.sc_tick);
> +
> + klist_invalidate(&logsoftc.sc_klist);
> + klist_free(&logsoftc.sc_klist);
> +
>   logsoftc.sc_state = 0;
>   sigio_free(&logsoftc.sc_sigio);
>   return (0);
> @@ -301,11 +309,10 @@ int
>  logkqfilter(dev_t dev, struct knote *kn)
>  {
>   struct klist *klist;
> - int s;
>  
>   switch (kn->kn_filter) {
>   case EVFILT_READ:
> - klist = &logsoftc.sc_selp.si_note;
> + klist = &logsoftc.sc_klist;
>   kn->kn_fop = _filtops;
>   break;
>   default:
> @@ -313,10 +320,7 @@ logkqfilter(dev_t dev, struct knote *kn)
>   }
>  
>   kn->kn_hook = (void *)msgbufp;
> -
> - s = splhigh();
> - klist_insert_locked(klist, kn);
> - splx(s);
> + klist_insert(klist, kn);
>  
>   return (0);
>  }
> @@ -324,11 +328,7 @@ logkqfilter(dev_t dev, struct knote *kn)
>  void
>  filt_logrdetach(struct knote *kn)
>  {
> - int s;
> -
> - s = splhigh();
> - klist_remove_locked(&logsoftc.sc_selp.si_note, kn);
> - splx(s);
> + klist_remove(&logsoftc.sc_klist, kn);
>  }
>  
>  int
> @@ -340,6 +340,30 @@ filt_logread(struct knote *kn, long hint
>   return (kn->kn_data != 0);
>  }
>  
> +int
> +filt_logmodify(struct kevent *kev, struct knote *kn)
> +{
> + int active;
> +
> + mtx_enter(&log_mtx);
> + active = knote_modify(kev, kn);
> + mtx_leave(&log_mtx);
> +
> + return (active);
> +}
> +
> +int
> +filt_logprocess(struct knote *kn, struct kevent *kev)
> +{
> + int active;
> +
> + mtx_enter(&log_mtx);
> + active = knote_process(kn, kev);
> + mtx_leave(&log_mtx);
> +
> + return (active);
> +}
> +
>  void
>  logwakeup(void)
>  {
> @@ -380,9 +404,9 @@ logtick(void *arg)
>   state = logsoftc.sc_state;
>   if (logsoftc.sc_state & LOG_RDWAIT)
>   logsoftc.sc_state &= ~LOG_RDWAIT;
> + knote_locked(&logsoftc.sc_klist, 0);
>   mtx_leave(&log_mtx);
>  
> - selwakeup(&logsoftc.sc_selp);
>   if (state & LOG_ASYNC)
>   pgsigio(&logsoftc.sc_sigio, SIGIO, 0);
>   if (state & LOG_RDWAIT)
> @@ -401,7 +425,9 @@ logioctl(dev_t dev, u_long com, caddr_t 
>  
>   /* return number of characters immediately available */
>   case FIONREAD:
> + mtx_enter(&log_mtx);
>   *(int *)data = (int)msgbuf_getlen(msgbufp);
> + mtx_leave(&log_mtx);
>   break;
>  
>   case FIONBIO:



Re: fix a wireguard mbuf leak

2023-09-22 Thread Alexander Bluhm
On Fri, Sep 22, 2023 at 12:21:42PM +0900, YASUOKA Masahiko wrote:
> A leak may happens when wgpeer is deleted.
> 
> ok?

OK bluhm@

> The state queue should be freeed when wg_peer is destroyed.
> diff from IIJ.
> 
> Index: sys/net/if_wg.c
> ===
> RCS file: /disk/cvs/openbsd/src/sys/net/if_wg.c,v
> retrieving revision 1.29
> diff -u -p -r1.29 if_wg.c
> --- sys/net/if_wg.c   3 Aug 2023 09:49:08 -   1.29
> +++ sys/net/if_wg.c   22 Sep 2023 03:11:47 -
> @@ -518,6 +518,9 @@ wg_peer_destroy(struct wg_peer *peer)
>   taskq_barrier(wg_crypt_taskq);
>   taskq_barrier(net_tq(sc->sc_if.if_index));
>  
> + if (!mq_empty(&peer->p_stage_queue))
> + mq_purge(&peer->p_stage_queue);
> +
>   DPRINTF(sc, "Peer %llu destroyed\n", peer->p_id);
>   explicit_bzero(peer, sizeof(*peer));
>   pool_put(&wg_peer_pool, peer);



OpenBSD Errata: September 21, 2023 (npppd)

2023-09-20 Thread Alexander Bluhm
Errata patches for npppd have been released for OpenBSD 7.2 and 7.3.

Binary updates for the amd64, arm64 and i386 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



Re: Use counters_read(9) from ddb(4)

2023-09-18 Thread Alexander Bluhm
On Fri, Sep 15, 2023 at 04:18:13PM +0200, Martin Pieuchot wrote:
> On 11/09/23(Mon) 21:05, Martin Pieuchot wrote:
> > On 06/09/23(Wed) 23:13, Alexander Bluhm wrote:
> > > On Wed, Sep 06, 2023 at 12:23:33PM -0500, Scott Cheloha wrote:
> > > > On Wed, Sep 06, 2023 at 01:04:19PM +0100, Martin Pieuchot wrote:
> > > > > Debugging OOM is hard.  UVM uses per-CPU counters and sadly
> > > > > counters_read(9) needs to allocate memory.  This is not acceptable in
> > > > > ddb(4).  As a result I cannot see the content of UVM counters in OOM
> > > > > situations.
> > > > > 
> > > > > Diff below introduces a *_static() variant of counters_read(9) that
> > > > > takes a secondary buffer to avoid calling malloc(9).  Is it fine?  Do
> > > > > you have a better idea?  Should we make it the default or using the
> > > > > stack might be a problem?
> > > > 
> > > > Instead of adding a second interface I think we could get away with
> > > > just extending counters_read(9) to take a scratch buffer as an optional
> > > > fourth parameter:
> > > > 
> > > > void
> > > > counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
> > > > uint64_t *scratch);
> > > > 
> > > > "scratch"?  "temp"?  "tmp"?
> > > 
> > > scratch is fine for me
> > 
> > Fine with me.
> 
> Here's a full diff, works for me(tm), ok?

OK bluhm@
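
With the scratch parameter a ddb(4) caller can pass stack memory and
avoid malloc(9) entirely, for example (sketch; NCOUNTERS is a
placeholder for the actual counter count):

	uint64_t output[NCOUNTERS], scratch[NCOUNTERS];

	/* safe in ddb(4): no allocation happens when scratch is given */
	counters_read(cm, output, NCOUNTERS, scratch);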

> Index: sys/kern/kern_sysctl.c
> ===
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.418
> diff -u -p -r1.418 kern_sysctl.c
> --- sys/kern/kern_sysctl.c16 Jul 2023 03:01:31 -  1.418
> +++ sys/kern/kern_sysctl.c15 Sep 2023 13:29:53 -
> @@ -519,7 +519,7 @@ kern_sysctl(int *name, u_int namelen, vo
>   unsigned int i;
>  
>   memset(, 0, sizeof(mbs));
> - counters_read(mbstat, counters, MBSTAT_COUNT);
> + counters_read(mbstat, counters, MBSTAT_COUNT, NULL);
>   for (i = 0; i < MBSTAT_TYPES; i++)
>   mbs.m_mtypes[i] = counters[i];
>  
> Index: sys/kern/subr_evcount.c
> ===
> RCS file: /cvs/src/sys/kern/subr_evcount.c,v
> retrieving revision 1.15
> diff -u -p -r1.15 subr_evcount.c
> --- sys/kern/subr_evcount.c   5 Dec 2022 08:58:49 -   1.15
> +++ sys/kern/subr_evcount.c   15 Sep 2023 14:01:55 -
> @@ -101,7 +101,7 @@ evcount_sysctl(int *name, u_int namelen,
>  {
>   int error = 0, s, nintr, i;
>   struct evcount *ec;
> - u_int64_t count;
> + uint64_t count, scratch;
>  
>   if (newp != NULL)
>   return (EPERM);
> @@ -129,7 +129,7 @@ evcount_sysctl(int *name, u_int namelen,
>   if (ec == NULL)
>   return (ENOENT);
>   if (ec->ec_percpu != NULL) {
> - counters_read(ec->ec_percpu, &count, 1);
> + counters_read(ec->ec_percpu, &count, 1, &scratch);
>   } else {
>   s = splhigh();
>   count = ec->ec_count;
> Index: sys/kern/subr_percpu.c
> ===
> RCS file: /cvs/src/sys/kern/subr_percpu.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 subr_percpu.c
> --- sys/kern/subr_percpu.c3 Oct 2022 14:10:53 -   1.10
> +++ sys/kern/subr_percpu.c15 Sep 2023 14:16:41 -
> @@ -159,17 +159,19 @@ counters_free(struct cpumem *cm, unsigne
>  }
>  
>  void
> -counters_read(struct cpumem *cm, uint64_t *output, unsigned int n)
> +counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
> +uint64_t *scratch)
>  {
>   struct cpumem_iter cmi;
> - uint64_t *gen, *counters, *temp;
> + uint64_t *gen, *counters, *temp = scratch;
>   uint64_t enter, leave;
>   unsigned int i;
>  
>   for (i = 0; i < n; i++)
>   output[i] = 0;
>  
> - temp = mallocarray(n, sizeof(uint64_t), M_TEMP, M_WAITOK);
> + if (scratch == NULL)
> + temp = mallocarray(n, sizeof(uint64_t), M_TEMP, M_WAITOK);
>  
>   gen = cpumem_first(&cmi, cm);
>   do {
> @@ -202,7 +204,8 @@ counters_read(struct cpumem *cm, uint64_
>   gen = cpumem_next(&cmi, cm);
>   } while (gen != NULL);
>  
> - free(temp, M_TEMP, n * sizeof(uint64_t));
> + if (scratch == NULL)
> + free(temp, M_TEMP, n * sizeof(uint64_t));
>

Re: Replace selinfo by klist in vnode structure

2023-09-08 Thread Alexander Bluhm
On Thu, Sep 07, 2023 at 10:32:58PM +0300, Vitaliy Makkoveev wrote:
> Remove the remnants of the leftover selinfo from vnode(9) layer. Just
> mechanical replacement because knote(9) API is already used. I don't
> want to make klist MP safe with this diff.
> 
> Headers added where it was required. Disabled tmpfs was
> also tested.
> 
> ok?

OK bluhm@

>  #include 
>  #include 
>  #include 
> +#include 

I think these headers were sorted alphabetically as much as possible
before.



Re: syslogd retry dns lookup

2023-09-07 Thread Alexander Bluhm
On Sun, Sep 03, 2023 at 02:00:46AM +0200, Alexander Bluhm wrote:
> Hi,
> 
> When DNS lookup for remote loghost in @ line in syslog.conf does
> not work at startup, retry in intervals.
> 
> Please test if you use the feature.
> 
> ok?

anyone?

> bluhm
> 
> Index: syslogd.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
> retrieving revision 1.277
> diff -u -p -r1.277 syslogd.c
> --- syslogd.c 16 Mar 2023 18:22:08 -  1.277
> +++ syslogd.c 2 Sep 2023 23:42:22 -
> @@ -156,9 +156,12 @@ struct filed {
>   struct sockaddr_storage  f_addr;
>   struct buffertls f_buftls;
>   struct bufferevent  *f_bufev;
> + struct event f_ev;
>   struct tls  *f_ctx;
> + char *f_ipproto;
>   char *f_host;
> - int  f_reconnectwait;
> + char *f_port;
> + int  f_retrywait;
>   } f_forw;   /* forwarding address */
>   char f_fname[PATH_MAX];
>   struct {
> @@ -320,6 +323,7 @@ void   tcp_writecb(struct bufferevent *, 
>  void  tcp_errorcb(struct bufferevent *, short, void *);
>  void  tcp_connectcb(int, short, void *);
>  void  tcp_connect_retry(struct bufferevent *, struct filed *);
> +void  udp_resolvecb(int, short, void *);
>  int   tcpbuf_countmsg(struct bufferevent *bufev);
>  void  die_signalcb(int, short, void *);
>  void  mark_timercb(int, short, void *);
> @@ -1380,7 +1384,7 @@ tcp_writecb(struct bufferevent *bufev, v
>* Successful write, connection to server is good, reset wait time.
>*/
>   log_debug("loghost \"%s\" successful write", f->f_un.f_forw.f_loghost);
> - f->f_un.f_forw.f_reconnectwait = 0;
> + f->f_un.f_forw.f_retrywait = 0;
>  
>   if (f->f_dropped > 0 &&
>   EVBUFFER_LENGTH(f->f_un.f_forw.f_bufev->output) < MAX_TCPBUF) {
> @@ -1453,6 +1457,18 @@ tcp_connectcb(int fd, short event, void 
>   struct bufferevent  *bufev = f->f_un.f_forw.f_bufev;
>   int  s;
>  
> + if (f->f_un.f_forw.f_addr.ss_family == AF_UNSPEC) {
> + if (priv_getaddrinfo(f->f_un.f_forw.f_ipproto,
> + f->f_un.f_forw.f_host, f->f_un.f_forw.f_port,
> + (struct sockaddr*)&f->f_un.f_forw.f_addr,
> + sizeof(f->f_un.f_forw.f_addr)) != 0) {
> + log_warnx("bad hostname \"%s\"",
> + f->f_un.f_forw.f_loghost);
> + tcp_connect_retry(bufev, f);
> + return;
> + }
> + }
> +
>   if ((s = tcp_socket(f)) == -1) {
>   tcp_connect_retry(bufev, f);
>   return;
> @@ -1511,21 +1527,66 @@ tcp_connect_retry(struct bufferevent *bu
>  {
>   struct timeval   to;
>  
> - if (f->f_un.f_forw.f_reconnectwait == 0)
> - f->f_un.f_forw.f_reconnectwait = 1;
> + if (f->f_un.f_forw.f_retrywait == 0)
> + f->f_un.f_forw.f_retrywait = 1;
>   else
> - f->f_un.f_forw.f_reconnectwait <<= 1;
> - if (f->f_un.f_forw.f_reconnectwait > 600)
> - f->f_un.f_forw.f_reconnectwait = 600;
> - to.tv_sec = f->f_un.f_forw.f_reconnectwait;
> + f->f_un.f_forw.f_retrywait <<= 1;
> + if (f->f_un.f_forw.f_retrywait > 600)
> + f->f_un.f_forw.f_retrywait = 600;
> + to.tv_sec = f->f_un.f_forw.f_retrywait;
>   to.tv_usec = 0;
> + evtimer_add(&f->f_un.f_forw.f_ev, &to);
>  
> - log_debug("tcp connect retry: wait %d",
> - f->f_un.f_forw.f_reconnectwait);
> + log_debug("tcp connect retry: wait %d", f->f_un.f_forw.f_retrywait);
>   bufferevent_setfd(bufev, -1);
> - /* We can reuse the write event as bufferevent is disabled. */
> - evtimer_set(&bufev->ev_write, tcp_connectcb, f);
> - evtimer_add(&bufev->ev_write, &to);
> +}
> +
> +void
> +udp_resolvecb(int fd, short event, void *arg)
> +{
> + struct filed*f = arg;
> + struct timeval   to;
> +
> + if (priv_getaddrinfo(f->f_un.f_forw.f_ipproto,
> + f->f_un.f_forw.f_host, f->f_un.f_forw.f_port,
> + (struct sockaddr*)&f->f_un.f_forw.f_addr,
> + 

Re: Use counters_read(9) from ddb(4)

2023-09-06 Thread Alexander Bluhm
On Wed, Sep 06, 2023 at 12:23:33PM -0500, Scott Cheloha wrote:
> On Wed, Sep 06, 2023 at 01:04:19PM +0100, Martin Pieuchot wrote:
> > Debugging OOM is hard.  UVM uses per-CPU counters and sadly
> > counters_read(9) needs to allocate memory.  This is not acceptable in
> > ddb(4).  As a result I cannot see the content of UVM counters in OOM
> > situations.
> > 
> > Diff below introduces a *_static() variant of counters_read(9) that
> > takes a secondary buffer to avoid calling malloc(9).  Is it fine?  Do
> > you have a better idea?  Should we make it the default or using the
> > stack might be a problem?
> 
> Instead of adding a second interface I think we could get away with
> just extending counters_read(9) to take a scratch buffer as an optional
> fourth parameter:
> 
> void
> counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
> uint64_t *scratch);
> 
> "scratch"?  "temp"?  "tmp"?

scratch is fine for me

> This kinda looks like a case where we could annotate these pointers
> with 'restrict', but I have never fully understood when 'restrict' is
> appropriate vs. when it is overkill or useless.

restrict scares me

#ifdef MULTIPROCESSOR

>  void
> -counters_read(struct cpumem *cm, uint64_t *output, unsigned int n)
> +counters_read(struct cpumem *cm, uint64_t *output, unsigned int n,
> +uint64_t *scratch)

#else

>  void
> -counters_read(struct cpumem *cm, uint64_t *output, unsigned int n)
> +counters_read(struct cpumem *cm, uint64_t *output, uint64_t *scratch,
> +unsigned int n)

Here scratch and n are swapped.  Build a ramdisk kernel :-)

bluhm



ip send shared netlock

2023-09-05 Thread Alexander Bluhm
Hi,

ip_output() and ip6_output() should be MP safe when called with
NULL options.

ok?

bluhm

Index: netinet/ip_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.385
diff -u -p -r1.385 ip_input.c
--- netinet/ip_input.c  18 May 2023 09:59:43 -  1.385
+++ netinet/ip_input.c  5 Sep 2023 11:48:25 -
@@ -1851,7 +1851,7 @@ ip_send_do_dispatch(void *xmq, int flags
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_LOCK_SHARED();
while ((m = ml_dequeue(&ml)) != NULL) {
u_int32_t ipsecflowinfo = 0;
 
@@ -1862,7 +1862,7 @@ ip_send_do_dispatch(void *xmq, int flags
}
ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo);
}
-   NET_UNLOCK();
+   NET_UNLOCK_SHARED();
 }
 
 void
Index: netinet6/ip6_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.254
diff -u -p -r1.254 ip6_input.c
--- netinet6/ip6_input.c21 Aug 2022 14:15:55 -  1.254
+++ netinet6/ip6_input.c5 Sep 2023 11:48:44 -
@@ -1572,11 +1572,11 @@ ip6_send_dispatch(void *xmq)
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_LOCK_SHARED();
while ((m = ml_dequeue(&ml)) != NULL) {
ip6_output(m, NULL, NULL, 0, NULL, NULL);
}
-   NET_UNLOCK();
+   NET_UNLOCK_SHARED();
 }
 
 void



Re: tcp sync cache signed use counter

2023-09-04 Thread Alexander Bluhm
On Mon, Sep 04, 2023 at 07:22:03PM +0300, Vitaliy Makkoveev wrote:
> > On 4 Sep 2023, at 16:19, Alexander Bluhm  wrote:
> > 
> > Hi,
> > 
> > Variable scs_use is basically counting packet insertions to syn
> > cache, so I would prefer type long to exclude overflow on fast
> > machines.  With the current limits int should be enough, but long
> > does not hurt.
> > 
> > It can be negative as it starts at a positive limit and counts
> > backwards.  After all entries in the current syn cache have been
> > timed out, it is reset to positive limit.  If timeout takes a while,
> > it may get well below zero.
> > 
> > To prevent netstat output like this
> >18446744073709531826 uses of current SYN cache left
> > make tcps_sc_uses_left signed and print it as long long integer.
> > 
> > ok?
> > 
> 
> Does the negative value output makes sense? If not, could you
> prevent negative value output too?

Goal is to reseed the hash algorithm from time to time.  This cannot
be done while there are still entries in the hash table.

So we have two syn caches, an active and an inactive one.

They switch roles when the active syn cache has been used for a
while and the inactive one is empty.  The scs_use counts the packets
that have been inserted in the active one.

We start at a limit that has been set by sysctl.  We count down to
0, then use the inactive one.  But it may be busy, so we wait until
all entries have timed out.  While we wait, the active one is still
used, and the counter goes negative.

Negative numbers express how long the active syn cache has been
used above the limit.
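
As pseudo code the role switch works roughly like this (simplified
sketch, not the exact source):

	/* sketch: count down; swap and reseed once the other set drained */
	if (set->scs_use <= 0 && other->scs_count == 0) {
		arc4random_buf(other->scs_random, sizeof(other->scs_random));
		other->scs_use = tcp_syn_use_limit;
		set = other;		/* switch active and inactive */
	}
	set->scs_use--;			/* may drop below zero while waiting */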

Of course I could move the range to have positive numbers only.
But then the state "used above limit" would not be obvious anymore.

bluhm

> > Index: sys/netinet/tcp_var.h
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> > retrieving revision 1.170
> > diff -u -p -r1.170 tcp_var.h
> > --- sys/netinet/tcp_var.h   28 Aug 2023 14:50:02 -  1.170
> > +++ sys/netinet/tcp_var.h   4 Sep 2023 13:02:40 -
> > @@ -288,9 +288,9 @@ struct syn_cache_head {
> > 
> > struct syn_cache_set {
> > struct  syn_cache_head *scs_buckethead;
> > +   long scs_use;
> > int scs_size;
> > int scs_count;
> > -   int scs_use;
> > u_int32_t   scs_random[5];
> > };
> > 
> > @@ -430,7 +430,7 @@ struct  tcpstat {
> > u_int64_t tcps_sc_entry_limit;  /* limit of syn cache entries */
> > u_int64_t tcps_sc_bucket_maxlen;/* maximum # of entries in any bucket */
> > u_int64_t tcps_sc_bucket_limit; /* limit of syn cache bucket list */
> > -   u_int64_t tcps_sc_uses_left;/* use counter of current syn cache */
> > +   int64_t tcps_sc_uses_left;  /* use counter of current syn cache */
> > 
> > u_int64_t tcps_conndrained; /* # of connections drained */
> > 
> > Index: usr.bin/netstat/inet.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
> > retrieving revision 1.178
> > diff -u -p -r1.178 inet.c
> > --- usr.bin/netstat/inet.c  7 Jul 2023 09:15:13 -   1.178
> > +++ usr.bin/netstat/inet.c  4 Sep 2023 12:51:01 -
> > @@ -498,7 +498,7 @@ tcp_stats(char *name)
> > "\t%llu entr%s in current SYN cache, limit is %llu\n");
> > p2b(tcps_sc_bucket_maxlen, tcps_sc_bucket_limit,
> > "\t%llu longest bucket length in current SYN cache, limit is 
> > %llu\n");
> > -   p(tcps_sc_uses_left, "\t%llu use%s of current SYN cache left\n");
> > +   p(tcps_sc_uses_left, "\t%lld use%s of current SYN cache left\n");
> > 
> > p(tcps_sack_recovery_episode, "\t%llu SACK recovery episode%s\n");
> > p(tcps_sack_rexmits,
> > 



tcp sync cache signed use counter

2023-09-04 Thread Alexander Bluhm
Hi,

Variable scs_use is basically counting packet insertions to syn
cache, so I would prefer type long to exclude overflow on fast
machines.  With the current limits int should be enough, but long
does not hurt.

It can be negative as it starts at a positive limit and counts
backwards.  After all entries in the current syn cache have been
timed out, it is reset to positive limit.  If timeout takes a while,
it may get well below zero.

To prevent netstat output like this
18446744073709531826 uses of current SYN cache left
make tcps_sc_uses_left signed and print it as long long integer.

ok?

bluhm

Index: sys/netinet/tcp_var.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.170
diff -u -p -r1.170 tcp_var.h
--- sys/netinet/tcp_var.h   28 Aug 2023 14:50:02 -  1.170
+++ sys/netinet/tcp_var.h   4 Sep 2023 13:02:40 -
@@ -288,9 +288,9 @@ struct syn_cache_head {
 
 struct syn_cache_set {
struct  syn_cache_head *scs_buckethead;
+   long scs_use;
int scs_size;
int scs_count;
-   int scs_use;
u_int32_t   scs_random[5];
 };
 
@@ -430,7 +430,7 @@ struct  tcpstat {
u_int64_t tcps_sc_entry_limit;  /* limit of syn cache entries */
u_int64_t tcps_sc_bucket_maxlen;/* maximum # of entries in any bucket */
u_int64_t tcps_sc_bucket_limit; /* limit of syn cache bucket list */
-   u_int64_t tcps_sc_uses_left;/* use counter of current syn cache */
+   int64_t tcps_sc_uses_left;  /* use counter of current syn cache */
 
u_int64_t tcps_conndrained; /* # of connections drained */
 
Index: usr.bin/netstat/inet.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
retrieving revision 1.178
diff -u -p -r1.178 inet.c
--- usr.bin/netstat/inet.c  7 Jul 2023 09:15:13 -   1.178
+++ usr.bin/netstat/inet.c  4 Sep 2023 12:51:01 -
@@ -498,7 +498,7 @@ tcp_stats(char *name)
"\t%llu entr%s in current SYN cache, limit is %llu\n");
p2b(tcps_sc_bucket_maxlen, tcps_sc_bucket_limit,
"\t%llu longest bucket length in current SYN cache, limit is 
%llu\n");
-   p(tcps_sc_uses_left, "\t%llu use%s of current SYN cache left\n");
+   p(tcps_sc_uses_left, "\t%lld use%s of current SYN cache left\n");
 
p(tcps_sack_recovery_episode, "\t%llu SACK recovery episode%s\n");
p(tcps_sack_rexmits,



tcp sync cache refcount improvement

2023-09-03 Thread Alexander Bluhm
Hi,

Avoid a useless increment and decrement of the tcp syn cache
refcount by unexpanding the SYN_CACHE_TIMER_ARM() macro in the timer
callback.

ok?

bluhm

Index: netinet/tcp_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.390
diff -u -p -r1.390 tcp_input.c
--- netinet/tcp_input.c 28 Aug 2023 14:50:01 -  1.390
+++ netinet/tcp_input.c 3 Sep 2023 18:04:13 -
@@ -3159,19 +3159,6 @@ syn_cache_put(struct syn_cache *sc)
pool_put(&syn_cache_pool, sc);
 }
 
-/*
- * We don't estimate RTT with SYNs, so each packet starts with the default
- * RTT and each timer step has a fixed timeout value.
- */
-#define SYN_CACHE_TIMER_ARM(sc) \
-do {   \
-   TCPT_RANGESET((sc)->sc_rxtcur,  \
-   TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
-   TCPTV_REXMTMAX);\
-   if (timeout_add_msec(&(sc)->sc_timer, (sc)->sc_rxtcur)) \
-   refcnt_take(&(sc)->sc_refcnt);  \
-} while (/*CONSTCOND*/0)
-
 void
 syn_cache_init(void)
 {
@@ -3300,11 +3287,17 @@ syn_cache_insert(struct syn_cache *sc, s
}
 
/*
-* Initialize the entry's timer.
+* Initialize the entry's timer.  We don't estimate RTT
+* with SYNs, so each packet starts with the default RTT
+* and each timer step has a fixed timeout value.
 */
sc->sc_rxttot = 0;
sc->sc_rxtshift = 0;
-   SYN_CACHE_TIMER_ARM(sc);
+   TCPT_RANGESET(sc->sc_rxtcur,
+   TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
+   TCPTV_REXMTMAX);
+   if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
+   refcnt_take(&sc->sc_refcnt);
 
/* Link it from tcpcb entry */
refcnt_take(&sc->sc_refcnt);
@@ -3365,15 +3358,12 @@ syn_cache_timer(void *arg)
 
/* Advance the timer back-off. */
sc->sc_rxtshift++;
-   SYN_CACHE_TIMER_ARM(sc);
+   TCPT_RANGESET(sc->sc_rxtcur,
+   TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
+   TCPTV_REXMTMAX);
+   if (!timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
+   syn_cache_put(sc);
 
-   /*
-* Decrement reference of this timer.  We know there is another timer
-* as we just added it.  So just deref, free is not necessary.
-*/
-   lastref = refcnt_rele(&sc->sc_refcnt);
-   KASSERT(lastref == 0);
-   (void)lastref;
NET_UNLOCK();
return;
 



syslogd retry dns lookup

2023-09-02 Thread Alexander Bluhm
Hi,

When DNS lookup for remote loghost in @ line in syslog.conf does
not work at startup, retry in intervals.
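
For reference, the lookup happens for forwarding lines in syslog.conf
like this one (host name is just an example):

	*.*	@loghost.example.com:514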

Please test if you use the feature.

ok?

bluhm

Index: syslogd.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/syslogd/syslogd.c,v
retrieving revision 1.277
diff -u -p -r1.277 syslogd.c
--- syslogd.c   16 Mar 2023 18:22:08 -  1.277
+++ syslogd.c   2 Sep 2023 23:42:22 -
@@ -156,9 +156,12 @@ struct filed {
struct sockaddr_storage  f_addr;
struct buffertls f_buftls;
struct bufferevent  *f_bufev;
+   struct event f_ev;
struct tls  *f_ctx;
+   char *f_ipproto;
char *f_host;
-   int  f_reconnectwait;
+   char *f_port;
+   int  f_retrywait;
} f_forw;   /* forwarding address */
char f_fname[PATH_MAX];
struct {
@@ -320,6 +323,7 @@ void tcp_writecb(struct bufferevent *, 
 void tcp_errorcb(struct bufferevent *, short, void *);
 void tcp_connectcb(int, short, void *);
 void tcp_connect_retry(struct bufferevent *, struct filed *);
+void udp_resolvecb(int, short, void *);
 int tcpbuf_countmsg(struct bufferevent *bufev);
 void die_signalcb(int, short, void *);
 void mark_timercb(int, short, void *);
@@ -1380,7 +1384,7 @@ tcp_writecb(struct bufferevent *bufev, v
 * Successful write, connection to server is good, reset wait time.
 */
log_debug("loghost \"%s\" successful write", f->f_un.f_forw.f_loghost);
-   f->f_un.f_forw.f_reconnectwait = 0;
+   f->f_un.f_forw.f_retrywait = 0;
 
if (f->f_dropped > 0 &&
EVBUFFER_LENGTH(f->f_un.f_forw.f_bufev->output) < MAX_TCPBUF) {
@@ -1453,6 +1457,18 @@ tcp_connectcb(int fd, short event, void 
struct bufferevent  *bufev = f->f_un.f_forw.f_bufev;
int  s;
 
+   if (f->f_un.f_forw.f_addr.ss_family == AF_UNSPEC) {
+   if (priv_getaddrinfo(f->f_un.f_forw.f_ipproto,
+   f->f_un.f_forw.f_host, f->f_un.f_forw.f_port,
+   (struct sockaddr*)&f->f_un.f_forw.f_addr,
+   sizeof(f->f_un.f_forw.f_addr)) != 0) {
+   log_warnx("bad hostname \"%s\"",
+   f->f_un.f_forw.f_loghost);
+   tcp_connect_retry(bufev, f);
+   return;
+   }
+   }
+
if ((s = tcp_socket(f)) == -1) {
tcp_connect_retry(bufev, f);
return;
@@ -1511,21 +1527,66 @@ tcp_connect_retry(struct bufferevent *bu
 {
struct timeval   to;
 
-   if (f->f_un.f_forw.f_reconnectwait == 0)
-   f->f_un.f_forw.f_reconnectwait = 1;
+   if (f->f_un.f_forw.f_retrywait == 0)
+   f->f_un.f_forw.f_retrywait = 1;
else
-   f->f_un.f_forw.f_reconnectwait <<= 1;
-   if (f->f_un.f_forw.f_reconnectwait > 600)
-   f->f_un.f_forw.f_reconnectwait = 600;
-   to.tv_sec = f->f_un.f_forw.f_reconnectwait;
+   f->f_un.f_forw.f_retrywait <<= 1;
+   if (f->f_un.f_forw.f_retrywait > 600)
+   f->f_un.f_forw.f_retrywait = 600;
+   to.tv_sec = f->f_un.f_forw.f_retrywait;
to.tv_usec = 0;
+   evtimer_add(&f->f_un.f_forw.f_ev, &to);
 
-   log_debug("tcp connect retry: wait %d",
-   f->f_un.f_forw.f_reconnectwait);
+   log_debug("tcp connect retry: wait %d", f->f_un.f_forw.f_retrywait);
bufferevent_setfd(bufev, -1);
-   /* We can reuse the write event as bufferevent is disabled. */
-   evtimer_set(&bufev->ev_write, tcp_connectcb, f);
-   evtimer_add(&bufev->ev_write, &to);
+}
+
+void
+udp_resolvecb(int fd, short event, void *arg)
+{
+   struct filed*f = arg;
+   struct timeval   to;
+
+   if (priv_getaddrinfo(f->f_un.f_forw.f_ipproto,
+   f->f_un.f_forw.f_host, f->f_un.f_forw.f_port,
+   (struct sockaddr*)&f->f_un.f_forw.f_addr,
+   sizeof(f->f_un.f_forw.f_addr)) == 0) {
+   switch (f->f_un.f_forw.f_addr.ss_family) {
+   case AF_INET:
+   log_debug("resolved \"%s\" to IPv4 address",
+   f->f_un.f_forw.f_loghost);
+   f->f_file = fd_udp;
+   break;
+   case AF_INET6:
+   log_debug("resolved \"%s\" to IPv6 address",
+   f->f_un.f_forw.f_loghost);
+   f->f_file = fd_udp6;
+   break;
+   }
+   f->f_un.f_forw.f_retrywait = 0;
+
+   if 

Re: link mbufs/inpcbs to pf_states, not pf_state_keys

2023-08-21 Thread Alexander Bluhm
On Thu, Aug 17, 2023 at 12:02:35PM +1000, David Gwynne wrote:
> there are links between the pcb/socket layer and pf as an optimisation,
> and links on mbufs between both sides of a forwarded connection.
> these links let pf skip an rb tree lookup for outgoing packets.
> 
> right now these links are between pf_state_key structs, which are the
> things that contain the actual addresses used by the connection, but you
> then have to iterate over a list in pf_state_keys to get to the pf_state
> structures.
> 
> i dont understand why we dont just link the actual pf_state structs.
> my best guess is there wasnt enough machinery (ie, refcnts and
> mtxes) on a pf_state struct to make it safe, so the compromise was
> the pf_state keys. it still got to avoid the tree lookup.

This linkage is much older than MP in pf.  There was no refcount
and mutex when added by henning@.

> i wanted this to make it easier to look up information on pf states from
> the socket layer, but sashan@ said i should send it out. i do think it
> makes things a bit easier to understand.
> 
> the most worrying bit is the change to pf_state_find().
> 
> thoughts? ok?

It took me years to get the logic correct for all corner cases.
When using pf-divert with UDP strange things start to happen.  Most
things are covered by regress/sys/net/pf_divert .  You have to setup
two machines to run it.
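
For reference, the test is driven from the local machine against a
second target host; the invocation looks roughly like this (sketch,
variable names may differ from the actual Makefile):

	cd /usr/src/regress/sys/net/pf_divert
	make LOCAL_ADDR=10.0.0.1 REMOTE_ADDR=10.0.0.2 \
	    REMOTE_SSH=root@target regress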

I don't know why it was written that way, but I know it works for
me.  What do you want to fix?

bluhm

> Index: kern/uipc_mbuf.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 uipc_mbuf.c
> --- kern/uipc_mbuf.c  23 Jun 2023 04:36:49 -  1.287
> +++ kern/uipc_mbuf.c  17 Aug 2023 01:31:04 -
> @@ -308,7 +308,7 @@ m_clearhdr(struct mbuf *m)
>   /* delete all mbuf tags to reset the state */
>   m_tag_delete_chain(m);
>  #if NPF > 0
> - pf_mbuf_unlink_state_key(m);
> + pf_mbuf_unlink_state(m);
>   pf_mbuf_unlink_inpcb(m);
>  #endif   /* NPF > 0 */
>  
> @@ -440,7 +440,7 @@ m_free(struct mbuf *m)
>   if (m->m_flags & M_PKTHDR) {
>   m_tag_delete_chain(m);
>  #if NPF > 0
> - pf_mbuf_unlink_state_key(m);
> + pf_mbuf_unlink_state(m);
>   pf_mbuf_unlink_inpcb(m);
>  #endif   /* NPF > 0 */
>   }
> @@ -1398,8 +1398,8 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
>   to->m_pkthdr = from->m_pkthdr;
>  
>  #if NPF > 0
> - to->m_pkthdr.pf.statekey = NULL;
> - pf_mbuf_link_state_key(to, from->m_pkthdr.pf.statekey);
> + to->m_pkthdr.pf.st = NULL;
> + pf_mbuf_link_state(to, from->m_pkthdr.pf.st);
>   to->m_pkthdr.pf.inp = NULL;
>   pf_mbuf_link_inpcb(to, from->m_pkthdr.pf.inp);
>  #endif   /* NPF > 0 */
> @@ -1526,8 +1526,8 @@ m_print(void *v,
>   m->m_pkthdr.csum_flags, MCS_BITS);
>   (*pr)("m_pkthdr.ether_vtag: %u\tm_ptkhdr.ph_rtableid: %u\n",
>   m->m_pkthdr.ether_vtag, m->m_pkthdr.ph_rtableid);
> - (*pr)("m_pkthdr.pf.statekey: %p\tm_pkthdr.pf.inp %p\n",
> - m->m_pkthdr.pf.statekey, m->m_pkthdr.pf.inp);
> + (*pr)("m_pkthdr.pf.st: %p\tm_pkthdr.pf.inp %p\n",
> + m->m_pkthdr.pf.st, m->m_pkthdr.pf.inp);
>   (*pr)("m_pkthdr.pf.qid: %u\tm_pkthdr.pf.tag: %u\n",
>   m->m_pkthdr.pf.qid, m->m_pkthdr.pf.tag);
>   (*pr)("m_pkthdr.pf.flags: %b\n",
> Index: net/if_mpw.c
> ===
> RCS file: /cvs/src/sys/net/if_mpw.c,v
> retrieving revision 1.63
> diff -u -p -r1.63 if_mpw.c
> --- net/if_mpw.c  29 Aug 2022 07:51:45 -  1.63
> +++ net/if_mpw.c  17 Aug 2023 01:31:04 -
> @@ -620,7 +620,7 @@ mpw_input(struct mpw_softc *sc, struct m
>   m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
>  
>   /* packet has not been processed by PF yet. */
> - KASSERT(m->m_pkthdr.pf.statekey == NULL);
> + KASSERT(m->m_pkthdr.pf.st == NULL);
>  
>   if_vinput(ifp, m);
>   return;
> Index: net/if_tpmr.c
> ===
> RCS file: /cvs/src/sys/net/if_tpmr.c,v
> retrieving revision 1.33
> diff -u -p -r1.33 if_tpmr.c
> --- net/if_tpmr.c 16 May 2023 14:32:54 -  1.33
> +++ net/if_tpmr.c 17 Aug 2023 01:31:04 -
> @@ -303,7 +303,7 @@ tpmr_pf(struct ifnet *ifp0, int dir, str
>   return (NULL);
>  
>   if (dir == PF_IN && ISSET(m->m_pkthdr.pf.flags, PF_TAG_DIVERTED)) {
> - pf_mbuf_unlink_state_key(m);
> + pf_mbuf_unlink_state(m);
>   pf_mbuf_unlink_inpcb(m);
>   (*fam->ip_input)(ifp0, m);
>   return (NULL);
> Index: net/if_veb.c
> ===
> RCS file: /cvs/src/sys/net/if_veb.c,v
> retrieving revision 1.31
> diff -u -p -r1.31 if_veb.c
> --- 

Re: sosetopt(): merge SO_SND* with corresponding SO_RCV* cases

2023-08-08 Thread Alexander Bluhm
On Wed, Aug 09, 2023 at 12:41:18AM +0300, Vitaliy Makkoveev wrote:
> I think it's better to merge SO_BINDANY cases from both switch blocks.
> This time SO_LINGER case is separated, so there is no reason for
> dedicated switch block.

OK bluhm@

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.307
> diff -u -p -r1.307 uipc_socket.c
> --- sys/kern/uipc_socket.c3 Aug 2023 09:49:08 -   1.307
> +++ sys/kern/uipc_socket.c8 Aug 2023 21:35:50 -
> @@ -1800,13 +1800,6 @@ sosetopt(struct socket *so, int level, i
>   error = ENOPROTOOPT;
>   } else {
>   switch (optname) {
> - case SO_BINDANY:
> - if ((error = suser(curproc)) != 0)  /* XXX */
> - return (error);
> - break;
> - }
> -
> - switch (optname) {
>  
>   case SO_LINGER:
>   if (m == NULL || m->m_len != sizeof (struct linger) ||
> @@ -1824,6 +1817,9 @@ sosetopt(struct socket *so, int level, i
>  
>   break;

Can you add a newline here?  All other cases have one.

>   case SO_BINDANY:
> + if ((error = suser(curproc)) != 0)  /* XXX */
> + return (error);
> + /* FALLTHROUGH */
>   case SO_DEBUG:
>   case SO_KEEPALIVE:
>   case SO_USELOOPBACK:



Re: sosetopt(): merge SO_SND* with corresponding SO_RCV* cases

2023-08-08 Thread Alexander Bluhm
On Fri, Aug 04, 2023 at 12:38:23AM +0300, Vitaliy Makkoveev wrote:
> @@ -1856,6 +1856,9 @@ sosetopt(struct socket *so, int level, i
>   case SO_SNDLOWAT:
>   case SO_RCVLOWAT:
>   {
> + struct sockbuf *sb = (optname == SO_SNDBUF ||
> + optname == SO_SNDLOWAT ?
> + &so->so_snd : &so->so_rcv);
> @@ -1910,6 +1896,8 @@ sosetopt(struct socket *so, int level, i
>   case SO_SNDTIMEO:
>   case SO_RCVTIMEO:
>   {
> + struct sockbuf *sb = (optname == SO_SNDTIMEO ?
> + &so->so_snd : &so->so_rcv);

Would it be nicer to set sb in the switch (optname) at the begining?

struct sockbuf *sb = NULL;

switch (optname) {
case SO_SNDBUF:
case SO_SNDLOWAT:
case SO_SNDTIMEO:
sb = &so->so_snd;
break;
case SO_RCVBUF:
case SO_RCVLOWAT:
case SO_RCVTIMEO:
sb = &so->so_rcv;
break;
case SO_BINDANY:
...
}

Anyway, OK bluhm@



Re: pf(4) may cause relayd(8) to abort

2023-08-08 Thread Alexander Bluhm
On Tue, Aug 01, 2023 at 01:50:52AM +0200, Alexandr Nedvedicky wrote:
> OK to commit?

OK bluhm@

> 8<---8<---8<--8<
> diff --git a/sys/net/pf_table.c b/sys/net/pf_table.c
> index 6f23a6f795d..c862c804f84 100644
> --- a/sys/net/pf_table.c
> +++ b/sys/net/pf_table.c
> @@ -1565,8 +1565,10 @@ pfr_add_tables(struct pfr_table *tbl, int size, int 
> *nadd, int flags)
>   xadd++;
>   } else if (!(flags & PFR_FLAG_DUMMY) &&
>   !(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
> - p->pfrkt_nflags = (p->pfrkt_flags &
> - ~PFR_TFLAG_USRMASK) | PFR_TFLAG_ACTIVE;
> + p->pfrkt_nflags =
> + (p->pfrkt_flags & ~PFR_TFLAG_USRMASK) |
> + (n->pfrkt_flags & PFR_TFLAG_USRMASK) |
> + PFR_TFLAG_ACTIVE;
> + SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq);
>   }
>   }
> diff --git a/regress/sys/net/pf_table/Makefile 
> b/regress/sys/net/pf_table/Makefile
> index a71f0190c73..8911e8a1d35 100644
> --- a/regress/sys/net/pf_table/Makefile
> +++ b/regress/sys/net/pf_table/Makefile
> @@ -1,15 +1,26 @@
>  #$OpenBSD: Makefile,v 1.3 2017/07/07 23:15:27 bluhm Exp $
>  
> -REGRESS_TARGETS= hit miss cleanup
> -CLEANFILES=  stamp-*
> +REGRESS_TARGETS= hit miss cleanup flags
> +CLEANFILES=  stamp-* \
> + pf-reftab.conf  \
> + pf-instance.conf\
> + table-ref.conf  \
> + table-pgone.out \
> + table-persist.out   \
> + table-ref.out   \
> + table-refgone.out
> +
>  
>  stamp-setup:
> + ${SUDO} pfctl -a regress/ttest -Fa
>   ${SUDO} pfctl -qt __regress_tbl -T add -f ${.CURDIR}/table.in
>   date >$@
>  
>  cleanup:
>   rm -f stamp-setup
>   ${SUDO} pfctl -qt __regress_tbl -T kill
> + ${SUDO} pfctl -q -a regress/ttest -Fr
> + ${SUDO} pfctl -q -a regress/ttest -qt instance -T kill
>  
>  hit: stamp-setup
>   for i in `cat ${.CURDIR}/table.hit`; do \
> @@ -27,6 +38,77 @@ miss: stamp-setup
>   done; \
>   exit 0
>  
> -.PHONY: hit miss
> +#
> +# tables <instance> and <reference> are both referenced by rule only
> +#
> +pf-instab.conf:
> + @echo 'table <instance> { 192.168.1.0/24 }' > $@
> + @echo 'pass in from <instance> to <reference>' >> $@
> +
> +#
> +# table <instance> is active and referred by rule, table <reference>
> +# is referenced only.
> +pf-reftab.conf:
> + @echo 'pass in from <instance> to <reference>' > $@
> +
> +#
> +# check persistent flag (p) is gone from table <instance> after
> +# we load pf-instab.conf. Deals with case when persistent table <instance>
> +# exists before pf-instab.conf gets loaded.
> +#
> +table-pgone.out:
> + @echo '--a-r--  instance  regress/ttest' > $@
> + @echo 'r--  reference   regress/ttest' >> $@
> +
> +#
> +# verify table <instance> got persistent flag after we
> +# run 'pfctl -t instance -T add ...'
> +#
> +table-persist.out:
> + @echo '-pa-r--  instance  regress/ttest' > $@
> + @echo 'r--  reference   regress/ttest' >> $@
> +
> +#
> +# verify tables <instance> and <reference> are created on behalf of
> +# reference by rule after pf-reftab.conf got loaded.
> +#
> +table-ref.out:
> + @echo 'r--  instance  regress/ttest' > $@
> + @echo 'r--  reference   regress/ttest' >> $@
> +
> +#
> +# verify reference to <instance> table (persistent) is gone
> +# after rules got flushed
> +#
> +table-refgone.out:
> + @echo '-pa  instance  regress/ttest' > $@
> +
> +flags: pf-instab.conf pf-reftab.conf table-pgone.out table-persist.out \
> +table-ref.out table-refgone.out
> + @echo 'loading pf-reftab.conf (tables referenced by rules only)'
> + @cat pf-reftab.conf
> + ${SUDO} pfctl -a regress/ttest -f pf-reftab.conf
> + @echo 'tables <instance> and <reference> should both have r--'
> + ${SUDO} pfctl -a regress/ttest -sT -vg | diff table-ref.out -
> + @echo 'creating <instance> table on command line, flags should be:'
> + @cat table-persist.out
> + ${SUDO} pfctl -a regress/ttest -t instance -T add 192.168.1.0/24
> + ${SUDO} pfctl -a regress/ttest -sT -vg | diff table-persist.out -
> + @echo 'flushing rules'
> + ${SUDO} pfctl -a regress/ttest -Fr
> + @echo 'table <reference> should be gone, table <instance> should stay'
> + ${SUDO} pfctl -a regress/ttest -sT -vg | diff table-refgone.out -
> + @echo 'loading pf-instab.conf'
> + @cat pf-instab.conf
> + ${SUDO} pfctl -a regress/ttest -f pf-instab.conf
> + @echo 'table <instance> loses -p- flag:'
> + @cat table-pgone.out
> + ${SUDO} pfctl -a regress/ttest -sT -vg | diff table-pgone.out -
> + @echo 'flushing rules, both tables should be gone'
> + ${SUDO} pfctl -a regress/ttest -Fr
> + @echo 'anchor regress/ttest must be gone'
> + ${SUDO} pfctl -a regress/ttest -sr 2>&1 | grep 'pfctl: Anchor does not 
> exist'

Re: Move solock() down to sosetopt()

2023-07-28 Thread Alexander Bluhm
On Sat, Jul 22, 2023 at 05:42:49PM +0300, Vitaliy Makkoveev wrote:
> Thanks for testing. There is updated diff below.

OK bluhm@

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 uipc_socket.c
> --- sys/kern/uipc_socket.c4 Jul 2023 22:28:24 -   1.305
> +++ sys/kern/uipc_socket.c22 Jul 2023 14:38:43 -
> @@ -1789,12 +1789,12 @@ sosetopt(struct socket *so, int level, i
>  {
>   int error = 0;
>  
> - soassertlocked(so);
> -
>   if (level != SOL_SOCKET) {
>   if (so->so_proto->pr_ctloutput) {
> + solock(so);
>   error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
>   level, optname, m);
> + sounlock(so);
>   return (error);
>   }
>   error = ENOPROTOOPT;
> @@ -1813,9 +1813,16 @@ sosetopt(struct socket *so, int level, i
>   mtod(m, struct linger *)->l_linger < 0 ||
>   mtod(m, struct linger *)->l_linger > SHRT_MAX)
>   return (EINVAL);
> +
> + solock(so);
>   so->so_linger = mtod(m, struct linger *)->l_linger;
> - /* FALLTHROUGH */
> + if (*mtod(m, int *))
> + so->so_options |= optname;
> + else
> + so->so_options &= ~optname;
> + sounlock(so);
>  
> + break;
>   case SO_BINDANY:
>   case SO_DEBUG:
>   case SO_KEEPALIVE:
> @@ -1828,12 +1835,15 @@ sosetopt(struct socket *so, int level, i
>   case SO_ZEROIZE:
>   if (m == NULL || m->m_len < sizeof (int))
>   return (EINVAL);
> +
> + solock(so);
>   if (*mtod(m, int *))
>   so->so_options |= optname;
>   else
>   so->so_options &= ~optname;
> - break;
> + sounlock(so);
>  
> + break;
>   case SO_DONTROUTE:
>   if (m == NULL || m->m_len < sizeof (int))
>   return (EINVAL);
> @@ -1853,23 +1863,32 @@ sosetopt(struct socket *so, int level, i
>   cnt = *mtod(m, int *);
>   if ((long)cnt <= 0)
>   cnt = 1;
> - switch (optname) {
>  
> + solock(so);
> + switch (optname) {
>   case SO_SNDBUF:
> - if (so->so_snd.sb_state & SS_CANTSENDMORE)
> - return (EINVAL);
> + if (so->so_snd.sb_state & SS_CANTSENDMORE) {
> + error = EINVAL;
> + break;
> + }
>   if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
> - sbreserve(so, &so->so_snd, cnt))
> - return (ENOBUFS);
> + sbreserve(so, &so->so_snd, cnt)) {
> + error = ENOBUFS;
> + break;
> + }
>   so->so_snd.sb_wat = cnt;
>   break;
>  
>   case SO_RCVBUF:
> - if (so->so_rcv.sb_state & SS_CANTRCVMORE)
> - return (EINVAL);
> + if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
> + error = EINVAL;
> + break;
> + }
>   if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
> - sbreserve(so, &so->so_rcv, cnt))
> - return (ENOBUFS);
> + sbreserve(so, &so->so_rcv, cnt)) {
> + error = ENOBUFS;
> + break;
> + }
>   so->so_rcv.sb_wat = cnt;
>   break;
>  
> @@ -1884,6 +1903,7 @@ sosetopt(struct socket *so, int level, i
>   so->so_rcv.sb_hiwat : cnt;
>   break;
>   }
> + sounlock(so);
>   break;
>   }
>  
> @@ -1903,8 +1923,9 @@ sosetopt(struct socket *so, int level, i
>   return (EDOM);
>   

Re: fix vlan handling with tcplro on ix(4)

2023-07-27 Thread Alexander Bluhm
On Thu, Jul 27, 2023 at 09:15:48AM -0700, Chris Cappuccio wrote:
> Jan Klemkow [j.klem...@wemelug.de] wrote:
> > +#if NVLAN > 0
> > +   if (ext.evh)
> > +   hdrlen += ETHER_VLAN_ENCAP_LEN;
> > +#endif
> > if (ext.ip4)
> > hdrlen += ext.ip4->ip_hl << 2;
> > if (ext.ip6)
> 
> I'm not sure this should be tied to the compilation of the vlan
> driver in the kernel. Since the length is larger in either case,
> the ix driver should behave the same in either case.

If NVLAN is not compiled in, vlan should be ignored here.  That
works as ether_extract_headers() also has #if NVLAN > 0.  Then
ext.ip4 and ext.ip6 are NULL.  This is correct in this place.
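
So a caller only needs to check the extracted pointers; the defensive
pattern looks like this (sketch, not a specific driver):

	struct ether_extracted ext;

	ether_extract_headers(m, &ext);
	if (ext.tcp == NULL)
		return;		/* unknown or unhandled encapsulation */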

More important question is what happens when ether_extract_headers()
is called during transmit in em(4), igc(4), ixl(4) and ix(4).
Without NVLAN in kernel everything is fine as ether_extract_headers()
ignores IP in packets with vlan tag.

With vlan it looks like checksum offloading is broken if M_VLANTAG
is not set.  But with vlan hardware offloading everything should
work.  That is the case for em(4), ixl(4) and ix(4).

igc(4) has #ifdef notyet #if NVLAN > 0, so I guess transmit checksum
offload does not work for vlan packets.  Jan's diff improves it a
little bit, but does not fix it completely.  I will test and see
what works.  But this is unrelated to Jan's fix.

I think it should be committed, as I see better test results with
ix(4) and it should fix sthen's setup.

bluhm



Re: fix vlan handling with tcplro on ix(4)

2023-07-26 Thread Alexander Bluhm
On Wed, Jul 26, 2023 at 11:30:45AM +0200, Jan Klemkow wrote:
> Hi,
> 
> I missed the vlan-tag size in the mss calculation of lro packets in
> ix(4).  This diff add vlan-header detection in ether_extract_headers()
> and uses this information to calculate the right mss.
> 
> This fixes forwarding of vlan tagged lro packets.
> 
> ok?

OK bluhm@
I will test later.  My machines are currently down.

> Index: dev/pci/if_ix.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.200
> diff -u -p -r1.200 if_ix.c
> --- dev/pci/if_ix.c   18 Jul 2023 16:01:20 -  1.200
> +++ dev/pci/if_ix.c   26 Jul 2023 09:21:15 -
> @@ -3275,6 +3275,10 @@ ixgbe_rxeof(struct rx_ring *rxr)
>   /* Calculate header size. */
>   ether_extract_headers(sendmp, &ext);
>   hdrlen = sizeof(*ext.eh);
> +#if NVLAN > 0
> + if (ext.evh)
> + hdrlen += ETHER_VLAN_ENCAP_LEN;
> +#endif
>   if (ext.ip4)
>   hdrlen += ext.ip4->ip_hl << 2;
>   if (ext.ip6)
> Index: net/if_ethersubr.c
> ===
> RCS file: /cvs/src/sys/net/if_ethersubr.c,v
> retrieving revision 1.290
> diff -u -p -r1.290 if_ethersubr.c
> --- net/if_ethersubr.c6 Jul 2023 19:46:53 -   1.290
> +++ net/if_ethersubr.c26 Jul 2023 09:20:57 -
> @@ -1040,6 +1040,7 @@ ether_extract_headers(struct mbuf *mp, s
>   uint64_t hlen;
>   int  hoff;
>   uint8_t  ipproto;
> + uint16_t ether_type;
>  
>   /* Return NULL if header was not recognized. */
>   memset(ext, 0, sizeof(*ext));
> @@ -1048,9 +1049,20 @@ ether_extract_headers(struct mbuf *mp, s
>   return;
>  
>   ext->eh = mtod(mp, struct ether_header *);
> - switch (ntohs(ext->eh->ether_type)) {
> + ether_type = ntohs(ext->eh->ether_type);
> + hlen = sizeof(*ext->eh);
> +
> +#if NVLAN > 0
> + if (ether_type == ETHERTYPE_VLAN) {
> + ext->evh = mtod(mp, struct ether_vlan_header *);
> + ether_type = ntohs(ext->evh->evl_proto);
> + hlen = sizeof(*ext->evh);
> + }
> +#endif
> +
> + switch (ether_type) {
>   case ETHERTYPE_IP:
> - m = m_getptr(mp, sizeof(*ext->eh), &hoff);
> + m = m_getptr(mp, hlen, &hoff);
>   if (m == NULL || m->m_len - hoff < sizeof(*ext->ip4))
>   return;
>   ext->ip4 = (struct ip *)(mtod(m, caddr_t) + hoff);
> @@ -1064,7 +1076,7 @@ ether_extract_headers(struct mbuf *mp, s
>   break;
>  #ifdef INET6
>   case ETHERTYPE_IPV6:
> - m = m_getptr(mp, sizeof(*ext->eh), &hoff);
> + m = m_getptr(mp, hlen, &hoff);
>   if (m == NULL || m->m_len - hoff < sizeof(*ext->ip6))
>   return;
>   ext->ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + hoff);
> Index: netinet/if_ether.h
> ===
> RCS file: /cvs/src/sys/netinet/if_ether.h,v
> retrieving revision 1.89
> diff -u -p -r1.89 if_ether.h
> --- netinet/if_ether.h6 Jul 2023 19:46:53 -   1.89
> +++ netinet/if_ether.h26 Jul 2023 09:20:22 -
> @@ -301,11 +301,12 @@ uint64_tether_addr_to_e64(const struct 
>  void ether_e64_to_addr(struct ether_addr *, uint64_t);
>  
>  struct ether_extracted {
> - struct ether_header *eh;
> - struct ip   *ip4;
> - struct ip6_hdr  *ip6;
> - struct tcphdr   *tcp;
> - struct udphdr   *udp;
> + struct ether_header *eh;
> + struct ether_vlan_header*evh;
> + struct ip   *ip4;
> + struct ip6_hdr  *ip6;
> + struct tcphdr   *tcp;
> + struct udphdr   *udp;
>  };
>  
>  void ether_extract_headers(struct mbuf *, struct ether_extracted *);



OpenBSD Errata: July 25, 2023 (hv amd cpu)

2023-07-25 Thread Alexander Bluhm
Errata patches for guests running on hypervisors on AMD cpus have
been released for OpenBSD 7.2 and 7.3.

Binary updates for the amd64 and i386 platform are available via
the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



OpenBSD Errata: July 24, 2023 (amd cpu firmware, wscons)

2023-07-24 Thread Alexander Bluhm
Errata patches for AMD cpus with their firmware updates and for
wscons have been released for OpenBSD 7.2 and 7.3.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html

After syspatch on amd64 and i386, "fw_update" and "installboot"
must be run before reboot.  See errata file for detailed instructions.



Re: Move solock() down to sosetopt()

2023-07-21 Thread Alexander Bluhm
On Thu, Jul 13, 2023 at 02:22:17AM +0300, Vitaliy Makkoveev wrote:
> This is a part of my standalone sblock() work. I need this movement
> because buffers related SO_SND* and SO_RCV* socket options modification
> should be protected with sblock(). However, standalone sblock() has
> different lock orders with solock() for receive and send buffers. At
> least sblock() for `so_snd' buffer will always be taken before solock()
> in the sosend() path.
> 
> The switch() block was split by two. SO_DONTROUTE, SO_SPLICE, SO_SND*
> and SO_RCV* cases do not require to call (*pr_ctloutput)(), so they were
> moved to the first switch() block solock() was pushed into each case
> individually. For SO_SND* and SO_RCV* cases solock() will be replaced by
> sblock() in the future. SO_RTABLE case calls (*pr_ctloutput)(), but do
> this in the special way, so it was placed to the first switch() block
> too.
> 
> The second switch() block contains the cases which require to call
> (*pr_ctloutput)(). solock() is taken around this block together with the
> (*pr_ctloutput)() call to keep atomicy.

I did not see where (*pr_ctloutput)() is actually required.  In
the else branch level == SOL_SOCKET, and all pr_ctloutput functions
I could find do nothing for the SOL_SOCKET case.

So I do not see the cases where calling (*pr_ctloutput)() is required
and where not.  What did I miss?

I run regress with this and a witness kernel.  No fallout.  These
witness warnings are always there when I run regress.

http://bluhm.genua.de/regress/results/2023-07-21T13%3A08%3A59Z/bsdcons-ot29.txt

[-- MARK -- Fri Jul 21 16:55:00 2023]
witness: lock order reversal:
 1st 0xfd88688efdc8 vmmaplk (>lock)
 2nd 0xfd870a1b35f0 inode (>i_lock)
lock order ">i_lock"(rrwlock) -> ">lock"(rwlock) first seen at:
#0  rw_enter_read+0x50
#1  uvmfault_lookup+0x8a
#2  uvm_fault_check+0x36
#3  uvm_fault+0xfb
#4  kpageflttrap+0x14d
#5  kerntrap+0x95
#6  alltraps_kern_meltdown+0x7b
#7  copyout+0x57
#8  ffs_read+0x1f6
#9  VOP_READ+0x45
#10 vn_rdwr+0xa5
#11 vmcmd_map_readvn+0xb5
#12 exec_process_vmcmds+0x98
#13 sys_execve+0x7ad
#14 start_init+0x26f
#15 proc_trampoline+0x1c
lock order ">lock"(rwlock) -> ">i_lock"(rrwlock) first seen at:
#0  rw_enter+0x71
#1  rrw_enter+0x57
#2  VOP_LOCK+0x5f
#3  vn_lock+0xbd
#4  vn_rdwr+0x83
#5  vndstrategy+0x2c7
#6  physio+0x237
#7  spec_read+0xac
#8  VOP_READ+0x45
#9  vn_read+0xaa
#10 dofilereadv+0x14a
#11 sys_read+0x55
#12 syscall+0x3d4
#13 Xsyscall+0x128

[-- MARK -- Fri Jul 21 17:05:00 2023]
witness: lock order reversal:
 1st 0xfd885cd0b8e0 vmmaplk (>lock)
 2nd 0x8000246587c0 nfsnode (>n_lock)
lock order data w2 -> w1 missing
lock order ">lock"(rwlock) -> ">n_lock"(rrwlock) first seen at:
#0  rw_enter+0x71
#1  rrw_enter+0x57
#2  VOP_LOCK+0x5f
#3  vn_lock+0xbd
#4  vn_rdwr+0x83
#5  vndstrategy+0x2c7
#6  physio+0x237
#7  spec_write+0x9a
#8  VOP_WRITE+0x45
#9  vn_write+0x105
#10 dofilewritev+0x151
#11 sys_pwrite+0x60
#12 syscall+0x3d4
#13 Xsyscall+0x128

> sys_setsockopt() is not the only sosetopt() caller. For such places
> the solock() could be just dropped around sosetopt() call. Please note,
> solock() protects only socket consistency so this doesn't brings any
> atomicy loss.
> 
> I want to receive feedback, polish the diff if required, and then I'll
> ask to test the final version with bulk builds and the snaps.
> 
> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 uipc_socket.c
> --- sys/kern/uipc_socket.c4 Jul 2023 22:28:24 -   1.305
> +++ sys/kern/uipc_socket.c12 Jul 2023 23:08:02 -
> @@ -1789,57 +1789,23 @@ sosetopt(struct socket *so, int level, i
>  {
>   int error = 0;
>  
> - soassertlocked(so);
> -
>   if (level != SOL_SOCKET) {
>   if (so->so_proto->pr_ctloutput) {
> + solock(so);
>   error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
>   level, optname, m);
> + sounlock(so);
>   return (error);
>   }
>   error = ENOPROTOOPT;
>   } else {
>   switch (optname) {
> - case SO_BINDANY:
> - if ((error = suser(curproc)) != 0)  /* XXX */
> - return (error);
> - break;
> - }
> -
> - switch (optname) {
> -
> - case SO_LINGER:
> - if (m == NULL || m->m_len != sizeof (struct linger) ||
> - mtod(m, struct linger *)->l_linger < 0 ||
> - mtod(m, struct linger *)->l_linger > SHRT_MAX)
> - return (EINVAL);
> - so->so_linger = mtod(m, struct linger *)->l_linger;
> - /* FALLTHROUGH */
> -
> - case SO_BINDANY:
> - case SO_DEBUG:
> -

Re: inetd echo localhost

2023-07-21 Thread Alexander Bluhm
On Fri, Jul 21, 2023 at 03:05:41PM +0200, Claudio Jeker wrote:
> On Fri, Jul 21, 2023 at 03:17:35PM +0300, Vitaliy Makkoveev wrote:
> > On Thu, Jul 20, 2023 at 09:57:00PM +0200, Alexander Bluhm wrote:
> > > Hi,
> > > 
> > > I wonder why UDP echo does not work with inetd on 127.0.0.1.
> > > 
> > > Note that it is default off.  One of my regress machines has it
> > > enabled for other tests.  There perl dist/Net-Ping/t/510_ping_udp.t
> > > expects that UDP echo works on 127.0.0.1.
> > > 
> > > It was disabled with this commit:
> > > 
> > > revision 1.65
> > > date: 2000/08/01 19:02:05;  author: itojun;  state: Exp;  lines: +47 -11;
> > > be more paranoid about UDP-based echo services validation.  namely,
> > > reject the following sources:
> > > 0.0.0.0/8 127.0.0.0/8 240.0.0.0/4 255.0.0.0/8
> > > ff00::/8 ::/128
> > > ::ffff:0.0.0.0/96 and ::0.0.0.0/96 obeys IPv4 rule.
> > > reserved port, or NFS port.
> > > hint from deraadt.
> > > 
> > > 
> > > Note that IPv6 echo to ::1 works fine.  Only IPv4 echo to 127.0.0.1
> > > is broken.
> > > 
> > > I cannot see the security reason for disabling 127/8.
> > > Loops are prevented by blocking privileged ports.
> > > Echo to a local interface address through loopback is still allowed.
> > > The kernel checks that 127/8 does not come from extern.
> > > 127.0.0.1 should be handled like ::1 .
> > > 
> > > The feature was introduced together with IPv6 mapped addresses.
> > > See cvs diff -r1.64 -r1.65 inetd.c
> > > There it made sense to be paranoid about the IPv4 compatibility part
> > > of the IPv6 address.  But this feature has been removed since decades.
> > > So it could be a left over.
> > > 
> > > Should we also disable ::1 IPv6?
> > > Or allow 127.0.0.1 only?
> > > Or remove the case 127 completely?
> > > 
> > 
> > It's better to have similar behaviour for both ipv4 and ipv6 cases. I
> > see no reason to disable localhost.
> 
> Now hold your horses. This was done because of RPC / NFS and especially
> portmap. Neither of these protocols work over IPv6 so there is no reason
> to block ::1.

But for these special ports we have this check in inetd (NFS_PORT is
2049, above IPPORT_RESERVED, hence the explicit test):

if (port < IPPORT_RESERVED || port == NFS_PORT)
goto bad;

To my surprise blocking 127/8 in kernel ip_input() on non-loopback
interfaces was added after it was blocked in inetd.


revision 1.62
date: 2001/03/03 01:00:19;  author: itojun;  state: Exp;  lines: +11 -1;
drop packets with 127.0.0.0/8 in header field, if the packet is from outside.
under RFC1122 sender rule 127.0.0.0/8 must not appear on the wire.
count incidents by ipstat.ips_badaddr.  sync with kame


Checking it in userland again looks unnecessary, especially as
userland does not know the interface and blocks unconditionally.

bluhm

> > > Index: usr.sbin/inetd/inetd.c
> > > ===
> > > RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/inetd/inetd.c,v
> > > retrieving revision 1.164
> > > diff -u -p -r1.164 inetd.c
> > > --- usr.sbin/inetd/inetd.c19 Apr 2023 12:58:16 -  1.164
> > > +++ usr.sbin/inetd/inetd.c20 Jul 2023 19:52:39 -
> > > @@ -444,7 +444,7 @@ dg_badinput(struct sockaddr *sa)
> > >   if (IN_MULTICAST(in.s_addr))
> > >   goto bad;
> > >   switch ((in.s_addr & 0xff000000) >> 24) {
> > > - case 0: case 127: case 255:
> > > + case 0: case 255:
> > >   goto bad;
> > >   }
> > >   if (dg_broadcast(&in))
> > > 
> > 
> 
> -- 
> :wq Claudio



Re: sobuf_print(): add `sb_state' output

2023-07-21 Thread Alexander Bluhm
On Fri, Jul 21, 2023 at 11:35:21AM +0300, Vitaliy Makkoveev wrote:
> It contains SS_CANTSENDMORE, SS_ISSENDING, SS_CANTRCVMORE and
> SS_RCVATMARK bits. Also do `sb_flags' output as hex, it contains flags
> too.

OK bluhm@

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.305
> diff -u -p -r1.305 uipc_socket.c
> --- sys/kern/uipc_socket.c4 Jul 2023 22:28:24 -   1.305
> +++ sys/kern/uipc_socket.c21 Jul 2023 08:30:05 -
> @@ -2366,7 +2366,8 @@ sobuf_print(struct sockbuf *sb,
>   (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
>   (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
>   (*pr)("\tsb_sel: ...\n");
> - (*pr)("\tsb_flags: %i\n", sb->sb_flags);
> + (*pr)("\tsb_flags: %04x\n", sb->sb_flags);
> + (*pr)("\tsb_state: %04x\n", sb->sb_state);
>   (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
>  }
>  



inetd echo localhost

2023-07-20 Thread Alexander Bluhm
Hi,

I wonder why UDP echo does not work with inetd on 127.0.0.1.

Note that it is default off.  One of my regress machines has it
enabled for other tests.  There perl dist/Net-Ping/t/510_ping_udp.t
expects that UDP echo works on 127.0.0.1.

It was disabled with this commit:

revision 1.65
date: 2000/08/01 19:02:05;  author: itojun;  state: Exp;  lines: +47 -11;
be more paranoid about UDP-based echo services validation.  namely,
reject the following sources:
0.0.0.0/8 127.0.0.0/8 240.0.0.0/4 255.0.0.0/8
ff00::/8 ::/128
::ffff:0.0.0.0/96 and ::0.0.0.0/96 obeys IPv4 rule.
reserved port, or NFS port.
hint from deraadt.


Note that IPv6 echo to ::1 works fine.  Only IPv4 echo to 127.0.0.1
is broken.

I cannot see the security reason for disabling 127/8.
Loops are prevented by blocking privileged ports.
Echo to a local interface address through loopback is still allowed.
The kernel checks that 127/8 does not come from extern.
127.0.0.1 should be handled like ::1 .

The feature was introduced together with IPv6 mapped addresses.
See cvs diff -r1.64 -r1.65 inetd.c
There it made sense to be paranoid about the IPv4 compatibility part
of the IPv6 address.  But this feature was removed decades ago.
So it could be a leftover.

Should we also disable ::1 IPv6?
Or allow 127.0.0.1 only?
Or remove the case 127 completely?

bluhm

Index: usr.sbin/inetd/inetd.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/inetd/inetd.c,v
retrieving revision 1.164
diff -u -p -r1.164 inetd.c
--- usr.sbin/inetd/inetd.c  19 Apr 2023 12:58:16 -  1.164
+++ usr.sbin/inetd/inetd.c  20 Jul 2023 19:52:39 -
@@ -444,7 +444,7 @@ dg_badinput(struct sockaddr *sa)
if (IN_MULTICAST(in.s_addr))
goto bad;
switch ((in.s_addr & 0xff000000) >> 24) {
-   case 0: case 127: case 255:
+   case 0: case 255:
goto bad;
}
if (dg_broadcast(&in))



loopback bpf tcpdump corruption

2023-07-20 Thread Alexander Bluhm
Hi,

When running tcpdump on loopback, I see a corrupted packet for each
valid packet.

root@v74:.../~# tcpdump -ni lo0 -X -vvv
tcpdump: listening on lo0, link-type LOOP
21:01:07.645880
  0000: e161 0000 ff01 0000 7f00 0001 7f00 0001  .a..............
  0010: 0800 00df 3a4f 0000 d257 38fa 0236 0129  ....:O...W8..6.)
  0020: b9ec 7ecc 329a e881 4e36 e269 9fe4 1744  ..~.2...N6.i...D
  0030: 1819 1a1b 1c1d 1e1f 2021 2223 2425 2627  ........ !"#$%&'
  0040: 2829 2a2b 2c2d 2e2f 3031 3233 3435 3637  ()*+,-./01234567

21:01:07.645886 127.0.0.1 > 127.0.0.1: icmp: echo request (id:3a4f seq:0) [icmp 
cksum ok] (ttl 255, id 57697, len 84, bad ip cksum 0! -> dc44)
  0000: 4500 0054 e161 0000 ff01 0000 7f00 0001  E..T.a..........
  0010: 7f00 0001 0800 00df 3a4f 0000 d257 38fa  ........:O...W8.
  0020: 0236 0129 b9ec 7ecc 329a e881 4e36 e269  .6.)..~.2...N6.i
  0030: 9fe4 1744 1819 1a1b 1c1d 1e1f 2021 2223  ...D........ !"#
  0040: 2425 2627 2829 2a2b 2c2d 2e2f 3031 3233  $%&'()*+,-./0123
  0050: 3435 3637                                4567

lo(4) used to dump to bpf only for output.  It seems that when
if_bpf_mtap() was introduced, this changed and lo(4) dumps an
additional truncated packet.  The default bpf_mtap_ether() is not
suitable for lo(4).

Solution is to install a dummy lo_bpf_mtap() to suppress bpf on
input.

ok?

bluhm

Index: net/if_loop.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
retrieving revision 1.96
diff -u -p -r1.96 if_loop.c
--- net/if_loop.c   18 Jul 2023 16:01:20 -  1.96
+++ net/if_loop.c   20 Jul 2023 19:06:39 -
@@ -146,6 +146,7 @@ voidlortrequest(struct ifnet *, int, st
 void   loinput(struct ifnet *, struct mbuf *);
 intlooutput(struct ifnet *,
struct mbuf *, struct sockaddr *, struct rtentry *);
+intlo_bpf_mtap(caddr_t, const struct mbuf *, u_int);
 
 intloop_clone_create(struct if_clone *, int);
 intloop_clone_destroy(struct ifnet *);
@@ -177,6 +178,7 @@ loop_clone_create(struct if_clone *ifc, 
IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
IFCAP_LRO | IFCAP_TSOv4 | IFCAP_TSOv6;
+   ifp->if_bpf_mtap = lo_bpf_mtap;
ifp->if_rtrequest = lortrequest;
ifp->if_ioctl = loioctl;
ifp->if_input = loinput;
@@ -228,6 +230,13 @@ loop_clone_destroy(struct ifnet *ifp)
 
if (rdomain)
rtable_l2set(rdomain, 0, 0);
+   return (0);
+}
+
+int
+lo_bpf_mtap(caddr_t if_bpf, const struct mbuf *m, u_int dir)
+{
+   /* loopback dumps on output, disable input bpf */
return (0);
 }
 



OpenBSD Errata: July 19, 2023 (ssh-agent)

2023-07-19 Thread Alexander Bluhm
Errata patches for ssh-agent have been released for
OpenBSD 7.2 and 7.3.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



Re: tcp lro by default, call for testing

2023-07-17 Thread Alexander Bluhm
On Mon, Jul 10, 2023 at 02:07:01PM +0200, Jan Klemkow wrote:
> On Sat, Jul 08, 2023 at 05:15:26PM +0300, Alexander Bluhm wrote:
> > I am not aware of any more limitations when enabling LRO for TCP
> > in the network drivers.  The feature allows to receive agregated
> > packets larger than the MTU.  Receiving TCP streams becomes much
> > faster.
> > 
> > As the network hardware is not aware whether a packet is received
> > locally or forwarded, everything is aggregated.  In case of forwarding
> > it is split on output to packets not larger than the original
> > packets.  So path MTU discovery should still work.  If the outgoing
> > interface supports TSO, the packet is chopped in hardware.
> > 
> > Currently only ix(4) and lo(4) support LRO, and ix(4) is limited
> > to IPv4 and newer than the old 82598 model.  If the interface is
> > added to a bridge(4) or aggr(4), LRO is automatically disabled.
> 
> I guess you mean veb(4) not aggr(4).  We just avoid the inheritance
> of the LRO capability in aggr(4) but are using the feature.
> 
> > So in case you possess any ix(4) hardware or do funky pf routing
> > on lo(4) please run this diff.  If you encounter problems, report
> > and turn LRO off per interface with ifconfig -tcplro.
> 
> Diff looks fine to me.  I would just keep mentioning the default
> behavior in the manpage like this:

I think in the future the default will depend on the driver.  When
adding the LRO feature for new hardware we start with default off.
When we are confident, we turn it on.  We are more flexible if we do
not mention it in the ifconfig(8) man page.

If there are problems with specific hardware, they belong in the
driver man page in section 4.

> ok jan@

No test reports arrived.  So either everything works fine or
nobody has tested it.  I will commit soon to find out.

bluhm

> Index: sbin/ifconfig/ifconfig.8
> ===
> RCS file: /cvs/src/sbin/ifconfig/ifconfig.8,v
> retrieving revision 1.397
> diff -u -p -r1.397 ifconfig.8
> --- sbin/ifconfig/ifconfig.8  7 Jun 2023 18:42:40 -   1.397
> +++ sbin/ifconfig/ifconfig.8  10 Jul 2023 11:54:47 -
> @@ -517,9 +517,9 @@ It is not possible to use LRO with inter
>  or
>  .Xr tpmr 4 .
>  Changing this option will re-initialize the network interface.
> +LRO is enabled by default.
>  .It Cm -tcplro
>  Disable LRO.
> -LRO is disabled by default.
>  .It Cm up
>  Mark an interface
>  .Dq up .
> 
> 
> > Index: sys/dev/pci/if_ix.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
> > retrieving revision 1.198
> > diff -u -p -r1.198 if_ix.c
> > --- sys/dev/pci/if_ix.c 8 Jul 2023 09:01:30 -   1.198
> > +++ sys/dev/pci/if_ix.c 8 Jul 2023 13:51:26 -
> > @@ -1925,8 +1925,10 @@ ixgbe_setup_interface(struct ix_softc *s
> > ifp->if_capabilities |= IFCAP_CSUM_IPv4;
> >  
> > ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
> > -   if (sc->hw.mac.type != ixgbe_mac_82598EB)
> > +   if (sc->hw.mac.type != ixgbe_mac_82598EB) {
> > +   ifp->if_xflags |= IFXF_LRO;
> > ifp->if_capabilities |= IFCAP_LRO;
> > +   }
> >  
> > /*
> >  * Specify the media types supported by this sc and register
> > Index: sys/net/if_loop.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
> > retrieving revision 1.95
> > diff -u -p -r1.95 if_loop.c
> > --- sys/net/if_loop.c   2 Jul 2023 19:59:15 -   1.95
> > +++ sys/net/if_loop.c   8 Jul 2023 13:51:26 -
> > @@ -172,11 +172,11 @@ loop_clone_create(struct if_clone *ifc, 
> > ifp->if_softc = NULL;
> > ifp->if_mtu = LOMTU;
> > ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
> > -   ifp->if_xflags = IFXF_CLONED;
> > +   ifp->if_xflags = IFXF_CLONED | IFXF_LRO;
> > ifp->if_capabilities = IFCAP_CSUM_IPv4 |
> > IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> > IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
> > -   IFCAP_LRO;
> > +   IFCAP_LRO | IFCAP_TSOv4 | IFCAP_TSOv6;
> > ifp->if_rtrequest = lortrequest;
> > ifp->if_ioctl = loioctl;
> > ifp->if_input = loinput;
> > Index: sbin/ifconfig/ifconfig.8
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v
> > retrieving revision 1.397
> > diff -u -p -r1.397 ifconfig.8
> > --- sbin/ifconfig/ifconfig.87 Jun 2023 18:42:40 -   1.397
> > +++ sbin/ifconfig/ifconfig.87 Jul 2023 19:57:09 -
> > @@ -519,7 +519,6 @@ or
> >  Changing this option will re-initialize the network interface.
> >  .It Cm -tcplro
> >  Disable LRO.
> > -LRO is disabled by default.
> >  .It Cm up
> >  Mark an interface
> >  .Dq up .
> > 



route leak nd6 detach

2023-07-09 Thread Alexander Bluhm
Hi,

While testing my ART reference counting fix, I discovered an rtentry
leak that is triggered by regress/sbin/route and detected with
btrace(8) refcnt.

The reference returned by rtalloc() must be freed with rtfree() in
all cases.
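
The pattern, as a sketch (rtfree(9) tolerates NULL, so the call can
be unconditional; the variable names here are generic):

	rt = rtalloc(dst, 0, rtableid);
	if (rt != NULL && rt->rt_ifidx == ifp->if_index)
		rtdeletemsg(rt, ifp, rtableid);
	rtfree(rt);	/* always drop the rtalloc() reference */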

ok?

bluhm

Index: netinet6/in6_ifattach.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_ifattach.c,v
retrieving revision 1.121
diff -u -p -r1.121 in6_ifattach.c
--- netinet6/in6_ifattach.c 15 Nov 2022 18:42:46 -  1.121
+++ netinet6/in6_ifattach.c 9 Jul 2023 11:20:39 -
@@ -440,10 +440,9 @@ in6_ifdetach(struct ifnet *ifp)
sin6.sin6_addr = in6addr_intfacelocal_allnodes;
sin6.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
rt = rtalloc(sin6tosa(&sin6), 0, ifp->if_rdomain);
-   if (rt && rt->rt_ifidx == ifp->if_index) {
+   if (rt && rt->rt_ifidx == ifp->if_index)
rtdeletemsg(rt, ifp, ifp->if_rdomain);
-   rtfree(rt);
-   }
+   rtfree(rt);
 
/* remove route to link-local allnodes multicast (ff02::1) */
bzero(&sin6, sizeof(sin6));
@@ -452,10 +451,9 @@ in6_ifdetach(struct ifnet *ifp)
sin6.sin6_addr = in6addr_linklocal_allnodes;
sin6.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
rt = rtalloc(sin6tosa(&sin6), 0, ifp->if_rdomain);
-   if (rt && rt->rt_ifidx == ifp->if_index) {
+   if (rt && rt->rt_ifidx == ifp->if_index)
rtdeletemsg(rt, ifp, ifp->if_rdomain);
-   rtfree(rt);
-   }
+   rtfree(rt);
 
if (ifp->if_xflags & (IFXF_AUTOCONF6 | IFXF_AUTOCONF6TEMP))
ifp->if_xflags &= ~(IFXF_AUTOCONF6 | IFXF_AUTOCONF6TEMP);



tcp lro by default, call for testing

2023-07-08 Thread Alexander Bluhm
Hi,

I am not aware of any more limitations when enabling LRO for TCP
in the network drivers.  The feature allows receiving aggregated
packets larger than the MTU.  Receiving TCP streams becomes much
faster.

As the network hardware is not aware whether a packet is received
locally or forwarded, everything is aggregated.  In case of forwarding
it is split on output to packets not larger than the original
packets.  So path MTU discovery should still work.  If the outgoing
interface supports TSO, the packet is chopped in hardware.

Currently only ix(4) and lo(4) support LRO, and ix(4) is limited
to IPv4 and newer than the old 82598 model.  If the interface is
added to a bridge(4) or aggr(4), LRO is automatically disabled.

So in case you possess any ix(4) hardware or do funky pf routing
on lo(4), please run this diff.  If you encounter problems, report them
and turn LRO off per interface with ifconfig -tcplro.

bluhm

Index: sys/dev/pci/if_ix.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.198
diff -u -p -r1.198 if_ix.c
--- sys/dev/pci/if_ix.c 8 Jul 2023 09:01:30 -   1.198
+++ sys/dev/pci/if_ix.c 8 Jul 2023 13:51:26 -
@@ -1925,8 +1925,10 @@ ixgbe_setup_interface(struct ix_softc *s
ifp->if_capabilities |= IFCAP_CSUM_IPv4;
 
ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
-   if (sc->hw.mac.type != ixgbe_mac_82598EB)
+   if (sc->hw.mac.type != ixgbe_mac_82598EB) {
+   ifp->if_xflags |= IFXF_LRO;
ifp->if_capabilities |= IFCAP_LRO;
+   }
 
/*
 * Specify the media types supported by this sc and register
Index: sys/net/if_loop.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
retrieving revision 1.95
diff -u -p -r1.95 if_loop.c
--- sys/net/if_loop.c   2 Jul 2023 19:59:15 -   1.95
+++ sys/net/if_loop.c   8 Jul 2023 13:51:26 -
@@ -172,11 +172,11 @@ loop_clone_create(struct if_clone *ifc, 
ifp->if_softc = NULL;
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
-   ifp->if_xflags = IFXF_CLONED;
+   ifp->if_xflags = IFXF_CLONED | IFXF_LRO;
ifp->if_capabilities = IFCAP_CSUM_IPv4 |
IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
-   IFCAP_LRO;
+   IFCAP_LRO | IFCAP_TSOv4 | IFCAP_TSOv6;
ifp->if_rtrequest = lortrequest;
ifp->if_ioctl = loioctl;
ifp->if_input = loinput;
Index: sbin/ifconfig/ifconfig.8
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.397
diff -u -p -r1.397 ifconfig.8
--- sbin/ifconfig/ifconfig.87 Jun 2023 18:42:40 -   1.397
+++ sbin/ifconfig/ifconfig.87 Jul 2023 19:57:09 -
@@ -519,7 +519,6 @@ or
 Changing this option will re-initialize the network interface.
 .It Cm -tcplro
 Disable LRO.
-LRO is disabled by default.
 .It Cm up
 Mark an interface
 .Dq up .



Re: Use u_long for struct mstat

2023-07-07 Thread Alexander Bluhm
On Fri, Jul 07, 2023 at 12:31:02PM +0300, YASUOKA Masahiko wrote:
> Hi,
> 
> I'd like to expand the counters in struct mbstat from u_short to u_long.
> 
> When I was debugging an mbuf leak, I saw the result of "netstat -m"
> ---
> 28647 mbufs in use:
> 28551 mbufs allocated to data
> 4 mbufs allocated to packet headers
> 92 mbufs allocated to socket names and addresses
> 159506/160736 mbuf 2048 byte clusters in use (current/peak)
> 0/30 mbuf 2112 byte clusters in use (current/peak)
> 0/24 mbuf 4096 byte clusters in use (current/peak)
> 0/24 mbuf 8192 byte clusters in use (current/peak)
> 0/0 mbuf 9216 byte clusters in use (current/peak)
> 0/0 mbuf 12288 byte clusters in use (current/peak)
> 0/16 mbuf 16384 byte clusters in use (current/peak)
> 0/0 mbuf 65536 byte clusters in use (current/peak)
> 360980/362484/2097152 Kbytes allocated to network (current/peak/max)
> 0 requests for memory denied
> 0 requests for memory delayed
> 0 calls to protocol drain routines
> ---
> 
> I couldn't figure out for a few days why mcl2k was leaked without
> leaking mbufs.  Actually it was shown in u_short (actual number % 65536).
> 
> 
> ok? comments?

OK bluhm@

> Index: sys/sys/mbuf.h
> ===
> RCS file: /disk/cvs/openbsd/src/sys/sys/mbuf.h,v
> retrieving revision 1.259
> diff -u -p -r1.259 mbuf.h
> --- sys/sys/mbuf.h4 Jul 2023 09:47:51 -   1.259
> +++ sys/sys/mbuf.h6 Jul 2023 14:36:15 -
> @@ -372,7 +372,7 @@ struct mbstat {
>   u_long  m_drops;/* times failed to find space */
>   u_long  m_wait; /* times waited for space */
>   u_long  m_drain;/* times drained protocols for space */
> - u_short m_mtypes[256];  /* type specific mbuf allocations */
> + u_long  m_mtypes[256];  /* type specific mbuf allocations */
>  };
>  
>  #define MBSTAT_TYPES   MT_NTYPES
> Index: sys/kern/kern_sysctl.c
> ===
> RCS file: /disk/cvs/openbsd/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.416
> diff -u -p -r1.416 kern_sysctl.c
> --- sys/kern/kern_sysctl.c2 Jul 2023 19:02:27 -   1.416
> +++ sys/kern/kern_sysctl.c6 Jul 2023 14:36:15 -
> @@ -515,20 +515,22 @@ kern_sysctl(int *name, u_int namelen, vo
>   case KERN_MBSTAT: {
>   extern struct cpumem *mbstat;
>   uint64_t counters[MBSTAT_COUNT];
> - struct mbstat mbs;
> + struct mbstat *mbs;
>   unsigned int i;
> + int ret;
>  
> - memset(&mbs, 0, sizeof(mbs));
> + mbs = malloc(sizeof(*mbs), M_TEMP, M_WAITOK | M_ZERO);
>   counters_read(mbstat, counters, MBSTAT_COUNT);
>   for (i = 0; i < MBSTAT_TYPES; i++)
> - mbs.m_mtypes[i] = counters[i];
> + mbs->m_mtypes[i] = counters[i];
>  
> - mbs.m_drops = counters[MBSTAT_DROPS];
> - mbs.m_wait = counters[MBSTAT_WAIT];
> - mbs.m_drain = counters[MBSTAT_DRAIN];
> + mbs->m_drops = counters[MBSTAT_DROPS];
> + mbs->m_wait = counters[MBSTAT_WAIT];
> + mbs->m_drain = counters[MBSTAT_DRAIN];
>  
> - return (sysctl_rdstruct(oldp, oldlenp, newp,
> - &mbs, sizeof(mbs)));
> + ret = sysctl_rdstruct(oldp, oldlenp, newp, mbs, sizeof(*mbs));
> + free(mbs, M_TEMP, sizeof(*mbs));
> + return (ret);
>   }
>   case KERN_MSGBUFSIZE:
>   case KERN_CONSBUFSIZE: {
> Index: usr.bin/netstat/mbuf.c
> ===
> RCS file: /disk/cvs/openbsd/src/usr.bin/netstat/mbuf.c,v
> retrieving revision 1.43
> diff -u -p -r1.43 mbuf.c
> --- usr.bin/netstat/mbuf.c16 Jul 2019 17:39:02 -  1.43
> +++ usr.bin/netstat/mbuf.c6 Jul 2023 14:36:15 -
> @@ -78,7 +78,7 @@ static struct mbtypes {
>   { 0, 0 }
>  };
>  
> -int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
> +int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(u_long);
>  bool seen[256];  /* "have we seen this type yet?" */
>  
>  /*
> @@ -172,7 +172,7 @@ mbpr(void)
>   for (mp = mbtypes; mp->mt_name; mp++)
>   if (mbstat.m_mtypes[mp->mt_type]) {
>   seen[mp->mt_type] = YES;
> - printf("\t%u mbuf%s allocated to %s\n",
> + printf("\t%lu mbuf%s allocated to %s\n",
>   mbstat.m_mtypes[mp->mt_type],
>   plural(mbstat.m_mtypes[mp->mt_type]),
>   mp->mt_name);
> @@ -180,7 +180,7 @@ mbpr(void)
>   seen[MT_FREE] = YES;
>   for (i = 0; i < nmbtypes; i++)
>   if (!seen[i] && mbstat.m_mtypes[i]) {
> - printf("\t%u mbuf%s allocated to \n",
> + printf("\t%lu mbuf%s allocated to \n",
>   

Re: tcp timer wrap around, use 64 bit

2023-07-07 Thread Alexander Bluhm
On Fri, Jul 07, 2023 at 12:07:36PM +0300, YASUOKA Masahiko wrote:
> Hi,
> 
> This netstat diff is already needed to avoid compiler warnings since
> it uses struct tcpcb directly.
> 
> ok?

OK bluhm@

> Index: usr.bin/netstat/inet.c
> ===
> RCS file: /cvs/src/usr.bin/netstat/inet.c,v
> retrieving revision 1.177
> diff -u -p -r1.177 inet.c
> --- usr.bin/netstat/inet.c2 Jul 2023 19:59:15 -   1.177
> +++ usr.bin/netstat/inet.c7 Jul 2023 09:01:04 -
> @@ -1556,8 +1556,8 @@ tcpcb_dump(u_long off)
>   p("%lu", snd_cwnd, ", ");
>   p("%lu", snd_ssthresh, ", ");
>   p("%lu", max_sndwnd, "\n ");
> - p("%u", t_rcvtime, ", ");
> - p("%u", t_rtttime, ", ");
> + p("%llu", t_rcvtime, ", ");
> + p("%llu", t_rtttime, ", ");
>   p("%u", t_rtseq, "\n ");
>   p("%u", t_srtt, ", ");
>   p("%u", t_rttvar, ", ");
> @@ -1570,7 +1570,7 @@ tcpcb_dump(u_long off)
>   p("%u", request_r_scale, ", ");
>   p("%u", requested_s_scale, "\n ");
>   p("%u", ts_recent, ", ");
> - p("%u", ts_recent_age, "\n ");
> + p("%llu", ts_recent_age, "\n ");
>   p("%u", last_ack_sent, "\n ");
>   HTONS(tcpcb.t_pmtud_ip_len);
>   HTONS(tcpcb.t_pmtud_nextmtu);



Re: tcp timer wrap around, use 64 bit

2023-07-07 Thread Alexander Bluhm
On Fri, Jul 07, 2023 at 10:43:21AM +0200, Claudio Jeker wrote:
> On Fri, Jul 07, 2023 at 11:38:58AM +0300, YASUOKA Masahiko wrote:
> > Hi,
> > 
> > Does using 64 bit for timer in tcpcb require this?
> 
> Not sure about this but one comment below.

When we introduced struct tcp_info, we discussed whether we should
make values 64 bit to be prepared for the future.  The conclusion
was that 32 bits are enough.  There are no other 64 bit fields in it.

As we want to keep the ABI stable, I would stay with 32.

Values are relative to tcp_now(), so they will overflow after 49 days.
The struct only informs userland; this rare incorrectness is
not relevant for me.

bluhm

> > Index: sys/netinet/tcp.h
> > ===
> > RCS file: /cvs/src/sys/netinet/tcp.h,v
> > retrieving revision 1.24
> > diff -u -p -r1.24 tcp.h
> > --- sys/netinet/tcp.h   19 May 2023 01:04:39 -  1.24
> > +++ sys/netinet/tcp.h   7 Jul 2023 08:33:26 -
> > @@ -194,9 +194,9 @@ struct tcp_info {
> > uint32_ttcpi_snd_wl2;
> > uint32_ttcpi_snd_max;
> > uint32_ttcpi_ts_recent;
> > -   uint32_ttcpi_ts_recent_age;
> > +   uint64_ttcpi_ts_recent_age;
> > uint32_ttcpi_rfbuf_cnt;
> > -   uint32_ttcpi_rfbuf_ts;
> > +   uint64_ttcpi_rfbuf_ts;
> > uint32_ttcpi_so_rcv_sb_cc;
> > uint32_ttcpi_so_rcv_sb_hiwat;
> > uint32_ttcpi_so_rcv_sb_lowat;
> > Index: usr.bin/tcpbench/tcpbench.c
> > ===
> > RCS file: /cvs/src/usr.bin/tcpbench/tcpbench.c,v
> > retrieving revision 1.69
> > diff -u -p -r1.69 tcpbench.c
> > --- usr.bin/tcpbench/tcpbench.c 22 May 2023 12:53:04 -  1.69
> > +++ usr.bin/tcpbench/tcpbench.c 7 Jul 2023 08:33:26 -
> > @@ -51,6 +51,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #define DEFAULT_PORT "12345"
> >  #define DEFAULT_STATS_INTERVAL 1000 /* ms */
> > @@ -411,7 +412,7 @@ tcp_stats_display(unsigned long long tot
> > P(tcpi, rcv_up, "%u")
> > P(tcpi, rcv_wscale, "%hhu")
> > P(tcpi, rfbuf_cnt, "%u")
> > -   P(tcpi, rfbuf_ts, "%u")
> > +   P(tcpi, rfbuf_ts, "%" PRIu64)
> 
> I don't think we need these ugly PRIu64 here. Just use %llu since in
> OpenBSD uint64_t is always an unsigned long long.
> 
> > P(tcpi, rtt, "%u")
> > P(tcpi, rttmin, "%u")
> > P(tcpi, rttvar, "%u")
> > @@ -436,7 +437,7 @@ tcp_stats_display(unsigned long long tot
> > P(tcpi, so_snd_sb_lowat, "%u")
> > P(tcpi, so_snd_sb_wat, "%u")
> > P(tcpi, ts_recent, "%u")
> > -   P(tcpi, ts_recent_age, "%u")
> > +   P(tcpi, ts_recent_age, "%" PRIu64)
> >  #undef S
> >  #undef P
> > }
> 
> -- 
> :wq Claudio



tso lo keep mss

2023-07-07 Thread Alexander Bluhm
Hi,

When we preserve M_TCP_TSO we also must keep ph_mss.  In lo(4)
LRO/TSO this logic was missing.  As this may be relevant only for
weird pf configs that forward from loopback, and ifconfig tcplro is
disabled by default, it is not relevant yet.

ok?

bluhm

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.705
diff -u -p -r1.705 if.c
--- net/if.c7 Jul 2023 08:05:02 -   1.705
+++ net/if.c7 Jul 2023 08:47:21 -
@@ -782,6 +782,7 @@ int
 if_input_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af)
 {
int keepflags, keepcksum;
+   uint16_t keepmss;
 
 #if NBPFILTER > 0
/*
@@ -807,9 +808,11 @@ if_input_local(struct ifnet *ifp, struct
keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT |
M_TCP_TSO);
+   keepmss = m->m_pkthdr.ph_mss;
m_resethdr(m);
m->m_flags |= M_LOOP | keepflags;
m->m_pkthdr.csum_flags = keepcksum;
+   m->m_pkthdr.ph_mss = keepmss;
m->m_pkthdr.ph_ifidx = ifp->if_index;
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
 



Re: tcp lro tso path mtu

2023-07-06 Thread Alexander Bluhm
On Thu, Jul 06, 2023 at 08:49:03PM +0200, Jan Klemkow wrote:
> > @@ -109,6 +109,9 @@
> >  #include 
> >  #include 
> >  #include 
> 
> I think this is a merge bug, isn't it?
> 
> > +#include 
> > +#include 
> > +#include 

Right.

> > +   error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
> > +   if (error || *mp == NULL)
> > +   return error;
> > +
> > +   if ((*mp)->m_pkthdr.len <= mtu) {
> 
> I may miss something but...
> 
> Couldn't you move the *_cksum_out() calls above the upper
> tcp_if_output_tso() call?  And then remove the *_cksum_out() calls
> inside of tcp_if_output_tso()?
> 
> Thus, there is just one place where we call them.
> 
> > +   switch (dst->sa_family) {
> > +   case AF_INET:
> > +   in_hdr_cksum_out(*mp, ifp);
> > +   in_proto_cksum_out(*mp, ifp);
> > +   break;
> > +#ifdef INET6
> > +   case AF_INET6:
> > +   in6_proto_cksum_out(*mp, ifp);
> > +   break;
> > +#endif

There is the case in tcp_if_output_tso() where we call tcp_chopper().
Then the checksum has to be calculated after chopping.  If I do it
always before tcp_if_output_tso(), we may calculate it twice.  Once
for the large packet and once for the small ones.

New diff without duplicate includes.

bluhm

Index: net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.704
diff -u -p -r1.704 if.c
--- net/if.c6 Jul 2023 04:55:04 -   1.704
+++ net/if.c6 Jul 2023 19:15:00 -
@@ -886,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m
 }
 
 int
+if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst,
+struct rtentry *rt, u_int mtu)
+{
+   uint32_t ifcap;
+   int error;
+
+   switch (dst->sa_family) {
+   case AF_INET:
+   ifcap = IFCAP_TSOv4;
+   break;
+#ifdef INET6
+   case AF_INET6:
+   ifcap = IFCAP_TSOv6;
+   break;
+#endif
+   default:
+   unhandled_af(dst->sa_family);
+   }
+
+   /*
+* Try to send with TSO first.  When forwarding LRO may set
+* maximium segment size in mbuf header.  Chop TCP segment
+* even if it would fit interface MTU to preserve maximum
+* path MTU.
+*/
+   error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
+   if (error || *mp == NULL)
+   return error;
+
+   if ((*mp)->m_pkthdr.len <= mtu) {
+   switch (dst->sa_family) {
+   case AF_INET:
+   in_hdr_cksum_out(*mp, ifp);
+   in_proto_cksum_out(*mp, ifp);
+   break;
+#ifdef INET6
+   case AF_INET6:
+   in6_proto_cksum_out(*mp, ifp);
+   break;
+#endif
+   }
+   error = ifp->if_output(ifp, *mp, dst, rt);
+   *mp = NULL;
+   return error;
+   }
+
+   /* mp still contains mbuf that has to be fragmented or dropped. */
+   return 0;
+}
+
+int
 if_output_mq(struct ifnet *ifp, struct mbuf_queue *mq, unsigned int *total,
 struct sockaddr *dst, struct rtentry *rt)
 {
Index: net/if_var.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
retrieving revision 1.128
diff -u -p -r1.128 if_var.h
--- net/if_var.h28 Jun 2023 11:49:49 -  1.128
+++ net/if_var.h6 Jul 2023 19:12:39 -
@@ -329,6 +329,8 @@ int if_output_ml(struct ifnet *, struct 
struct sockaddr *, struct rtentry *);
 intif_output_mq(struct ifnet *, struct mbuf_queue *, unsigned int *,
struct sockaddr *, struct rtentry *);
+intif_output_tso(struct ifnet *, struct mbuf **, struct sockaddr *,
+   struct rtentry *, u_int);
 intif_output_local(struct ifnet *, struct mbuf *, sa_family_t);
 void   if_rtrequest_dummy(struct ifnet *, int, struct rtentry *);
 void   p2p_rtrequest(struct ifnet *, int, struct rtentry *);
Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1182
diff -u -p -r1.1182 pf.c
--- net/pf.c6 Jul 2023 04:55:05 -   1.1182
+++ net/pf.c6 Jul 2023 19:12:39 -
@@ -6610,15 +6610,8 @@ pf_route(struct pf_pdesc *pd, struct pf_
ip = mtod(m0, struct ip *);
}
 
-   if (ntohs(ip->ip_len) <= ifp->if_mtu) {
-   in_hdr_cksum_out(m0, ifp);
-   in_proto_cksum_out(m0, ifp);
-   ifp->if_output(ifp, m0, sintosa(dst), rt);
-   goto done;
-   }
-
-   if (tcp_if_output_tso(ifp, &m0, sintosa(dst), rt,
-   IFCAP_TSOv4, ifp->if_mtu) || m0 == NULL)
+   if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) ||
+   m0 == NULL)
goto done;
 

Re: OpenBSD perl 5.36.1 - Call for Testing

2023-07-06 Thread Alexander Bluhm
On Tue, Jul 04, 2023 at 06:38:46PM +0300, Alexander Bluhm wrote:
> On Sat, May 20, 2023 at 06:59:14PM -0700, Andrew Hewus Fresh wrote:
> > Here's a patch to bring perl in base from 5.36.0 to 5.36.1, there are
> > only a few fixes in it and at least one of them I had previously
> > backported before importing 5.36.0.
> 
> I am running this on my amd64 laptop for a while without problems.
> I have commited a Makefile to regress that will allow me to run
> Perl tests on all architectures.
> 
> OK bluhm@ for Perl 5.36.1
> 
> Just give me one or two days to run 5.36.0 tests everywhere before
> commiting 5.36.1.  Then we can compare differences.

i386, amd64, powerpc64 have the same known failures.  My other
architectures are too slow.  I have a 1 hour limit per regress
test and Perl takes more time.  I have doubled it, let's see if that
helps.

I think you should commit 5.36.1, we will see fallout in the regress
tests.

After that we have only the syscall Perl diff floating around and
can concentrate on that.

bluhm



fix crash in routing table

2023-07-05 Thread Alexander Bluhm
Hi,

When forwarding packets in parallel while removing the corresponding
ARP entry in a loop like this
while :; do arp -nd 10.10.21.1; arp -nd 10.10.22.4; done
it was quite easy to crash the kernel.

The race is exposed by putting a delay() here:

Index: net/rtable.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c19 Apr 2023 17:42:47 -  1.82
+++ net/rtable.c5 Jul 2023 19:59:04 -
@@ -489,6 +489,8 @@ rtable_match(unsigned int rtableid, stru
if (an == NULL)
goto out;
 
+   delay(10);  /* triggers crash */
+
rt = SRPL_FIRST(&sr, &an->an_rtlist);
if (rt == NULL) {
SRPL_LEAVE(&sr);

Then the struct rtentry's rt_ifa is bogus and rt_refcnt
is 0.  So it looks like a refcounting bug.

While an_rtlist is protected by SRP, the route entries are not.  I
increased the refcount while the entry is held in the an_rtlist.  Then
the crash goes away.
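
For clarity, the intended ownership rule as a sketch (names as in
rtable.c, logic simplified): a list that stores the rtentry pointer
owns one reference of its own.

	rtref(rt);		/* reference owned by an_rtlist */
	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
	...
	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
	rtfree(rt);		/* the list's reference is gone */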

ok?

bluhm

Index: net/rtable.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.82
diff -u -p -r1.82 rtable.c
--- net/rtable.c19 Apr 2023 17:42:47 -  1.82
+++ net/rtable.c5 Jul 2023 20:05:26 -
@@ -604,6 +604,11 @@ rtable_insert(unsigned int rtableid, str
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
 
prev = art_insert(ar, an, addr, plen);
+   if (prev == an) {
+   rw_exit_write(&ar->ar_lock);
+   /* keep the refcount for rt while it is in an_rtlist */
+   return (0);
+   }
if (prev != an) {
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
@@ -689,9 +694,10 @@ rtable_delete(unsigned int rtableid, str
npaths++;
 
if (npaths > 1) {
-   KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+   KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
rt_next);
+   rtfree(rt);
 
mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
if (npaths == 2)
@@ -703,8 +709,9 @@ rtable_delete(unsigned int rtableid, str
if (art_delete(ar, an, addr, plen) == NULL)
panic("art_delete failed to find node %p", an);
 
-   KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
+   KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
+   rtfree(rt);
art_put(an);
 
 leave:
@@ -821,12 +828,11 @@ rtable_mpath_reprio(unsigned int rtablei
 */
rt->rt_priority = prio;
} else {
-   rtref(rt); /* keep rt alive in between remove and insert */
+   KASSERT(refcnt_read(&rt->rt_refcnt) >= 2);
SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
rt, rtentry, rt_next);
rt->rt_priority = prio;
rtable_mpath_insert(an, rt);
-   rtfree(rt);
error = EAGAIN;
}
rw_exit_write(&ar->ar_lock);
@@ -839,6 +845,9 @@ rtable_mpath_insert(struct art_node *an,
 {
struct rtentry  *mrt, *prt = NULL;
uint8_t  prio = rt->rt_priority;
+
+   /* increment the refcount for rt while it is in an_rtlist */
+   rtref(rt);
 
if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);



Re: Remove unnecessary NOWITNESS kludge

2023-07-05 Thread Alexander Bluhm
On Wed, Jul 05, 2023 at 03:41:31AM +, Visa Hankala wrote:
> Initialize stack-based mutexed using mtx_init(). This removes the need
> of the NOWITNESS kludge and lets the lock checker do its job with these
> mutexes.
> 
> At the moment, static initialization of locks inside functions does not
> work correctly with WITNESS. A lock initializer sets up a struct that
> gets permanently referenced by the lock checker. Inside a function,
> the static initializers put these structs on the stack, which causes
> trouble when the function returns. In principle, this might be solvable
> by using a compile-time expression that chooses the correct way of
> initialization based on the scope of usage.

OK bluhm@

> Index: dev/ic/mfi.c
> ===
> RCS file: src/sys/dev/ic/mfi.c,v
> retrieving revision 1.189
> diff -u -p -r1.189 mfi.c
> --- dev/ic/mfi.c  25 May 2023 19:35:58 -  1.189
> +++ dev/ic/mfi.c  5 Jul 2023 02:56:57 -
> @@ -925,8 +925,9 @@ mfi_poll(struct mfi_softc *sc, struct mf
>  void
>  mfi_exec(struct mfi_softc *sc, struct mfi_ccb *ccb)
>  {
> - struct mutex m = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
> - MTX_NOWITNESS);
> + struct mutex m;
> +
> + mtx_init(&m, IPL_BIO);
>  
>  #ifdef DIAGNOSTIC
>   if (ccb->ccb_cookie != NULL || ccb->ccb_done != NULL)
> Index: dev/ic/mpi.c
> ===
> RCS file: src/sys/dev/ic/mpi.c,v
> retrieving revision 1.225
> diff -u -p -r1.225 mpi.c
> --- dev/ic/mpi.c  25 May 2023 19:35:58 -  1.225
> +++ dev/ic/mpi.c  5 Jul 2023 02:56:57 -
> @@ -1263,10 +1263,11 @@ mpi_poll_done(struct mpi_ccb *ccb)
>  void
>  mpi_wait(struct mpi_softc *sc, struct mpi_ccb *ccb)
>  {
> - struct mutex cookie = MUTEX_INITIALIZER_FLAGS(
> - IPL_BIO, __MTX_NAME, MTX_NOWITNESS);
> + struct mutex cookie;
>   void(*done)(struct mpi_ccb *);
>  
> + mtx_init(&cookie, IPL_BIO);
> +
>   done = ccb->ccb_done;
>   ccb->ccb_done = mpi_wait_done;
>   ccb->ccb_cookie = &cookie;
> Index: dev/pci/mfii.c
> ===
> RCS file: src/sys/dev/pci/mfii.c,v
> retrieving revision 1.88
> diff -u -p -r1.88 mfii.c
> --- dev/pci/mfii.c25 May 2023 19:35:58 -  1.88
> +++ dev/pci/mfii.c5 Jul 2023 02:56:57 -
> @@ -1764,8 +1764,9 @@ mfii_poll_done(struct mfii_softc *sc, st
>  int
>  mfii_exec(struct mfii_softc *sc, struct mfii_ccb *ccb)
>  {
> - struct mutex m = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
> - MTX_NOWITNESS);
> + struct mutex m;
> +
> + mtx_init(&m, IPL_BIO);
>  
>  #ifdef DIAGNOSTIC
>   if (ccb->ccb_cookie != NULL || ccb->ccb_done != NULL)
> Index: dev/pci/mpii.c
> ===
> RCS file: src/sys/dev/pci/mpii.c,v
> retrieving revision 1.145
> diff -u -p -r1.145 mpii.c
> --- dev/pci/mpii.c25 May 2023 19:35:58 -  1.145
> +++ dev/pci/mpii.c5 Jul 2023 02:56:57 -
> @@ -2857,11 +2857,12 @@ mpii_init_queues(struct mpii_softc *sc)
>  void
>  mpii_wait(struct mpii_softc *sc, struct mpii_ccb *ccb)
>  {
> - struct mutex mtx = MUTEX_INITIALIZER_FLAGS(IPL_BIO,
> - __MTX_NAME, MTX_NOWITNESS);
> + struct mutex mtx;
>   void(*done)(struct mpii_ccb *);
>   void*cookie;
>  
> + mtx_init(&mtx, IPL_BIO);
> +
>   done = ccb->ccb_done;
>   cookie = ccb->ccb_cookie;
>  
> Index: scsi/scsi_base.c
> ===
> RCS file: src/sys/scsi/scsi_base.c,v
> retrieving revision 1.281
> diff -u -p -r1.281 scsi_base.c
> --- scsi/scsi_base.c  25 May 2023 19:35:58 -  1.281
> +++ scsi/scsi_base.c  5 Jul 2023 02:56:57 -
> @@ -1497,10 +1497,11 @@ scsi_done(struct scsi_xfer *xs)
>  int
>  scsi_xs_sync(struct scsi_xfer *xs)
>  {
> - struct mutex cookie = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
> - MTX_NOWITNESS);
> + struct mutex cookie;
>   int error;
>  
> + mtx_init(&cookie, IPL_BIO);
> +
>  #ifdef DIAGNOSTIC
>   if (xs->cookie != NULL)
>   panic("xs->cookie != NULL in scsi_xs_sync");



Re: cksum remove redundant code

2023-07-04 Thread Alexander Bluhm
anyone?

On Fri, May 26, 2023 at 06:44:25PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> in_ifcap_cksum() checks ifp == NULL
> in_hdr_cksum_out() sets ip_sum = 0
> in_proto_cksum_out() and in6_proto_cksum_out() always write
> th_sum if M_TCP_CSUM_OUT is set and proto is IPPROTO_TCP.
> 
> ok?
> 
> bluhm
> 
> Index: netinet/ip_output.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
> retrieving revision 1.388
> diff -u -p -r1.388 ip_output.c
> --- netinet/ip_output.c   22 May 2023 16:08:34 -  1.388
> +++ netinet/ip_output.c   26 May 2023 11:55:49 -
> @@ -1801,7 +1801,7 @@ in_hdr_cksum_out(struct mbuf *m, struct 
>   struct ip *ip = mtod(m, struct ip *);
>  
>   ip->ip_sum = 0;
> - if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
> + if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
>   SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
>   } else {
>   ipstat_inc(ips_outswcsum);
> Index: netinet/tcp_output.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
> retrieving revision 1.138
> diff -u -p -r1.138 tcp_output.c
> --- netinet/tcp_output.c  15 May 2023 16:34:56 -  1.138
> +++ netinet/tcp_output.c  26 May 2023 15:19:12 -
> @@ -1295,7 +1295,6 @@ tcp_chopper(struct mbuf *m0, struct mbuf
>  
>   /* copy and adjust IP header, calculate checksum */
>   SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> - mhth->th_sum = 0;
>   if (ip) {
>   struct ip *mhip;
>  
> @@ -1328,10 +1327,8 @@ tcp_chopper(struct mbuf *m0, struct mbuf
>   }
>   /* adjust IP header, calculate checksum */
>   SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
> - th->th_sum = 0;
>   if (ip) {
>   ip->ip_len = htons(m0->m_pkthdr.len);
> - ip->ip_sum = 0;
>   in_hdr_cksum_out(m0, ifp);
>   in_proto_cksum_out(m0, ifp);
>   }



tcp timer wrap around, use 64 bit

2023-07-04 Thread Alexander Bluhm
Hi,

After changing the tcp now tick to milliseconds, it will wrap around
after 49 days of uptime (2^32 ms is about 49.7 days).  That may be a
problem in some places of our stack.  Better use a 64 bit counter.

As the timestamp option is 32 bit in the TCP protocol, we have to use
the lower 32 bits there.  There are casts to 32 bits that should behave
correctly.  More eyes to review would be helpful.
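
For illustration, a minimal userland sketch (my example, not part of
the diff) of why the 32 bit casts work: timestamps are compared in
serial number arithmetic, so only the low 32 bits of the 64 bit clock
matter, as long as the compared values are less than 2^31 ms apart.

	#include <assert.h>
	#include <stdint.h>

	#define TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)

	int
	main(void)
	{
		uint64_t now = 0x100000000ULL + 5;	/* clock wrapped past 2^32 */
		uint32_t then = 0xfffffffbU;		/* 10 ms before the wrap */

		assert(TSTMP_LT(then, (uint32_t)now));	/* ordering survives */
		return 0;
	}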

As a bonus, start with a random 63 bit offset to avoid uptime leakage.
I am not aware that we leak it anywhere, but more randomness is always
good.  2^63 milliseconds gives us 2.9*10^8 years of possible uptime.

ok?

bluhm

Index: netinet/tcp_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.388
diff -u -p -r1.388 tcp_input.c
--- netinet/tcp_input.c 30 May 2023 19:32:57 -  1.388
+++ netinet/tcp_input.c 4 Jul 2023 08:49:47 -
@@ -130,8 +130,8 @@ struct timeval tcp_ackdrop_ppslim_last;
 #define TCP_PAWS_IDLE  TCP_TIME(24 * 24 * 60 * 60)
 
 /* for modulo comparisons of timestamps */
-#define TSTMP_LT(a,b)  ((int)((a)-(b)) < 0)
-#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
+#define TSTMP_LT(a,b)  ((int32_t)((a)-(b)) < 0)
+#define TSTMP_GEQ(a,b) ((int32_t)((a)-(b)) >= 0)
 
 /* for TCP SACK comparisons */
 #defineSEQ_MIN(a,b)(SEQ_LT(a,b) ? (a) : (b))
@@ -190,7 +190,7 @@ void tcp_newreno_partialack(struct tcpc
 
 voidsyn_cache_put(struct syn_cache *);
 voidsyn_cache_rm(struct syn_cache *);
-int syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t);
+int syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t);
 voidsyn_cache_timer(void *);
 voidsyn_cache_reaper(void *);
 voidsyn_cache_insert(struct syn_cache *, struct tcpcb *);
@@ -198,10 +198,10 @@ void   syn_cache_reset(struct sockaddr *,
struct tcphdr *, u_int);
 int syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
unsigned int, struct socket *, struct mbuf *, u_char *, int,
-   struct tcp_opt_info *, tcp_seq *, uint32_t);
+   struct tcp_opt_info *, tcp_seq *, uint64_t);
 struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
struct tcphdr *, unsigned int, unsigned int, struct socket *,
-   struct mbuf *, uint32_t);
+   struct mbuf *, uint64_t);
 struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
struct syn_cache_head **, u_int);
 
@@ -375,7 +375,7 @@ tcp_input(struct mbuf **mp, int *offp, i
short ostate;
caddr_t saveti;
tcp_seq iss, *reuse = NULL;
-   uint32_t now;
+   uint64_t now;
u_long tiwin;
struct tcp_opt_info opti;
struct tcphdr *th;
@@ -885,7 +885,7 @@ findpcb:
goto drop;
 
if (opti.ts_present && opti.ts_ecr) {
-   int rtt_test;
+   int32_t rtt_test;
 
/* subtract out the tcp timestamp modulator */
opti.ts_ecr -= tp->ts_modulate;
@@ -1272,7 +1272,7 @@ trimthenstep6:
TSTMP_LT(opti.ts_val, tp->ts_recent)) {
 
/* Check to see if ts_recent is over 24 days old.  */
-   if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
+   if (now - tp->ts_recent_age > TCP_PAWS_IDLE) {
/*
 * Invalidate ts_recent.  If this segment updates
 * ts_recent, the age will be reset later and ts_recent
@@ -2120,7 +2120,7 @@ drop:
 int
 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
 struct mbuf *m, int iphlen, struct tcp_opt_info *oi,
-u_int rtableid, uint32_t now)
+u_int rtableid, uint64_t now)
 {
u_int16_t mss = 0;
int opt, optlen;
@@ -2686,7 +2686,7 @@ tcp_pulloutofband(struct socket *so, u_i
  * and update averages and current timeout.
  */
 void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, int32_t rtt)
 {
int delta, rttmin;
 
@@ -3335,7 +3335,7 @@ void
 syn_cache_timer(void *arg)
 {
struct syn_cache *sc = arg;
-   uint32_t now;
+   uint64_t now;
 
NET_LOCK();
if (sc->sc_flags & SCF_DEAD)
@@ -3469,7 +3469,7 @@ syn_cache_lookup(struct sockaddr *src, s
  */
 struct socket *
 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
-u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now)
+u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now)
 {
struct syn_cache *sc;
struct syn_cache_head *scp;
@@ -3744,7 +3744,7 @@ syn_cache_unreach(struct sockaddr *src, 
 int
 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
-struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now)
+struct tcp_opt_info *oi, tcp_seq 

Re: Add ethernet type check in ifsetlro()

2023-07-04 Thread Alexander Bluhm
On Mon, Jul 03, 2023 at 11:12:17PM +0200, Jan Klemkow wrote:
> bluhm pointed out that the ether_brport_isset() check is only valid on
> ethernet devices.  Thus, I put an additional ethernet check in the
> condition.  This also fixes EBUSY errors of "ifconfig lo0 tcplro" calls
> in my setup.
> 
> ok?

OK bluhm@

> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.702
> diff -u -p -r1.702 if.c
> --- net/if.c  2 Jul 2023 19:59:15 -   1.702
> +++ net/if.c  3 Jul 2023 20:58:32 -
> @@ -3206,7 +3206,7 @@ ifsetlro(struct ifnet *ifp, int on)
>   KERNEL_ASSERT_LOCKED(); /* for if_flags */
>  
>   if (on && !ISSET(ifp->if_xflags, IFXF_LRO)) {
> - if (ether_brport_isset(ifp)) {
> + if (ifp->if_type == IFT_ETHER && ether_brport_isset(ifp)) {
>   error = EBUSY;
>   goto out;
>   }



tcp lro tso path mtu

2023-07-03 Thread Alexander Bluhm
Hi,

As the final step before making LRO (Large Receive Offload) the default,
we have to fix path MTU discovery when forwarding.

The drivers, currently ix(4) and lo(4) only, record an upper bound
of the size of the original packets in ph_mss.  When sending we
must chop the packets with TSO (TCP Segmentation Offload) to that
size.  That means we have to call tcp_if_output_tso() before
ifp->if_output().  I have put that logic into if_output_tso() to
avoid code duplication.
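
For illustration, the resulting call order in a caller like pf_route(),
as a sketch along the lines of the diff below:

	if (if_output_tso(ifp, &m0, dst, rt, ifp->if_mtu) || m0 == NULL)
		goto done;	/* consumed: sent via TSO or dropped */

	/*
	 * m0 is still set: the packet did not fit and TSO was not
	 * possible, so fragment it or send an ICMP error.
	 */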

ok?

bluhm

Index: net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.702
diff -u -p -r1.702 if.c
--- net/if.c2 Jul 2023 19:59:15 -   1.702
+++ net/if.c3 Jul 2023 10:28:30 -
@@ -109,6 +109,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #ifdef INET6
 #include 
@@ -883,6 +886,57 @@ if_output_ml(struct ifnet *ifp, struct m
ml_purge(ml);
 
return error;
+}
+
+int
+if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst,
+struct rtentry *rt, u_int mtu)
+{
+   uint32_t ifcap;
+   int error;
+
+   switch (dst->sa_family) {
+   case AF_INET:
+   ifcap = IFCAP_TSOv4;
+   break;
+#ifdef INET6
+   case AF_INET6:
+   ifcap = IFCAP_TSOv6;
+   break;
+#endif
+   default:
+   unhandled_af(dst->sa_family);
+   }
+
+   /*
+* Try to send with TSO first.  When forwarding LRO may set
+* maximium segment size in mbuf header.  Chop TCP segment
+* even if it would fit interface MTU to preserve maximum
+* path MTU.
+*/
+   error = tcp_if_output_tso(ifp, mp, dst, rt, ifcap, mtu);
+   if (error || *mp == NULL)
+   return error;
+
+   if ((*mp)->m_pkthdr.len <= mtu) {
+   switch (dst->sa_family) {
+   case AF_INET:
+   in_hdr_cksum_out(*mp, ifp);
+   in_proto_cksum_out(*mp, ifp);
+   break;
+#ifdef INET6
+   case AF_INET6:
+   in6_proto_cksum_out(*mp, ifp);
+   break;
+#endif
+   }
+   error = ifp->if_output(ifp, *mp, dst, rt);
+   *mp = NULL;
+   return error;
+   }
+
+   /* mp still contains mbuf that has to be fragmented or dropped. */
+   return 0;
 }
 
 int
Index: net/if_var.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
retrieving revision 1.128
diff -u -p -r1.128 if_var.h
--- net/if_var.h28 Jun 2023 11:49:49 -  1.128
+++ net/if_var.h3 Jul 2023 10:04:17 -
@@ -329,6 +329,8 @@ int if_output_ml(struct ifnet *, struct 
struct sockaddr *, struct rtentry *);
 intif_output_mq(struct ifnet *, struct mbuf_queue *, unsigned int *,
struct sockaddr *, struct rtentry *);
+intif_output_tso(struct ifnet *, struct mbuf **, struct sockaddr *,
+   struct rtentry *, u_int);
 intif_output_local(struct ifnet *, struct mbuf *, sa_family_t);
 void   if_rtrequest_dummy(struct ifnet *, int, struct rtentry *);
 void   p2p_rtrequest(struct ifnet *, int, struct rtentry *);
Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1181
diff -u -p -r1.1181 pf.c
--- net/pf.c5 Jun 2023 08:37:27 -   1.1181
+++ net/pf.c3 Jul 2023 10:04:17 -
@@ -6551,15 +6551,8 @@ pf_route(struct pf_pdesc *pd, struct pf_
ip = mtod(m0, struct ip *);
}
 
-   if (ntohs(ip->ip_len) <= ifp->if_mtu) {
-   in_hdr_cksum_out(m0, ifp);
-   in_proto_cksum_out(m0, ifp);
-   ifp->if_output(ifp, m0, sintosa(dst), rt);
-   goto done;
-   }
-
-   if (tcp_if_output_tso(ifp, &m0, sintosa(dst), rt,
-   IFCAP_TSOv4, ifp->if_mtu) || m0 == NULL)
+   if (if_output_tso(ifp, &m0, sintosa(dst), rt, ifp->if_mtu) ||
+   m0 == NULL)
goto done;
 
/*
@@ -6686,14 +6679,8 @@ pf_route6(struct pf_pdesc *pd, struct pf
goto done;
}
 
-   if (m0->m_pkthdr.len <= ifp->if_mtu) {
-   in6_proto_cksum_out(m0, ifp);
-   ifp->if_output(ifp, m0, sin6tosa(dst), rt);
-   goto done;
-   }
-
-   if (tcp_if_output_tso(ifp, &m0, sin6tosa(dst), rt,
-   IFCAP_TSOv6, ifp->if_mtu) || m0 == NULL)
+   if (if_output_tso(ifp, &m0, sin6tosa(dst), rt, ifp->if_mtu) ||
+   m0 == NULL)
goto done;
 
ip6stat_inc(ip6s_cantfrag);
Index: netinet/ip_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.388
diff -u -p -r1.388 ip_output.c
--- 

Re: bge(4) kstats

2023-07-03 Thread Alexander Bluhm
On Mon, Jul 03, 2023 at 04:48:36PM +1000, Jonathan Matthew wrote:
> This adds kstats for the hardware counters available in bge(4) devices, 
> BCM5705
> and newer.  The main complication is that some of the counters are already 
> used
> in bge_stats_update_regs() as part of a hardware bug workaround, some are 
> affected
> by hardware bugs themselves, and some are read to update interface counters.
> I decided to leave that as-is as much as possible.
> 
> The main changes to bge_stats_update_regs() are to always read the outgoing
> ucast/mcast/bcast packet counters (instead of just when we're working around 
> the
> RDMA bug) and to accumulate any counters read into the kstat buffer, so
> bge_kstat_read() doesn't have to touch them.  All the hardware counters reset 
> on
> read, so avoiding double handling keeps things simple.  This means
> bge_stats_update_regs() also has to be called with bge_kstat_mtx held, so to
> decrease the number of '#if NKSTAT > 0' the mutex is compiled in even in 
> kernels
> without kstat.
> 
> On a lightly used machine that sees a lot of multicast and broadcast due to 
> being
> near Windows desktops, the stats look like this:
> 
> ok?

Tested with BCM5720 and BCM5704.

OK bluhm@

> bge0:0:bge-stats:0
>   out octets: 738725 bytes
>   collisions: 0
> xon sent: 0
>xoff sent: 0
>  xmit errors: 0
>  coll frames: 0 packets
>  multicoll frame: 0 packets
>deferred xmit: 0
>  excess coll: 0
>late coll: 0
>   out ucast pkts: 1495 packets
>   out mcast pkts: 0 packets
>   out bcast pkts: 5 packets
>in octets: 10192782 bytes
>fragments: 0
>in ucast pkts: 1736 packets
>in mcast pkts: 27251 packets
>in bcast pkts: 42984 packets
>   FCS errors: 0
> align errors: 0
> xon rcvd: 0
>xoff rcvd: 0
>  ctrlframes rcvd: 0
> xoff entered: 0
>  too long frames: 0
>  jabbers: 0
>   too short pkts: 0
>  DMA RQ full: 0
>DMA HPRQ full: 0
>   SDC queue full: 0
> sendprod set: 0
>stats updated: 0
> irqs: 0
> avoided irqs: 0
>tx thresh hit: 0
> filtdrop: 0
> DMA WRQ full: 0
>   DMA HPWRQ full: 0
>   out of BDs: 10
>  if in drops: 0
> if in errors: 0
>rx thresh hit: 0
> 
> 
> 
> Index: if_bge.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_bge.c,v
> retrieving revision 1.400
> diff -u -p -u -p -r1.400 if_bge.c
> --- if_bge.c  18 Jan 2023 23:31:37 -  1.400
> +++ if_bge.c  3 Jul 2023 06:09:42 -
> @@ -74,6 +74,7 @@
>  
>  #include "bpfilter.h"
>  #include "vlan.h"
> +#include "kstat.h"
>  
>  #include 
>  #include 
> @@ -85,6 +86,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -203,6 +205,58 @@ void bge_ape_unlock(struct bge_softc *, 
>  void bge_ape_send_event(struct bge_softc *, uint32_t);
>  void bge_ape_driver_state_change(struct bge_softc *, int);
>  
> +#if NKSTAT > 0
> +void bge_kstat_attach(struct bge_softc *);
> +
> +enum {
> + bge_stat_out_octets = 0,
> + bge_stat_collisions,
> + bge_stat_xon_sent,
> + bge_stat_xoff_sent,
> + bge_stat_xmit_errors,
> + bge_stat_coll_frames,
> + bge_stat_multicoll_frames,
> + bge_stat_deferred_xmit,
> + bge_stat_excess_coll,
> + bge_stat_late_coll,
> + bge_stat_out_ucast_pkt,
> + bge_stat_out_mcast_pkt,
> + bge_stat_out_bcast_pkt,
> + bge_stat_in_octets,
> + bge_stat_fragments,
> + bge_stat_in_ucast_pkt,
> + bge_stat_in_mcast_pkt,
> + bge_stat_in_bcast_pkt,
> + bge_stat_fcs_errors,
> + bge_stat_align_errors,
> + bge_stat_xon_rcvd,
> + bge_stat_xoff_rcvd,
> + bge_stat_ctrl_frame_rcvd,
> + bge_stat_xoff_entered,
> + bge_stat_too_long_frames,
> + bge_stat_jabbers,
> + bge_stat_too_short_pkts,
> +
> + bge_stat_dma_rq_full,
> + bge_stat_dma_hprq_full,
> + bge_stat_sdc_queue_full,
> + bge_stat_nic_sendprod_set,
> + bge_stat_status_updated,
> + bge_stat_irqs,
> + bge_stat_avoided_irqs,
> + bge_stat_tx_thresh_hit,
> +
> + bge_stat_filtdrop,
> + bge_stat_dma_wrq_full,
> + bge_stat_dma_hpwrq_full,
> + bge_stat_out_of_bds,
> + bge_stat_if_in_drops,
> + bge_stat_if_in_errors,
> + bge_stat_rx_thresh_hit,
> +};
> +
> +#endif
> +
>  #ifdef BGE_DEBUG
>  #define DPRINTF(x)   do { if (bgedebug) printf x; } while (0)
>  #define DPRINTFN(n,x)do { if (bgedebug >= (n)) printf x; } while (0)
> @@ -2993,6 +3047,12 @@ bge_attach(struct device *parent, struct
>   else
>   sc->bge_return_ring_cnt = BGE_RETURN_RING_CNT_5705;
>  
> + mtx_init(>bge_kstat_mtx, IPL_SOFTCLOCK);
> +#if NKSTAT > 0
> + if (BGE_IS_5705_PLUS(sc))
> + bge_kstat_attach(sc);
> +#endif
> +
>   /* Set up ifnet structure */
>   ifp = >arpcom.ac_if;
>   ifp->if_softc = sc;
> @@ -3767,9 +3827,11 @@ bge_tick(void *xsc)
>  
>   

Re: lo(4) loopback LRO and TSO

2023-07-02 Thread Alexander Bluhm
anyone?

On Fri, Jun 23, 2023 at 06:06:16PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> Claudio@ mentioned the idea to use TSO and LRO on the loopback
> interface to transfer TCP faster.
> 
> I see a performance effect with this diff, but more importantly it
> gives us more test coverage.  Currently LRO on lo(4) is default
> off.
> 
> Future plan is:
> - Fix some corner cases for LRO/TSO with TCP path-MTU discovery
>   and IP forwarding when LRO is enabled.
> - Enable LRO/TSO for lo(4) and ix(4) per default.
> - Jan@ commits his ixl(4) TSO diff.
> 
> ok for lo(4) LRO/TSO with default off?
> 
> bluhm
> 
> Index: sys/net/if.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.700
> diff -u -p -r1.700 if.c
> --- sys/net/if.c  12 Jun 2023 21:19:54 -  1.700
> +++ sys/net/if.c  23 Jun 2023 15:48:27 -
> @@ -106,6 +106,9 @@
>  #ifdef MROUTING
>  #include 
>  #endif
> +#include 
> +#include 
> +#include 
>  
>  #ifdef INET6
>  #include 
> @@ -802,12 +805,29 @@ if_input_local(struct ifnet *ifp, struct
>* is now incorrect, will be calculated before sending.
>*/
>   keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> - M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT);
> + M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT |
> + M_TCP_TSO);
>   m_resethdr(m);
>   m->m_flags |= M_LOOP | keepflags;
>   m->m_pkthdr.csum_flags = keepcksum;
>   m->m_pkthdr.ph_ifidx = ifp->if_index;
>   m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
> +
> + if (ISSET(keepcksum, M_TCP_TSO) && m->m_pkthdr.len > ifp->if_mtu) {
> + if (ifp->if_mtu > 0 &&
> + ((af == AF_INET &&
> + ISSET(ifp->if_capabilities, IFCAP_TSOv4)) ||
> + (af == AF_INET6 &&
> + ISSET(ifp->if_capabilities, IFCAP_TSOv6)))) {
> + tcpstat_inc(tcps_inswlro);
> + tcpstat_add(tcps_inpktlro,
> + (m->m_pkthdr.len + ifp->if_mtu - 1) / ifp->if_mtu);
> + } else {
> + tcpstat_inc(tcps_inbadlro);
> + m_freem(m);
> + return (EPROTONOSUPPORT);
> + }
> + }
>  
>   if (ISSET(keepcksum, M_TCP_CSUM_OUT))
>   m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK;
> Index: sys/net/if_loop.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
> retrieving revision 1.94
> diff -u -p -r1.94 if_loop.c
> --- sys/net/if_loop.c 5 Jun 2023 11:35:46 -   1.94
> +++ sys/net/if_loop.c 23 Jun 2023 15:48:27 -
> @@ -175,7 +175,8 @@ loop_clone_create(struct if_clone *ifc, 
>   ifp->if_xflags = IFXF_CLONED;
>   ifp->if_capabilities = IFCAP_CSUM_IPv4 |
>   IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> - IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> + IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
> + IFCAP_LRO;
>   ifp->if_rtrequest = lortrequest;
>   ifp->if_ioctl = loioctl;
>   ifp->if_input = loinput;
> @@ -281,6 +282,10 @@ loioctl(struct ifnet *ifp, u_long cmd, c
>  
>   switch (cmd) {
>   case SIOCSIFFLAGS:
> + if (ISSET(ifp->if_xflags, IFXF_LRO))
> + SET(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
> + else
> + CLR(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
>   break;
>  
>   case SIOCSIFADDR:
> Index: sys/netinet/tcp_usrreq.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
> retrieving revision 1.219
> diff -u -p -r1.219 tcp_usrreq.c
> --- sys/netinet/tcp_usrreq.c  23 May 2023 09:16:16 -  1.219
> +++ sys/netinet/tcp_usrreq.c  23 Jun 2023 15:48:27 -
> @@ -1340,6 +1340,7 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
>   ASSIGN(tcps_outhwtso);
>   ASSIGN(tcps_outpkttso);
>   ASSIGN(tcps_outbadtso);
> + ASSIGN(tcps_inswlro);
>   ASSIGN(tcps_inhwlro);
>   ASSIGN(tcps_inpktlro);
>   ASSIGN(tcps_inbadlro);
> Index: sys/netinet/tcp_var.h
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
> retrieving revision 1.167
> diff -u -p -r1.167 tcp_var.h
> --- sys/netinet/tcp_var.h 23 May 2023 09:16:16 -  1.167

Re: Introduce M_IFGROUP type of memory allocation

2023-06-27 Thread Alexander Bluhm
On Tue, Jun 27, 2023 at 01:55:23PM +0200, Mark Kettenis wrote:
> > Date: Tue, 27 Jun 2023 11:09:32 +
> > From: Klemens Nanni 
> > 
> > On Tue, Jun 27, 2023 at 01:32:37PM +0300, Vitaliy Makkoveev wrote:
> > > M_TEMP seems unreasonable for interface groups data allocations.
> > 
> > After claudio pointed out the wrong type, I thought of the same name,
> > no other malloc(9) type fits.
> > 
> > FWIW OK kn, but please wait for other to chime in.
> 
> I don't think the interface groups are likely candidates for memory
> leaks so having a separate type for them seems to be overkill.  But

Leaks always turn up where you don't expect them.  Being able to
rule out whole areas is important when hunting them.

> indeed there is no other suitable type.  Maybe we can turn this into
> M_IFMISC in the future if we need another network interface related
> type.  But for now this is fine.

M_TEMP is bad, anything else is better.  I prefer fine-grained
counters, as they simplify bug hunting.

What is the preferred way, pool or malloc?  I thought malloc
is for dynamic sizes or historic code, but pool is for fixed-size
elements.  Does it make sense to create a pool for every fixed-size
struct?

Here we have struct ifg_list, ifg_group, and ifg_member.  Three
pools feel like overkill.  One malloc type for all of them looks
reasonable.

OK bluhm@
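
For illustration, the two idioms side by side (a minimal sketch with
an assumed pool name and flags, not part of the diff below).  malloc(9)
with a dedicated type feeds the per-type counters mentioned above;
pool(9) would need one pool per fixed-size struct:

	/* malloc(9): one M_IFGROUP type covers all three structs */
	ifgm = malloc(sizeof(*ifgm), M_IFGROUP, M_NOWAIT);
	if (ifgm == NULL)
		return (ENOMEM);
	/* the explicit size on free keeps the type accounting exact */
	free(ifgm, M_IFGROUP, sizeof(*ifgm));

	/* pool(9): fixed-size elements, but a separate pool per struct */
	struct pool ifgm_pool;

	pool_init(&ifgm_pool, sizeof(struct ifg_member), 0, IPL_SOFTNET,
	    0, "ifgmpl", NULL);
	ifgm = pool_get(&ifgm_pool, PR_NOWAIT);
	if (ifgm == NULL)
		return (ENOMEM);
	pool_put(&ifgm_pool, ifgm);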

> > > Don't forget to recompile systat(1) and vmstat(8) with new sys/malloc.h.
> > > 
> > > Index: sys/net/if.c
> > > ===
> > > RCS file: /cvs/src/sys/net/if.c,v
> > > retrieving revision 1.700
> > > diff -u -p -r1.700 if.c
> > > --- sys/net/if.c  12 Jun 2023 21:19:54 -  1.700
> > > +++ sys/net/if.c  27 Jun 2023 10:15:12 -
> > > @@ -2784,7 +2784,7 @@ if_creategroup(const char *groupname)
> > >  {
> > >   struct ifg_group*ifg;
> > >  
> > > - if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL)
> > > + if ((ifg = malloc(sizeof(*ifg), M_IFGROUP, M_NOWAIT)) == NULL)
> > >   return (NULL);
> > >  
> > >   strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
> > > @@ -2819,11 +2819,11 @@ if_addgroup(struct ifnet *ifp, const cha
> > >   if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
> > >   return (EEXIST);
> > >  
> > > - if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
> > > + if ((ifgl = malloc(sizeof(*ifgl), M_IFGROUP, M_NOWAIT)) == NULL)
> > >   return (ENOMEM);
> > >  
> > > - if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
> > > - free(ifgl, M_TEMP, sizeof(*ifgl));
> > > + if ((ifgm = malloc(sizeof(*ifgm), M_IFGROUP, M_NOWAIT)) == NULL) {
> > > + free(ifgl, M_IFGROUP, sizeof(*ifgl));
> > >   return (ENOMEM);
> > >   }
> > >  
> > > @@ -2834,8 +2834,8 @@ if_addgroup(struct ifnet *ifp, const cha
> > >   if (ifg == NULL) {
> > >   ifg = if_creategroup(groupname);
> > >   if (ifg == NULL) {
> > > - free(ifgl, M_TEMP, sizeof(*ifgl));
> > > - free(ifgm, M_TEMP, sizeof(*ifgm));
> > > + free(ifgl, M_IFGROUP, sizeof(*ifgl));
> > > + free(ifgm, M_IFGROUP, sizeof(*ifgm));
> > >   return (ENOMEM);
> > >   }
> > >   } else
> > > @@ -2878,7 +2878,7 @@ if_delgroup(struct ifnet *ifp, const cha
> > >  
> > >   if (ifgm != NULL) {
> > >   TAILQ_REMOVE(>ifgl_group->ifg_members, ifgm, ifgm_next);
> > > - free(ifgm, M_TEMP, sizeof(*ifgm));
> > > + free(ifgm, M_IFGROUP, sizeof(*ifgm));
> > >   }
> > >  
> > >  #if NPF > 0
> > > @@ -2891,10 +2891,10 @@ if_delgroup(struct ifnet *ifp, const cha
> > >  #if NPF > 0
> > >   pfi_detach_ifgroup(ifgl->ifgl_group);
> > >  #endif
> > > - free(ifgl->ifgl_group, M_TEMP, sizeof(*ifgl->ifgl_group));
> > > + free(ifgl->ifgl_group, M_IFGROUP, sizeof(*ifgl->ifgl_group));
> > >   }
> > >  
> > > - free(ifgl, M_TEMP, sizeof(*ifgl));
> > > + free(ifgl, M_IFGROUP, sizeof(*ifgl));
> > >  
> > >   return (0);
> > >  }
> > > Index: sys/sys/malloc.h
> > > ===
> > > RCS file: /cvs/src/sys/sys/malloc.h,v
> > > retrieving revision 1.122
> > > diff -u -p -r1.122 malloc.h
> > > --- sys/sys/malloc.h  3 Feb 2022 17:18:22 -   1.122
> > > +++ sys/sys/malloc.h  27 Jun 2023 10:15:13 -
> > > @@ -72,7 +72,7 @@
> > >  /* 7 - free */
> > >  /* 8 - free */
> > >  #define  M_IFADDR9   /* interface address */
> > > -/* 10 - free */
> > > +#define M_IFGROUP10  /* interface group */
> > >  #define  M_SYSCTL11  /* sysctl buffers (persistent storage) 
> > > */
> > >  #define  M_COUNTERS  12  /* per CPU counters */
> > >  /* 13 - free */
> > > @@ -190,7 +190,7 @@
> > >   NULL, \
> > >   NULL, \
> > >   "ifaddr",   /* 9 M_IFADDR */ \
> > > - NULL, \
> > > + "ifgroup",  /* 10 M_IFGROUP */ \
> > >   "sysctl",   /* 11 

Re: inpcb sip hash mutex contention

2023-06-26 Thread Alexander Bluhm
On Sat, Jun 24, 2023 at 11:20:50AM +1000, David Gwynne wrote:
> maybe it's time to re-evaluate siphash?

In one of our products we have replaced SipHash with xxHash.

https://xxhash.com/
https://github.com/Cyan4973/xxHash

bluhm



lo(4) loopback LRO and TSO

2023-06-23 Thread Alexander Bluhm
Hi,

Claudio@ mentioned the idea to use TSO and LRO on the loopback
interface to transfer TCP faster.

I see a performance effect with this diff, but more importantly it
gives us more test coverage.  Currently LRO on lo(4) is default
off.
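
Assuming the tcplro ifconfig keyword from the rename thread also
applies to lo(4) once this diff is in, testing would look like:

  # ifconfig lo0 tcplro
  # sysctl net.inet.tcp.tso=1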

Future plan is:
- Fix some corner cases for LRO/TSO with TCP path-MTU discovery
  and IP forwarding when LRO is enabled.
- Enable LRO/TSO for lo(4) and ix(4) per default.
- Jan@ commits his ixl(4) TSO diff.

ok for lo(4) LRO/TSO with default off?

bluhm

Index: sys/net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.700
diff -u -p -r1.700 if.c
--- sys/net/if.c12 Jun 2023 21:19:54 -  1.700
+++ sys/net/if.c23 Jun 2023 15:48:27 -
@@ -106,6 +106,9 @@
 #ifdef MROUTING
 #include 
 #endif
+#include 
+#include 
+#include 
 
 #ifdef INET6
 #include 
@@ -802,12 +805,29 @@ if_input_local(struct ifnet *ifp, struct
 * is now incorrect, will be calculated before sending.
 */
keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
-   M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT);
+   M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT |
+   M_TCP_TSO);
m_resethdr(m);
m->m_flags |= M_LOOP | keepflags;
m->m_pkthdr.csum_flags = keepcksum;
m->m_pkthdr.ph_ifidx = ifp->if_index;
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
+
+   if (ISSET(keepcksum, M_TCP_TSO) && m->m_pkthdr.len > ifp->if_mtu) {
+   if (ifp->if_mtu > 0 &&
+   ((af == AF_INET &&
+   ISSET(ifp->if_capabilities, IFCAP_TSOv4)) ||
+   (af == AF_INET6 &&
+	    ISSET(ifp->if_capabilities, IFCAP_TSOv6)))) {
+   tcpstat_inc(tcps_inswlro);
+   tcpstat_add(tcps_inpktlro,
+   (m->m_pkthdr.len + ifp->if_mtu - 1) / ifp->if_mtu);
+   } else {
+   tcpstat_inc(tcps_inbadlro);
+   m_freem(m);
+   return (EPROTONOSUPPORT);
+   }
+   }
 
if (ISSET(keepcksum, M_TCP_CSUM_OUT))
m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK;
Index: sys/net/if_loop.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
retrieving revision 1.94
diff -u -p -r1.94 if_loop.c
--- sys/net/if_loop.c   5 Jun 2023 11:35:46 -   1.94
+++ sys/net/if_loop.c   23 Jun 2023 15:48:27 -
@@ -175,7 +175,8 @@ loop_clone_create(struct if_clone *ifc, 
ifp->if_xflags = IFXF_CLONED;
ifp->if_capabilities = IFCAP_CSUM_IPv4 |
IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
-   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6 |
+   IFCAP_LRO;
ifp->if_rtrequest = lortrequest;
ifp->if_ioctl = loioctl;
ifp->if_input = loinput;
@@ -281,6 +282,10 @@ loioctl(struct ifnet *ifp, u_long cmd, c
 
switch (cmd) {
case SIOCSIFFLAGS:
+   if (ISSET(ifp->if_xflags, IFXF_LRO))
+   SET(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
+   else
+   CLR(ifp->if_capabilities, IFCAP_TSOv4 | IFCAP_TSOv6);
break;
 
case SIOCSIFADDR:
Index: sys/netinet/tcp_usrreq.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.219
diff -u -p -r1.219 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c23 May 2023 09:16:16 -  1.219
+++ sys/netinet/tcp_usrreq.c23 Jun 2023 15:48:27 -
@@ -1340,6 +1340,7 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
ASSIGN(tcps_outhwtso);
ASSIGN(tcps_outpkttso);
ASSIGN(tcps_outbadtso);
+   ASSIGN(tcps_inswlro);
ASSIGN(tcps_inhwlro);
ASSIGN(tcps_inpktlro);
ASSIGN(tcps_inbadlro);
Index: sys/netinet/tcp_var.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.167
diff -u -p -r1.167 tcp_var.h
--- sys/netinet/tcp_var.h   23 May 2023 09:16:16 -  1.167
+++ sys/netinet/tcp_var.h   23 Jun 2023 15:48:27 -
@@ -447,6 +447,7 @@ struct  tcpstat {
u_int32_t tcps_outhwtso;/* output tso processed by hardware */
u_int32_t tcps_outpkttso;   /* packets generated by tso */
u_int32_t tcps_outbadtso;   /* output tso failed, packet dropped */
+   u_int32_t tcps_inswlro; /* input lro on pseudo device */
u_int32_t tcps_inhwlro; /* input lro from hardware */
u_int32_t tcps_inpktlro;/* packets coalesced by hardware lro */
u_int32_t tcps_inbadlro;/* input bad lro packets */
@@ -628,6 +629,7 @@ enum 

inpcb sip hash mutex contention

2023-06-22 Thread Alexander Bluhm
Hi,

I am working on a diff to run UDP input in parallel.  Btrace kstack
analysis shows that SipHash for the PCB lookup is quite expensive.
When running in parallel we get lock contention on the PCB table
mutex.

So it is faster to calculate the hash value before taking the
mutex.  The code gets a bit more complicated as I have to pass the
value around.

The hash secret has to be constant as hash calculation must not
depend on values protected by the table mutex.  I see no security
benefit in reseeding when the hash table gets resized.

Analysis also shows that asserting a rw_lock while holding a
mutex is a bit expensive.  Just remove the netlock assert.

ok?

bluhm
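
The pattern, reduced to its core (a sketch using the names from the
diff below):

	struct inpcbhead *head;
	uint64_t hash;

	/* the expensive SipHash runs without the table mutex held */
	hash = in_pcbhash(table, rdomain, &faddr, fport, &laddr, lport);

	mtx_enter(&table->inpt_mtx);
	head = &table->inpt_hashtbl[hash & table->inpt_mask];
	/* ... walk the hash chain under the mutex ... */
	mtx_leave(&table->inpt_mtx);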

Index: netinet/in_pcb.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.276
diff -u -p -r1.276 in_pcb.c
--- netinet/in_pcb.c3 Oct 2022 16:43:52 -   1.276
+++ netinet/in_pcb.c22 Jun 2023 06:26:14 -
@@ -121,15 +121,15 @@ struct baddynamicports rootonlyports;
 struct pool inpcb_pool;
 
 void   in_pcbhash_insert(struct inpcb *);
-struct inpcb *in_pcbhash_lookup(struct inpcbtable *, u_int,
+struct inpcb *in_pcbhash_lookup(struct inpcbtable *, uint64_t, u_int,
 const struct in_addr *, u_short, const struct in_addr *, u_short);
 intin_pcbresize(struct inpcbtable *, int);
 
 #defineINPCBHASH_LOADFACTOR(_x)(((_x) * 3) / 4)
 
-struct inpcbhead *in_pcbhash(struct inpcbtable *, u_int,
+uint64_t in_pcbhash(struct inpcbtable *, u_int,
 const struct in_addr *, u_short, const struct in_addr *, u_short);
-struct inpcbhead *in_pcblhash(struct inpcbtable *, u_int, u_short);
+uint64_t in_pcblhash(struct inpcbtable *, u_int, u_short);
 
 /*
  * in_pcb is used for inet and inet6.  in6_pcb only contains special
@@ -142,7 +142,7 @@ in_init(void)
IPL_SOFTNET, 0, "inpcb", NULL);
 }
 
-struct inpcbhead *
+uint64_t
 in_pcbhash(struct inpcbtable *table, u_int rdomain,
 const struct in_addr *faddr, u_short fport,
 const struct in_addr *laddr, u_short lport)
@@ -156,11 +156,10 @@ in_pcbhash(struct inpcbtable *table, u_i
	SipHash24_Update(&ctx, &fport, sizeof(fport));
	SipHash24_Update(&ctx, laddr, sizeof(*laddr));
	SipHash24_Update(&ctx, &lport, sizeof(lport));
-
-	return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]);
+	return SipHash24_End(&ctx);
 }
 
-struct inpcbhead *
+uint64_t
 in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport)
 {
SIPHASH_CTX ctx;
@@ -169,8 +168,7 @@ in_pcblhash(struct inpcbtable *table, u_
	SipHash24_Init(&ctx, &table->inpt_lkey);
	SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
	SipHash24_Update(&ctx, &lport, sizeof(lport));
-
-	return (&table->inpt_lhashtbl[SipHash24_End(&ctx) & table->inpt_lmask]);
+	return SipHash24_End(&ctx);
 }
 
 void
@@ -816,11 +814,14 @@ in_pcblookup_local(struct inpcbtable *ta
struct in6_addr *laddr6 = (struct in6_addr *)laddrp;
 #endif
struct inpcbhead *head;
+   uint64_t lhash;
u_int rdomain;
 
rdomain = rtable_l2(rtable);
+   lhash = in_pcblhash(table, rdomain, lport);
+
	mtx_enter(&table->inpt_mtx);
-   head = in_pcblhash(table, rdomain, lport);
+	head = &table->inpt_lhashtbl[lhash & table->inpt_lmask];
LIST_FOREACH(inp, head, inp_lhash) {
if (rtable_l2(inp->inp_rtableid) != rdomain)
continue;
@@ -1056,37 +1057,38 @@ in_pcbhash_insert(struct inpcb *inp)
 {
struct inpcbtable *table = inp->inp_table;
struct inpcbhead *head;
+   uint64_t hash, lhash;
 
-   NET_ASSERT_LOCKED();
	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
 
-   head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport);
+   lhash = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport);
+	head = &table->inpt_lhashtbl[lhash & table->inpt_lmask];
LIST_INSERT_HEAD(head, inp, inp_lhash);
 #ifdef INET6
if (inp->inp_flags & INP_IPV6)
-   head = in6_pcbhash(table, rtable_l2(inp->inp_rtableid),
+   hash = in6_pcbhash(table, rtable_l2(inp->inp_rtableid),
>inp_faddr6, inp->inp_fport,
>inp_laddr6, inp->inp_lport);
else
 #endif /* INET6 */
-   head = in_pcbhash(table, rtable_l2(inp->inp_rtableid),
+   hash = in_pcbhash(table, rtable_l2(inp->inp_rtableid),
	    &inp->inp_faddr, inp->inp_fport,
	    &inp->inp_laddr, inp->inp_lport);
+	head = &table->inpt_hashtbl[hash & table->inpt_mask];
LIST_INSERT_HEAD(head, inp, inp_hash);
 }
 
 struct inpcb *
-in_pcbhash_lookup(struct inpcbtable *table, u_int rdomain,
+in_pcbhash_lookup(struct inpcbtable *table, uint64_t hash, u_int rdomain,
 const struct in_addr *faddr, u_short fport,
 const struct in_addr *laddr, u_short lport)
 {
struct inpcbhead *head;
struct inpcb *inp;
 
-   NET_ASSERT_LOCKED();
	MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
 
-   

Re: tso ip6 forward

2023-06-16 Thread Alexander Bluhm
On Mon, Jun 12, 2023 at 03:46:28PM +0200, Alexander Bluhm wrote:
> I found a little inconsistency in IPv6 forwarding with TSO.
> 
> Sending with TSO should only be done if the large packet does not fit
> in the interface MTU.  In case tcp_if_output_tso() does not process
> the packet, we should send an ICMP6 error.  Rearrange the code so that
> it looks more like other calls to tcp_if_output_tso().
> 
> All these cases can only be reached when LRO is turned on for IPv6
> which none of our drivers currently supports.

jan@ pointed out that reordering TSO in ip6 forward breaks path MTU
discovery.  So let's only fix the forward counters, icmp6 packet too
big and icmp6 redirect.

First try to send with TSO.  The goto senderr handles icmp6 redirect
and other errors.

If TSO is not necessary and the interface MTU fits, just send the
packet.  Again goto senderr handles icmp6.

Finally, handle icmp6 packet too big.
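
In outline, the resulting control flow (a sketch derived from the diff
below):

	tcp_if_output_tso()  -> packet sent or consumed; errors goto senderr
	len <= ifp->if_mtu   -> ifp->if_output(); errors and redirects
	                        handled via goto senderr
	otherwise            -> icmp6_error(ICMP6_PACKET_TOO_BIG) on the
	                        copy, m_freem(m)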

ok?

bluhm

Index: netinet6/ip6_forward.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
retrieving revision 1.110
diff -u -p -r1.110 ip6_forward.c
--- netinet6/ip6_forward.c  1 Jun 2023 09:05:33 -   1.110
+++ netinet6/ip6_forward.c  16 Jun 2023 08:55:43 -
@@ -321,35 +321,30 @@ reroute:
 
	error = tcp_if_output_tso(ifp, &m, sin6tosa(sin6), rt, IFCAP_TSOv6,
ifp->if_mtu);
+   if (error)
+   ip6stat_inc(ip6s_cantforward);
+   else if (m == NULL)
+   ip6stat_inc(ip6s_forward);
if (error || m == NULL)
-   goto freecopy;
+   goto senderr;
 
/* Check the size after pf_test to give pf a chance to refragment. */
-   if (m->m_pkthdr.len > ifp->if_mtu) {
-   if (mcopy)
-   icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0,
-   ifp->if_mtu);
-   m_freem(m);
-   goto out;
+   if (m->m_pkthdr.len <= ifp->if_mtu) {
+   in6_proto_cksum_out(m, ifp);
+   error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
+   if (error)
+   ip6stat_inc(ip6s_cantforward);
+   else
+   ip6stat_inc(ip6s_forward);
+   goto senderr;
}
 
-   in6_proto_cksum_out(m, ifp);
-   error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
-   if (error) {
-   ip6stat_inc(ip6s_cantforward);
-   } else {
-   ip6stat_inc(ip6s_forward);
-   if (type)
-   ip6stat_inc(ip6s_redirectsent);
-   else {
-   if (mcopy)
-   goto freecopy;
-   }
-   }
+   if (mcopy != NULL)
+   icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+   m_freem(m);
+   goto out;
 
-#if NPF > 0 || defined(IPSEC)
 senderr:
-#endif
if (mcopy == NULL)
goto out;
 
@@ -357,6 +352,7 @@ senderr:
case 0:
if (type == ND_REDIRECT) {
icmp6_redirect_output(mcopy, rt);
+   ip6stat_inc(ip6s_redirectsent);
goto out;
}
goto freecopy;



OpenBSD Errata: June 15, 2023 (libx11)

2023-06-15 Thread Alexander Bluhm
Errata patches for libX11 CVE-2023-3138 have been released for
OpenBSD 7.2 and 7.3.

Binary updates for the amd64, i386 and arm64 platforms are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



Re: msdosfs: fixes for Undefined Behavior

2023-06-15 Thread Alexander Bluhm
On Thu, Jun 15, 2023 at 11:36:21AM +0200, Stefan Fritsch wrote:
> >From FreeBSD commits
> 
>   commit c0db7289c3de290d821311942d5533f2284af77f
>   Author: pfg 
>   Date:   Wed Aug 8 15:08:22 2018 +
> 
>   msdosfs: fixes for Undefined Behavior.
> 
> and
> 
>   commit 852150953b828e4e8c32789637061001158a8cf4
>   Author: kib 
>   Date:   Fri Oct 11 18:37:02 2019 +
> 
>   Plug the rest of undef behavior places that were missed in r337456.
> 
> 
> ok ?

OK bluhm@
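
For context, a generic example of the undefined behavior the 1U
literals avoid (not from the diff): with a 32-bit int, shifting a
signed 1 into the sign bit is undefined, while the unsigned shift is
well defined.

	u_int bad  = 1  << 31;	/* undefined: 1 << 31 overflows signed int */
	u_int good = 1U << 31;	/* well defined: 0x80000000 */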

> diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c
> index d31abf7d11d..be23e161e29 100644
> --- a/sys/msdosfs/msdosfs_fat.c
> +++ b/sys/msdosfs/msdosfs_fat.c
> @@ -411,7 +411,7 @@ usemap_alloc(struct msdosfsmount *pmp, uint32_t cn)
>  {
>   KASSERT(cn <= pmp->pm_maxcluster);
>  
> - pmp->pm_inusemap[cn / N_INUSEBITS] |= 1 << (cn % N_INUSEBITS);
> + pmp->pm_inusemap[cn / N_INUSEBITS] |= 1U << (cn % N_INUSEBITS);
>   pmp->pm_freeclustercount--;
>  }
>  
> @@ -421,7 +421,7 @@ usemap_free(struct msdosfsmount *pmp, uint32_t cn)
>   KASSERT(cn <= pmp->pm_maxcluster);
>  
>   pmp->pm_freeclustercount++;
> - pmp->pm_inusemap[cn / N_INUSEBITS] &= ~(1 << (cn % N_INUSEBITS));
> + pmp->pm_inusemap[cn / N_INUSEBITS] &= ~(1U << (cn % N_INUSEBITS));
>  }
>  
>  int
> @@ -652,7 +652,7 @@ chainlength(struct msdosfsmount *pmp, uint32_t start, 
> uint32_t count)
>   idx = start / N_INUSEBITS;
>   start %= N_INUSEBITS;
>   map = pmp->pm_inusemap[idx];
> - map &= ~((1 << start) - 1);
> + map &= ~((1U << start) - 1);
>   if (map) {
>   len = ffs(map) - 1 - start;
>   len = MIN(len, count);
> @@ -759,7 +759,7 @@ clusteralloc(struct msdosfsmount *pmp, uint32_t start, 
> uint32_t count,
>   for (cn = newst; cn <= pmp->pm_maxcluster;) {
>   idx = cn / N_INUSEBITS;
>   map = pmp->pm_inusemap[idx];
> - map |= (1 << (cn % N_INUSEBITS)) - 1;
> + map |= (1U << (cn % N_INUSEBITS)) - 1;
>   if (map != (u_int)-1) {
>   cn = idx * N_INUSEBITS + ffs(map^(u_int)-1) - 1;
>   if ((l = chainlength(pmp, cn, count)) >= count)
> @@ -776,7 +776,7 @@ clusteralloc(struct msdosfsmount *pmp, uint32_t start, 
> uint32_t count,
>   for (cn = 0; cn < newst;) {
>   idx = cn / N_INUSEBITS;
>   map = pmp->pm_inusemap[idx];
> - map |= (1 << (cn % N_INUSEBITS)) - 1;
> + map |= (1U << (cn % N_INUSEBITS)) - 1;
>   if (map != (u_int)-1) {
>   cn = idx * N_INUSEBITS + ffs(map^(u_int)-1) - 1;
>   if ((l = chainlength(pmp, cn, count)) >= count)



Re: ix(4): allocate less memory for tx buffers

2023-06-13 Thread Alexander Bluhm
On Fri, Jun 09, 2023 at 08:10:22PM +0200, Jan Klemkow wrote:
> On Fri, Jun 09, 2023 at 06:59:57PM +0200, Jan Klemkow wrote:
> > On Fri, Jun 09, 2023 at 06:11:38PM +0200, Jan Klemkow wrote:
> > > TSO packets are limited to MAXMCLBYTES (64k).  Thus, we don't need to
> > > allocate IXGBE_TSO_SIZE (256k) per packet for the transmit buffers.
> > > 
> > > This saves 3/4 of the memory and allows me to pack over 8 ix(4) ports
> > > into one machine.  Otherwise I run out of devbuf in malloc(9).
> > 
> > fix typo in comment
> 
> Use a more precise compare in the CTASSERT condition.
> 
> ok?

I did run this through my performance tests.  No difference visible.

OK bluhm@
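
For readers unfamiliar with the idiom: CTASSERT() is a compile-time
assertion, so a future bump of MAXMCLBYTES beyond IXGBE_TSO_SIZE would
break the build instead of silently creating undersized DMA maps.  A
generic example:

	CTASSERT(sizeof(struct ip) == 20);	/* build fails if ever false */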

> Index: dev/pci/if_ix.c
> ===
> RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
> retrieving revision 1.197
> diff -u -p -r1.197 if_ix.c
> --- dev/pci/if_ix.c   1 Jun 2023 09:05:33 -   1.197
> +++ dev/pci/if_ix.c   9 Jun 2023 16:01:18 -
> @@ -37,6 +37,12 @@
>  #include 
>  #include 
>  
> +/*
> + * Our TCP/IP Stack could not handle packets greater than MAXMCLBYTES.
> + * This interface could not handle packets greater than IXGBE_TSO_SIZE.
> + */
> +CTASSERT(MAXMCLBYTES <= IXGBE_TSO_SIZE);
> +
>  /*
>   *  Driver version
>   */
> @@ -2263,7 +2269,7 @@ ixgbe_allocate_transmit_buffers(struct t
>   /* Create the descriptor buffer dma maps */
>   for (i = 0; i < sc->num_tx_desc; i++) {
>   txbuf = >tx_buffers[i];
> - error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE,
> + error = bus_dmamap_create(txr->txdma.dma_tag, MAXMCLBYTES,
>   sc->num_segs, PAGE_SIZE, 0,
>   BUS_DMA_NOWAIT, >map);
>  



tso ip6 forward

2023-06-12 Thread Alexander Bluhm
Hi,

I found a little inconsistency in IPv6 forwarding with TSO.

Sending with TSO should only be done if the large packet does not fit
in the interface MTU.  In case tcp_if_output_tso() does not process
the packet, we should send an ICMP6 error.  Rearrange the code so that
it looks more like other calls to tcp_if_output_tso().

All these cases can only be reached when LRO is turned on for IPv6
which none of our drivers currently supports.

When comparing ip6_forward() with ip6_output() I found a typo there.
Of course we have to compare ph_mss with if_mtu and not the packet
checksum flags.  ph_mss contains the size of the copped packets.

ok?

bluhm
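
A worked example of the typo's effect (numbers assumed): with
tlen 65535, ph_mss 1448 and if_mtu 1500, the old code compared the
csum_flags bit mask against the MTU, which is meaningless; the fixed
code compares ph_mss 1448 against 1500, so the already correctly
sized TSO segments do not take the "packet too big" path.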

Index: netinet6/ip6_forward.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
retrieving revision 1.110
diff -u -p -r1.110 ip6_forward.c
--- netinet6/ip6_forward.c  1 Jun 2023 09:05:33 -   1.110
+++ netinet6/ip6_forward.c  12 Jun 2023 13:34:03 -
@@ -319,37 +319,32 @@ reroute:
}
 #endif
 
-	error = tcp_if_output_tso(ifp, &m, sin6tosa(sin6), rt, IFCAP_TSOv6,
-   ifp->if_mtu);
-   if (error || m == NULL)
-   goto freecopy;
-
/* Check the size after pf_test to give pf a chance to refragment. */
-   if (m->m_pkthdr.len > ifp->if_mtu) {
-   if (mcopy)
-   icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0,
-   ifp->if_mtu);
-   m_freem(m);
-   goto out;
+   if (m->m_pkthdr.len <= ifp->if_mtu) {
+   in6_proto_cksum_out(m, ifp);
+   error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
+   if (error)
+   ip6stat_inc(ip6s_cantforward);
+   else
+   ip6stat_inc(ip6s_forward);
+   goto senderr;
}
 
-   in6_proto_cksum_out(m, ifp);
-   error = ifp->if_output(ifp, m, sin6tosa(sin6), rt);
-   if (error) {
+	error = tcp_if_output_tso(ifp, &m, sin6tosa(sin6), rt, IFCAP_TSOv6,
+   ifp->if_mtu);
+   if (error)
ip6stat_inc(ip6s_cantforward);
-   } else {
+   else
ip6stat_inc(ip6s_forward);
-   if (type)
-   ip6stat_inc(ip6s_redirectsent);
-   else {
-   if (mcopy)
-   goto freecopy;
-   }
-   }
+   if (error || m == NULL)
+   goto senderr;
+
+   if (mcopy != NULL)
+   icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+   m_freem(m);
+   goto out;
 
-#if NPF > 0 || defined(IPSEC)
 senderr:
-#endif
if (mcopy == NULL)
goto out;
 
@@ -357,6 +352,7 @@ senderr:
case 0:
if (type == ND_REDIRECT) {
icmp6_redirect_output(mcopy, rt);
+   ip6stat_inc(ip6s_redirectsent);
goto out;
}
goto freecopy;
Index: netinet6/ip6_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v
retrieving revision 1.277
diff -u -p -r1.277 ip6_output.c
--- netinet6/ip6_output.c   22 May 2023 16:08:34 -  1.277
+++ netinet6/ip6_output.c   12 Jun 2023 13:34:03 -
@@ -688,7 +688,7 @@ reroute:
dontfrag = 0;
if (dontfrag && /* case 2-b */
(ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ?
-   m->m_pkthdr.csum_flags : tlen) > ifp->if_mtu) {
+   m->m_pkthdr.ph_mss : tlen) > ifp->if_mtu) {
 #ifdef IPSEC
if (ip_mtudisc)
ipsec_adjust_mtu(m, mtu);



Re: Kill if_detached_ioctl()

2023-06-12 Thread Alexander Bluhm
On Wed, Jun 07, 2023 at 01:36:23PM +0300, Vitaliy Makkoveev wrote:
> At this point the interface is already removed from the list of all
> interfaces and from the interface index map, and all possible
> concurrent ioctl() threads have finished.  Remove this dead code.

Should we set ifp->if_ioctl to NULL?  Then we crash if we have a
race somewhere.

anyway OK bluhm@
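
A minimal sketch of that alternative (an assumption, not part of the
diff below):

	/* hypothetical: fault loudly on a late ioctl instead of ENODEV */
	ifp->if_ioctl = NULL;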

> Index: sys/net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.699
> diff -u -p -r1.699 if.c
> --- sys/net/if.c  5 Jun 2023 11:35:46 -   1.699
> +++ sys/net/if.c  7 Jun 2023 10:22:36 -
> @@ -145,7 +145,6 @@ int   if_setrdomain(struct ifnet *, int);
>  void if_slowtimo(void *);
>  
>  void if_detached_qstart(struct ifqueue *);
> -int  if_detached_ioctl(struct ifnet *, u_long, caddr_t);
>  
>  int  ifioctl_get(u_long, caddr_t);
>  int  ifconf(caddr_t);
> @@ -1128,7 +1127,6 @@ if_detach(struct ifnet *ifp)
>  
>   NET_LOCK();
>   s = splnet();
> - ifp->if_ioctl = if_detached_ioctl;
>   ifp->if_watchdog = NULL;
>  
>   /* Remove the watchdog timeout & task */
> @@ -2761,19 +2759,13 @@ if_getdata(struct ifnet *ifp, struct if_
>  }
>  
>  /*
> - * Dummy functions replaced in ifnet during detach (if protocols decide to
> + * Dummy function replaced in ifnet during detach (if protocols decide to
>   * fiddle with the if during detach.
>   */
>  void
>  if_detached_qstart(struct ifqueue *ifq)
>  {
>   ifq_purge(ifq);
> -}
> -
> -int
> -if_detached_ioctl(struct ifnet *ifp, u_long a, caddr_t b)
> -{
> - return ENODEV;
>  }
>  
>  /*



Re: if_detach(): move nd6_ifdetach() out of netlock

2023-06-12 Thread Alexander Bluhm
On Wed, Jun 07, 2023 at 01:15:46PM +0300, Vitaliy Makkoveev wrote:
> At this point, the interface is disconnected from everywhere.  No need to
> hold the netlock for the dummy 'nd_ifinfo' release.  The netlock is also
> not needed for the TAILQ_EMPTY(&ifp->if_*hooks) assertions.

OK bluhm@

> Index: sys/net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.699
> diff -u -p -r1.699 if.c
> --- sys/net/if.c  5 Jun 2023 11:35:46 -   1.699
> +++ sys/net/if.c  7 Jun 2023 10:15:12 -
> @@ -1170,6 +1170,8 @@ if_detach(struct ifnet *ifp)
>   ifafree(ifa);
>   }
>   }
> + splx(s);
> + NET_UNLOCK();
>  
>	KASSERT(TAILQ_EMPTY(&ifp->if_addrhooks));
>	KASSERT(TAILQ_EMPTY(&ifp->if_linkstatehooks));
> @@ -1178,8 +1180,6 @@ if_detach(struct ifnet *ifp)
>  #ifdef INET6
>   nd6_ifdetach(ifp);
>  #endif
> - splx(s);
> - NET_UNLOCK();
>  
>   /* Announce that the interface is gone. */
>   rtm_ifannounce(ifp, IFAN_DEPARTURE);



Re: ifconfig rename tcplro

2023-06-07 Thread Alexander Bluhm
On Wed, Jun 07, 2023 at 12:59:11PM +0300, Vitaliy Makkoveev wrote:
> On Wed, Jun 07, 2023 at 10:19:32AM +1000, David Gwynne wrote:
> > 
> > 
> > > On 7 Jun 2023, at 06:33, Vitaliy Makkoveev  wrote:
> > > 
> > >> On 6 Jun 2023, at 20:29, Alexander Bluhm  wrote:
> > >> 
> > >> On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> > >>> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> > >>>> Hi,
> > >>>> 
> > >>>> I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> > >>>> it's just because I had to type that long name too often.
> > >>>> 
> > >>>> With that we have consistent naming:
> > >>>> # ifconfig ix0 tcplro
> > >>>> # sysctl net.inet.tcp.tso=1
> > >>>> 
> > >>>> Also the corresponding flags are named LRO.
> > >>>> # ifconfig ix1 hwfeatures
> > >>>> ix1: flags=2008843 mtu 1500
> > >>>>   
> > >>>> hwfeatures=71b7
> > >>>>  hardmtu 9198
> > >>>> 
> > >>>> The feature is quite new, so I have no backward compatibility concerns.
> > >>>> 
> > >>>> ok?
> > >>>> 
> > >>> 
> > >>> Could you name it "lro" like FreeBSD uses?
> > >> 
> > >> When I started with this, LRO and TSO were unknown to me.  So with
> > >> TCP prefix it may be clearer to users where the feature belongs.
> > >> 
> > >> Naming is hard.
> > > 
> > > Yeah, naming is definitely hard. I propose to use lro because it is
> > > already used for the same purpose by FreeBSD, so the same name helps
> > > to avoid confusion.
> > > 
> > >lro If the driver supports tcp(4) large receive offloading,
> > >enable LRO on the interface.
> > > 
> > > Also, we have used "tso" keyword for tcp segmentation offloading for
> > > the same reason, until it became global net.inet.tcp.tso.
> > 
> > Is it going to be used to enable lro for udp and other protocols as well?
> 
> Why not?  We have the tso feature system wide, so why not have the receive
> offloading feature global for all supported protocols?  Especially since
> I suspect this control will be moved from ifconfig to global
> net.inet.tcp.lro like net.inet.tcp.tso.

Maybe we can make lro the default, and then move it to net.inet.tcp.lro.
But I'd like to see another driver implement it first.

> However, I'm not the fan of original "tcprecvoffload" and like shorter
> naming.

Can we use ifconfig tcplro for now?
+ it only affects TCP
+ users see that it is related to TCP
+ it is not a 3-letter abbreviation claudio does not like
+ it is shorter than tcprecvoffload

cons
- FreeBSD calls it lro

bluhm



Re: ifconfig rename tcplro

2023-06-06 Thread Alexander Bluhm
On Tue, Jun 06, 2023 at 11:33:36PM +0300, Vitaliy Makkoveev wrote:
> > On 6 Jun 2023, at 20:29, Alexander Bluhm  wrote:
> > 
> > On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> >> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> >>> Hi,
> >>> 
> >>> I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> >>> it's just because I had to type that long name too often.
> >>> 
> >>> With that we have consistent naming:
> >>> # ifconfig ix0 tcplro
> >>> # sysctl net.inet.tcp.tso=1
> >>> 
> >>> Also the corresponding flags are named LRO.
> >>> # ifconfig ix1 hwfeatures
> >>> ix1: flags=2008843 mtu 1500
> >>>
> >>> hwfeatures=71b7
> >>>  hardmtu 9198
> >>> 
> >>> The feature is quite new, so I have no backward compatibility concerns.
> >>> 
> >>> ok?
> >>> 
> >> 
> >> Could you name it "lro" like FreeBSD uses?
> > 
> > When I started with this, LRO and TSO were unknown to me.  So with
> > TCP prefix it may be clearer to users where the feature belongs.
> > 
> > Naming is hard.
> 
> Yeah, naming is definitely hard. I propose to use lro because it is
> already used for the same purpose by FreeBSD, so the same name helps
> to avoid confusion.
> 
> lro If the driver supports tcp(4) large receive offloading,
> enable LRO on the interface.
> 
> Also, we have used "tso" keyword for tcp segmentation offloading for
> the same reason, until it became global net.inet.tcp.tso.

In sysctl we have tso in the tcp namespace.  That's why I wanted tcplro.
And claudio@ asked for a longer name.  Let's see if he has an
opinion.



Re: ifconfig rename tcplro

2023-06-06 Thread Alexander Bluhm
On Tue, Jun 06, 2023 at 05:54:31PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jun 06, 2023 at 02:31:52PM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > I would suggest to rename ifconfig tcprecvoffload to tcplro.  Maybe
> > it's just because I had to type that long name too often.
> > 
> > With that we have consistent naming:
> > # ifconfig ix0 tcplro
> > # sysctl net.inet.tcp.tso=1
> > 
> > > Also the corresponding flags are named LRO.
> > # ifconfig ix1 hwfeatures
> > ix1: flags=2008843 mtu 1500
> > 
> > hwfeatures=71b7
> >  hardmtu 9198
> > 
> > > The feature is quite new, so I have no backward compatibility concerns.
> > 
> > ok?
> > 
> 
> Could you name it "lro" like FreeBSD uses?

When I started with this, LRO and TSO were unknown to me.  So with
TCP prefix it may be clearer to users where the feature belongs.

Naming is hard.

bluhm



ifconfig rename tcplro

2023-06-06 Thread Alexander Bluhm
Hi,

I would suggest renaming ifconfig tcprecvoffload to tcplro.  Maybe
it's just because I had to type that long name too often.

With that we have consistent naming:
# ifconfig ix0 tcplro
# sysctl net.inet.tcp.tso=1

Also the corresponding flags are named LRO.
# ifconfig ix1 hwfeatures
ix1: flags=2008843 mtu 1500

hwfeatures=71b7
 hardmtu 9198

The feature is quite new, so I have no backward compatibility concerns.

ok?

bluhm

Index: sbin/ifconfig/ifconfig.8
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.8,v
retrieving revision 1.396
diff -u -p -r1.396 ifconfig.8
--- sbin/ifconfig/ifconfig.81 Jun 2023 18:57:53 -   1.396
+++ sbin/ifconfig/ifconfig.86 Jun 2023 12:18:07 -
@@ -501,7 +501,7 @@ Query and display information and diagno
 modules installed in an interface.
 It is only supported by drivers implementing the necessary functionality
 on hardware which supports it.
-.It Cm tcprecvoffload
+.It Cm tcplro
 Enable TCP large receive offload (LRO) if it's supported by the hardware; see
 .Cm hwfeatures .
 LRO enabled network interfaces modify received TCP/IP packets.
@@ -517,7 +517,7 @@ It is not possible to use LRO with inter
 or
 .Xr tpmr 4 .
 Changing this option will re-initialize the network interface.
-.It Cm -tcprecvoffload
+.It Cm -tcplro
 Disable LRO.
 LRO is disabled by default.
 .It Cm up
Index: sbin/ifconfig/ifconfig.c
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/ifconfig/ifconfig.c,v
retrieving revision 1.465
diff -u -p -r1.465 ifconfig.c
--- sbin/ifconfig/ifconfig.c1 Jun 2023 18:57:54 -   1.465
+++ sbin/ifconfig/ifconfig.c6 Jun 2023 12:18:59 -
@@ -471,8 +471,8 @@ const structcmd {
{ "-soii",  IFXF_INET6_NOSOII,  0,  setifxflags },
{ "monitor",IFXF_MONITOR,   0,  setifxflags },
{ "-monitor",   -IFXF_MONITOR,  0,  setifxflags },
-   { "tcprecvoffload", IFXF_LRO,   0,  setifxflags },
-   { "-tcprecvoffload", -IFXF_LRO, 0,  setifxflags },
+   { "tcplro", IFXF_LRO,   0,  setifxflags },
+   { "-tcplro",-IFXF_LRO,  0,  setifxflags },
 #ifndef SMALL
{ "hwfeatures", NEXTARG0,   0,  printifhwfeatures },
{ "metric", NEXTARG,0,  setifmetric },



Re: lo(4) checksum offload

2023-06-02 Thread Alexander Bluhm
On Wed, May 31, 2023 at 09:36:22AM +1000, David Gwynne wrote:
> we could export these csum flags as part of the bpf header so we can
> teach tcpdump to shut up in this situation.

Linux does not do that, and I want to keep the pcap file format
compatible.  Also, too-smart tools make debugging real checksum
problems harder.  I think people have to live with the checksum
warning when tcpdump is called with -v.

> > The question is, does it break corner cases?  Please test with pf
> > route-to, IPsec, bridging, IPv6 and other setups where loopback
> > might be involved.
> 
> who knows how bridge works? otherwise i think it should be ok.
> hopefully.

No reply from any testers.  I think the only way to find out is
to commit and watch for fallout.

ok?

bluhm

> > Index: net/if.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> > retrieving revision 1.697
> > diff -u -p -r1.697 if.c
> > --- net/if.c16 May 2023 14:32:54 -  1.697
> > +++ net/if.c30 May 2023 12:33:42 -
> > @@ -778,7 +778,7 @@ if_input(struct ifnet *ifp, struct mbuf_
> >  int
> >  if_input_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af)
> >  {
> > -   int keepflags;
> > +   int keepflags, keepcksum;
> >  
> >  #if NBPFILTER > 0
> > /*
> > @@ -796,11 +796,26 @@ if_input_local(struct ifnet *ifp, struct
> > }
> >  #endif
> > keepflags = m->m_flags & (M_BCAST|M_MCAST);
> > +   /*
> > +* Preserve outgoing checksum flags, in case the packet is
> > +* forwarded to another interface.  Then the checksum, which
> > +* is now incorrect, will be calculated before sending.
> > +*/
> > +   keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
> > +   M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT);
> > m_resethdr(m);
> > m->m_flags |= M_LOOP | keepflags;
> > +   m->m_pkthdr.csum_flags = keepcksum;
> > m->m_pkthdr.ph_ifidx = ifp->if_index;
> > m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
> >  
> > +   if (ISSET(keepcksum, M_TCP_CSUM_OUT))
> > +   m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK;
> > +   if (ISSET(keepcksum, M_UDP_CSUM_OUT))
> > +   m->m_pkthdr.csum_flags |= M_UDP_CSUM_IN_OK;
> > +   if (ISSET(keepcksum, M_ICMP_CSUM_OUT))
> > +   m->m_pkthdr.csum_flags |= M_ICMP_CSUM_IN_OK;
> > +
> > ifp->if_opackets++;
> > ifp->if_obytes += m->m_pkthdr.len;
> >  
> > @@ -809,6 +824,8 @@ if_input_local(struct ifnet *ifp, struct
> >  
> > switch (af) {
> > case AF_INET:
> > +   if (ISSET(keepcksum, M_IPV4_CSUM_OUT))
> > +   m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;
> > ipv4_input(ifp, m);
> > break;
> >  #ifdef INET6
> > Index: net/if_loop.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
> > retrieving revision 1.93
> > diff -u -p -r1.93 if_loop.c
> > --- net/if_loop.c   21 Oct 2022 14:20:03 -  1.93
> > +++ net/if_loop.c   30 May 2023 11:48:55 -
> > @@ -173,6 +173,9 @@ loop_clone_create(struct if_clone *ifc, 
> > ifp->if_mtu = LOMTU;
> > ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
> > ifp->if_xflags = IFXF_CLONED;
> > +   ifp->if_capabilities = IFCAP_CSUM_IPv4 |
> > +   IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> > +   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> > ifp->if_rtrequest = lortrequest;
> > ifp->if_ioctl = loioctl;
> > ifp->if_input = loinput;
> > 



Re: lo(4) checksum offload

2023-06-01 Thread Alexander Bluhm
On Thu, Jun 01, 2023 at 04:38:53PM +, Peter Stuge wrote:
> David Gwynne wrote:
> > > Currently packets sent over loopback interface get their checksum
> > > calculated twice.  In the output path it is set and during TCP/IP
> > > input it is calculated again to be compared with the previous value.
> ..
> > > The question is, does it break corner cases?  Please test with pf
> > > route-to, IPsec, bridging, IPv6 and other setups where loopback
> > > might be involved.
> > 
> > who knows how bridge works? otherwise i think it should be ok.
> > hopefully.
> 
> Would a middle ground be safer? To only calculate checksum on output?

Either it works or not.  If not, we can fix the bugs.  Have you
tested and discovered a failure?

bluhm



Re: ix(4): LRO forwarding

2023-05-31 Thread Alexander Bluhm
On Thu, May 25, 2023 at 10:40:51PM +0200, Jan Klemkow wrote:
> On Wed, May 24, 2023 at 05:28:58PM +0200, Alexander Bluhm wrote:
> > On Tue, May 23, 2023 at 02:14:57PM +0200, Jan Klemkow wrote:
> > > This diff sets needed offloading flags and the calculated mss to LRO
> > > mbufs in ix(4).  Thus, we can forward these packets and process them via
> > > tcp_if_output_tso().  This diff also uses tcp_if_output_tso() in
> > > ip6_forward().

After a lot of testing by Hrvoje and fixing corner cases with Jan,
this is the diff we currently have.  There are no more known problems
with TCP large receive offloading.

ok?

bluhm
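
As a worked example of the mss recovery in the diff below (numbers
assumed): for an IPv4/TCP aggregate without options, hdrlen is
14 + 20 + 20 = 54.  If the hardware coalesced pkts = 10 segments into
a 14534 byte mbuf, then paylen = 14534 - 54 = 14480 and
ph_mss = 14480 / 10 = 1448, which is what TSO needs to re-chop the
packet on output.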

Index: dev/pci/if_ix.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.196
diff -u -p -r1.196 if_ix.c
--- dev/pci/if_ix.c 23 May 2023 09:16:16 -  1.196
+++ dev/pci/if_ix.c 31 May 2023 13:48:25 -
@@ -3245,6 +3245,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
sendmp = NULL;
mp->m_next = nxbuf->buf;
} else { /* Sending this frame? */
+   uint16_t pkts;
+
ixgbe_rx_checksum(staterr, sendmp);
 
if (hashtype != IXGBE_RXDADV_RSSTYPE_NONE) {
@@ -3252,19 +3254,45 @@ ixgbe_rxeof(struct rx_ring *rxr)
SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
}
 
-   if (sendmp->m_pkthdr.ph_mss == 1)
-   sendmp->m_pkthdr.ph_mss = 0;
+   pkts = sendmp->m_pkthdr.ph_mss;
+   sendmp->m_pkthdr.ph_mss = 0;
 
-   if (sendmp->m_pkthdr.ph_mss > 0) {
+   if (pkts > 1) {
struct ether_extracted ext;
-   uint16_t pkts = sendmp->m_pkthdr.ph_mss;
+   uint32_t hdrlen, paylen;
 
+   /* Calculate header size. */
				ether_extract_headers(sendmp, &ext);
-   if (ext.tcp)
+   hdrlen = sizeof(*ext.eh);
+   if (ext.ip4)
+   hdrlen += ext.ip4->ip_hl << 2;
+   if (ext.ip6)
+   hdrlen += sizeof(*ext.ip6);
+   if (ext.tcp) {
+   hdrlen += ext.tcp->th_off << 2;
tcpstat_inc(tcps_inhwlro);
-   else
+   tcpstat_add(tcps_inpktlro, pkts);
+   } else {
tcpstat_inc(tcps_inbadlro);
-   tcpstat_add(tcps_inpktlro, pkts);
+   }
+
+   /*
+* If we gonna forward this packet, we have to
+* mark it as TSO, set a correct mss,
+* and recalculate the TCP checksum.
+*/
+   paylen = sendmp->m_pkthdr.len - hdrlen;
+   if (ext.tcp && paylen >= pkts) {
+   SET(sendmp->m_pkthdr.csum_flags,
+   M_TCP_TSO);
+   sendmp->m_pkthdr.ph_mss = paylen / pkts;
+   }
+   if (ext.tcp &&
+   ISSET(sendmp->m_pkthdr.csum_flags,
+   M_TCP_CSUM_IN_OK)) {
+   SET(sendmp->m_pkthdr.csum_flags,
+   M_TCP_CSUM_OUT);
+   }
}
 
			ml_enqueue(&ml, sendmp);
Index: netinet6/ip6_forward.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v
retrieving revision 1.109
diff -u -p -r1.109 ip6_forward.c
--- netinet6/ip6_forward.c  5 Apr 2023 13:56:31 -   1.109
+++ netinet6/ip6_forward.c  31 May 2023 13:48:25 -
@@ -63,8 +63,10 @@
 #include 
 #include 
 #include 
-#include 
 #endif
+#include 
+#include 
+#include 
 
 /*
  * Forward a packet.  If some error occurs return the sender
@@ -316,7 +318,11 @@ reroute:
goto reroute;
}
 #endif
-   in6_proto_cksum_out(m, ifp);
+
+	error = tcp_if_output_tso(ifp, &m, sin6tosa(sin6), rt, IFCAP_TSOv6,
+   ifp->if_mtu);
+   if (error || m == NULL)
+   goto freecopy;
 
/* Check 

lo(4) checksum offload

2023-05-30 Thread Alexander Bluhm
Hi,

Currently packets sent over the loopback interface get their checksum
calculated twice.  In the output path it is set, and during TCP/IP
input it is calculated again to be compared with the previous value.

This can be avoided by claiming that lo(4) supports hardware checksum
offloading.  For each packet convert the flag that the checksum
should be calculated to the flag that it has been checked successfully.

In a simple test on a vmm guest I see between a 30% and 60% increase
in throughput over lo0.

A drawback is that "tcpdump -ni lo0 -v" reports invalid checksums.
But people are used to that with physical interfaces and hardware
offloading.

The question is, does it break corner cases?  Please test with pf
route-to, IPsec, bridging, IPv6 and other setups where loopback
might be involved.

bluhm

Index: net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.697
diff -u -p -r1.697 if.c
--- net/if.c16 May 2023 14:32:54 -  1.697
+++ net/if.c30 May 2023 12:33:42 -
@@ -778,7 +778,7 @@ if_input(struct ifnet *ifp, struct mbuf_
 int
 if_input_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af)
 {
-   int keepflags;
+   int keepflags, keepcksum;
 
 #if NBPFILTER > 0
/*
@@ -796,11 +796,26 @@ if_input_local(struct ifnet *ifp, struct
}
 #endif
keepflags = m->m_flags & (M_BCAST|M_MCAST);
+   /*
+* Preserve outgoing checksum flags, in case the packet is
+* forwarded to another interface.  Then the checksum, which
+* is now incorrect, will be calculated before sending.
+*/
+   keepcksum = m->m_pkthdr.csum_flags & (M_IPV4_CSUM_OUT |
+   M_TCP_CSUM_OUT | M_UDP_CSUM_OUT | M_ICMP_CSUM_OUT);
m_resethdr(m);
m->m_flags |= M_LOOP | keepflags;
+   m->m_pkthdr.csum_flags = keepcksum;
m->m_pkthdr.ph_ifidx = ifp->if_index;
m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
 
+   if (ISSET(keepcksum, M_TCP_CSUM_OUT))
+   m->m_pkthdr.csum_flags |= M_TCP_CSUM_IN_OK;
+   if (ISSET(keepcksum, M_UDP_CSUM_OUT))
+   m->m_pkthdr.csum_flags |= M_UDP_CSUM_IN_OK;
+   if (ISSET(keepcksum, M_ICMP_CSUM_OUT))
+   m->m_pkthdr.csum_flags |= M_ICMP_CSUM_IN_OK;
+
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
 
@@ -809,6 +824,8 @@ if_input_local(struct ifnet *ifp, struct
 
switch (af) {
case AF_INET:
+   if (ISSET(keepcksum, M_IPV4_CSUM_OUT))
+   m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK;
ipv4_input(ifp, m);
break;
 #ifdef INET6
Index: net/if_loop.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v
retrieving revision 1.93
diff -u -p -r1.93 if_loop.c
--- net/if_loop.c   21 Oct 2022 14:20:03 -  1.93
+++ net/if_loop.c   30 May 2023 11:48:55 -
@@ -173,6 +173,9 @@ loop_clone_create(struct if_clone *ifc, 
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
ifp->if_xflags = IFXF_CLONED;
+   ifp->if_capabilities = IFCAP_CSUM_IPv4 |
+   IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
+   IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
ifp->if_rtrequest = lortrequest;
ifp->if_ioctl = loioctl;
ifp->if_input = loinput;



Re: syn cache tcp checksum

2023-05-30 Thread Alexander Bluhm
anyone?

On Mon, May 22, 2023 at 10:17:31PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> I noticed a missing count in netstat's "tcp packets sent
> software-checksummed" counter.  Turns out that our syn cache does the checksum
> calculation by hand, instead of the established mechanism in ip
> output.
> 
> Just set the flag M_TCP_CSUM_OUT and let in_proto_cksum_out() do
> the work later.
> 
> While there remove redundant code.  The unhandled af case is handled
> in the first switch statement of the function.
> 
> ok?
> 
> bluhm
> 
> Index: netinet/tcp_input.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
> retrieving revision 1.387
> diff -u -p -r1.387 tcp_input.c
> --- netinet/tcp_input.c   14 Mar 2023 00:24:05 -  1.387
> +++ netinet/tcp_input.c   22 May 2023 19:48:25 -
> @@ -3990,14 +3990,11 @@ syn_cache_respond(struct syn_cache *sc, 
>   ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
>   ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
>   ip6->ip6_nxt = IPPROTO_TCP;
> - /* ip6_plen will be updated in ip6_output() */
>   th = (struct tcphdr *)(ip6 + 1);
>   th->th_dport = sc->sc_src.sin6.sin6_port;
>   th->th_sport = sc->sc_dst.sin6.sin6_port;
>   break;
>  #endif
> - default:
> - unhandled_af(sc->sc_src.sa.sa_family);
>   }
>  
>   th->th_seq = htonl(sc->sc_iss);
> @@ -4096,21 +4093,7 @@ syn_cache_respond(struct syn_cache *sc, 
>   }
>  #endif /* TCP_SIGNATURE */
>  
> - /* Compute the packet's checksum. */
> - switch (sc->sc_src.sa.sa_family) {
> - case AF_INET:
> - ip->ip_len = htons(tlen - hlen);
> - th->th_sum = 0;
> - th->th_sum = in_cksum(m, tlen);
> - break;
> -#ifdef INET6
> - case AF_INET6:
> - ip6->ip6_plen = htons(tlen - hlen);
> - th->th_sum = 0;
> - th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
> - break;
> -#endif
> - }
> + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
>  
>   /* use IPsec policy and ttl from listening socket, on SYN ACK */
>   inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
> @@ -4125,34 +4108,22 @@ syn_cache_respond(struct syn_cache *sc, 
>   ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
>   if (inp != NULL)
>   ip->ip_tos = inp->inp_ip.ip_tos;
> - break;
> -#ifdef INET6
> - case AF_INET6:
> - ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
> - ip6->ip6_vfc |= IPV6_VERSION;
> - ip6->ip6_plen = htons(tlen - hlen);
> - /* ip6_hlim will be initialized afterwards */
> - /* leave flowlabel = 0, it is legal and require no state mgmt */
> - break;
> -#endif
> - }
>  
> - switch (sc->sc_src.sa.sa_family) {
> - case AF_INET:
>	error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
>   (ip_mtudisc ? IP_MTUDISC : 0),  NULL, inp, 0);
>   break;
>  #ifdef INET6
>   case AF_INET6:
> + ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
> + ip6->ip6_vfc |= IPV6_VERSION;
> + /* ip6_plen will be updated in ip6_output() */
>   ip6->ip6_hlim = in6_selecthlim(inp);
> + /* leave flowlabel = 0, it is legal and require no state mgmt */
>  
>	error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
>   NULL, NULL);
>   break;
>  #endif
> - default:
> - error = EAFNOSUPPORT;
> - break;
>   }
>   return (error);
>  }



cksum remove redundant code

2023-05-26 Thread Alexander Bluhm
Hi,

in_ifcap_cksum() already checks ifp == NULL,
in_hdr_cksum_out() already sets ip_sum = 0, and
in_proto_cksum_out() and in6_proto_cksum_out() always write
th_sum if M_TCP_CSUM_OUT is set and the proto is IPPROTO_TCP,
so the lines removed below are redundant.

ok?

bluhm

Index: netinet/ip_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.388
diff -u -p -r1.388 ip_output.c
--- netinet/ip_output.c 22 May 2023 16:08:34 -  1.388
+++ netinet/ip_output.c 26 May 2023 11:55:49 -
@@ -1801,7 +1801,7 @@ in_hdr_cksum_out(struct mbuf *m, struct 
struct ip *ip = mtod(m, struct ip *);
 
ip->ip_sum = 0;
-   if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
+   if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT);
} else {
ipstat_inc(ips_outswcsum);
Index: netinet/tcp_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.138
diff -u -p -r1.138 tcp_output.c
--- netinet/tcp_output.c15 May 2023 16:34:56 -  1.138
+++ netinet/tcp_output.c26 May 2023 15:19:12 -
@@ -1295,7 +1295,6 @@ tcp_chopper(struct mbuf *m0, struct mbuf
 
/* copy and adjust IP header, calculate checksum */
SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
-   mhth->th_sum = 0;
if (ip) {
struct ip *mhip;
 
@@ -1328,10 +1327,8 @@ tcp_chopper(struct mbuf *m0, struct mbuf
}
/* adjust IP header, calculate checksum */
SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
-   th->th_sum = 0;
if (ip) {
ip->ip_len = htons(m0->m_pkthdr.len);
-   ip->ip_sum = 0;
in_hdr_cksum_out(m0, ifp);
in_proto_cksum_out(m0, ifp);
}



Re: Relax netlock to shared netlock and push it down to mrt_sysctl_mfc()

2023-05-26 Thread Alexander Bluhm
On Wed, May 17, 2023 at 01:02:58PM +0300, Vitaliy Makkoveev wrote:
> mrt_rtwalk_mfcsysctl() performs read-only access to protected data, so
> rtable_walk() could be called with shared netlock.

While I think NET_LOCK_SHARED() is not sufficient, you can move
the NET_LOCK() into mrt_sysctl_mfc().  Only mrt_rtwalk_mfcsysctl()
needs protection.

That diff would be OK bluhm@

> Index: sys/netinet/ip_input.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.384
> diff -u -p -r1.384 ip_input.c
> --- sys/netinet/ip_input.c16 May 2023 19:36:00 -  1.384
> +++ sys/netinet/ip_input.c17 May 2023 09:59:16 -
> @@ -1712,10 +1712,7 @@ ip_sysctl(int *name, u_int namelen, void
>   case IPCTL_MRTMFC:
>   if (newp)
>   return (EPERM);
> - NET_LOCK();
> - error = mrt_sysctl_mfc(oldp, oldlenp);
> - NET_UNLOCK();
> - return (error);
> + return (mrt_sysctl_mfc(oldp, oldlenp));
>   case IPCTL_MRTVIF:
>   if (newp)
>   return (EPERM);
> Index: sys/netinet/ip_mroute.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_mroute.c,v
> retrieving revision 1.138
> diff -u -p -r1.138 ip_mroute.c
> --- sys/netinet/ip_mroute.c   19 Apr 2023 20:03:51 -  1.138
> +++ sys/netinet/ip_mroute.c   17 May 2023 09:59:16 -
> @@ -479,10 +479,12 @@ mrt_sysctl_mfc(void *oldp, size_t *oldle
>   msa.msa_len = *oldlenp;
>   msa.msa_needed = 0;
>  
> + NET_LOCK_SHARED();
>   for (rtableid = 0; rtableid <= RT_TABLEID_MAX; rtableid++) {
>   rtable_walk(rtableid, AF_INET, NULL, mrt_rtwalk_mfcsysctl,
>	    &msa);
>   }
> + NET_UNLOCK_SHARED();
>  
>   if (msa.msa_minfos != NULL && msa.msa_needed > 0 &&
>   (error = copyout(msa.msa_minfos, oldp, msa.msa_needed)) != 0) {



Re: Relax netlock to shared netlock and push it down to mrt_sysctl_mfc()

2023-05-26 Thread Alexander Bluhm
On Fri, May 26, 2023 at 06:25:46PM +0300, Vitaliy Makkoveev wrote:
> On Fri, May 26, 2023 at 05:08:06PM +0200, Alexander Bluhm wrote:
> > On Fri, May 26, 2023 at 05:29:58PM +0300, Vitaliy Makkoveev wrote:
> > > On Wed, May 17, 2023 at 01:02:58PM +0300, Vitaliy Makkoveev wrote:
> > > > mrt_rtwalk_mfcsysctl() performs read-only access to protected data, so
> > > > rtable_walk() could be called with shared netlock.
> > > 
> > > Regardless on sysctl(2) unlocking backout, the netlock around
> > > mrt_sysctl_mfc() could be relaxed to shared netlock.
> > 
> > IP multicast is far from MP ready.  As a usual workaround I use
> > exclusive netlock or shared netlock plus kernel lock.
> > 
> > Whenever I have to call something with multicast from a section
> > that has only shared netlock, I grab the kernel lock.
> > 
> > So using only NET_LOCK_SHARED() for reading multicast data does not
> > seem enough.
> 
> mrt_rtwalk_mfcsysctl() does read-only access. Since the sysctl(2)
> unlocking was reverted, this will be "shared netlock plus kernel lock"
> case.
> 
> > Look at the way ip_input_if() calls ip_mforward().  Maybe we should
> > start making ip_mroute.c MP safe.  Unfortunately I have no test
> > environment for that.
> 
> mpi@ said the kernel lock removal from uvm_swap_free() will be easy. So
> I want to try to remove it and push sysctl(2) unlocking back.

But you cannot do both.  Move to shared lock in mrt_rtwalk_mfcsysctl,
and remove kernel lock from sysctl.

Write access is done with shared netlock plus kernel lock.
ip_input_if() -> ip_mforward() -> mfc_add() -> update_mfc_params() ->
mrt_mcast_del() -> rt->rt_llinfo = NULL;
So shared netlock alone is not sufficient for read access.
The proper way is to add some locking to mroute so it protects itself
when running with shared netlock.

bluhm

> > > > Index: sys/netinet/ip_input.c
> > > > ===
> > > > RCS file: /cvs/src/sys/netinet/ip_input.c,v
> > > > retrieving revision 1.384
> > > > diff -u -p -r1.384 ip_input.c
> > > > --- sys/netinet/ip_input.c  16 May 2023 19:36:00 -  1.384
> > > > +++ sys/netinet/ip_input.c  17 May 2023 09:59:16 -
> > > > @@ -1712,10 +1712,7 @@ ip_sysctl(int *name, u_int namelen, void
> > > > case IPCTL_MRTMFC:
> > > > if (newp)
> > > > return (EPERM);
> > > > -   NET_LOCK();
> > > > -   error = mrt_sysctl_mfc(oldp, oldlenp);
> > > > -   NET_UNLOCK();
> > > > -   return (error);
> > > > +   return (mrt_sysctl_mfc(oldp, oldlenp));
> > > > case IPCTL_MRTVIF:
> > > > if (newp)
> > > > return (EPERM);
> > > > Index: sys/netinet/ip_mroute.c
> > > > ===
> > > > RCS file: /cvs/src/sys/netinet/ip_mroute.c,v
> > > > retrieving revision 1.138
> > > > diff -u -p -r1.138 ip_mroute.c
> > > > --- sys/netinet/ip_mroute.c 19 Apr 2023 20:03:51 -  1.138
> > > > +++ sys/netinet/ip_mroute.c 17 May 2023 09:59:16 -
> > > > @@ -479,10 +479,12 @@ mrt_sysctl_mfc(void *oldp, size_t *oldle
> > > > msa.msa_len = *oldlenp;
> > > > msa.msa_needed = 0;
> > > >  
> > > > +   NET_LOCK_SHARED();
> > > > for (rtableid = 0; rtableid <= RT_TABLEID_MAX; rtableid++) {
> > > > rtable_walk(rtableid, AF_INET, NULL, 
> > > > mrt_rtwalk_mfcsysctl,
> > > > 	    &msa);
> > > > }
> > > > +   NET_UNLOCK_SHARED();
> > > >  
> > > > if (msa.msa_minfos != NULL && msa.msa_needed > 0 &&
> > > > (error = copyout(msa.msa_minfos, oldp, msa.msa_needed)) != 
> > > > 0) {
> > > > 
> > 



Re: Relax netlock to shared netlock and push it down to mrt_sysctl_mfc()

2023-05-26 Thread Alexander Bluhm
On Fri, May 26, 2023 at 05:29:58PM +0300, Vitaliy Makkoveev wrote:
> On Wed, May 17, 2023 at 01:02:58PM +0300, Vitaliy Makkoveev wrote:
> > mrt_rtwalk_mfcsysctl() performs read-only access to protected data, so
> > rtable_walk() could be called with shared netlock.
> > 
> 
> Regardless on sysctl(2) unlocking backout, the netlock around
> mrt_sysctl_mfc() could be relaxed to shared netlock.

IP multicast is far from MP ready.  As a usual workaround I use
exclusive netlock or shared netlock plus kernel lock.

Whenever I have to call something with multicast from a section
that has only shared netlock, I grab the kernel lock.

So using only NET_LOCK_SHARED() for reading multicast data does not
seem enough.

Look at the way ip_input_if() calls ip_mforward().  Maybe we should
start making ip_mroute.c MP safe.  Unfortunately I have no test
environment for that.

bluhm
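
The workaround pattern described above, as a sketch (assumed call
site, using the ip_mforward() prototype from ip_mroute.c):

	NET_ASSERT_LOCKED();		/* shared netlock is held */
	KERNEL_LOCK();
	/* mroute cannot protect itself yet, serialize via kernel lock */
	error = ip_mforward(m, ifp);
	KERNEL_UNLOCK();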

> > Index: sys/netinet/ip_input.c
> > ===
> > RCS file: /cvs/src/sys/netinet/ip_input.c,v
> > retrieving revision 1.384
> > diff -u -p -r1.384 ip_input.c
> > --- sys/netinet/ip_input.c  16 May 2023 19:36:00 -  1.384
> > +++ sys/netinet/ip_input.c  17 May 2023 09:59:16 -
> > @@ -1712,10 +1712,7 @@ ip_sysctl(int *name, u_int namelen, void
> > case IPCTL_MRTMFC:
> > if (newp)
> > return (EPERM);
> > -   NET_LOCK();
> > -   error = mrt_sysctl_mfc(oldp, oldlenp);
> > -   NET_UNLOCK();
> > -   return (error);
> > +   return (mrt_sysctl_mfc(oldp, oldlenp));
> > case IPCTL_MRTVIF:
> > if (newp)
> > return (EPERM);
> > Index: sys/netinet/ip_mroute.c
> > ===
> > RCS file: /cvs/src/sys/netinet/ip_mroute.c,v
> > retrieving revision 1.138
> > diff -u -p -r1.138 ip_mroute.c
> > --- sys/netinet/ip_mroute.c 19 Apr 2023 20:03:51 -  1.138
> > +++ sys/netinet/ip_mroute.c 17 May 2023 09:59:16 -
> > @@ -479,10 +479,12 @@ mrt_sysctl_mfc(void *oldp, size_t *oldle
> > msa.msa_len = *oldlenp;
> > msa.msa_needed = 0;
> >  
> > +   NET_LOCK_SHARED();
> > for (rtableid = 0; rtableid <= RT_TABLEID_MAX; rtableid++) {
> > rtable_walk(rtableid, AF_INET, NULL, mrt_rtwalk_mfcsysctl,
> > );
> > }
> > +   NET_UNLOCK_SHARED();
> >  
> > if (msa.msa_minfos != NULL && msa.msa_needed > 0 &&
> > (error = copyout(msa.msa_minfos, oldp, msa.msa_needed)) != 0) {
> > 



OpenBSD Errata: May 26, 2023 (rpki ssl)

2023-05-26 Thread Alexander Bluhm
Errata patches for rpki-client and LibreSSL libssl have been released
for OpenBSD 7.2 and 7.3.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata72.html
  https://www.openbsd.org/errata73.html



Re: ix(4): LRO forwarding

2023-05-24 Thread Alexander Bluhm
On Tue, May 23, 2023 at 02:14:57PM +0200, Jan Klemkow wrote:
> Hi,
> 
> This diff sets needed offloading flags and the calculated mss to LRO
> mbufs in ix(4).  Thus, we can forward these packets and process them via
> tcp_if_output_tso().  This diff also uses tcp_if_output_tso() in
> ip6_forward().
> 
> I tested the ip6_forward path via the address family transition in pf:
> 
> pass in inet from 192.168.1.1 to 192.168.13.2 af-to \
> inet6 from fc00:13::1 to fc00:13::2
> 
> ok?

It crashes during my tests with LRO turned on.  Looks like division
by zero.

START   ssh_perform@lt13_iperf3_-c10.3.46.36_-P10_-t10  2023-05-24T13:33:54Z
Timeout, server ot14 not responding.

login: [-- MARK -- Wed May 24 15:30:00 2023]
kernel: integer divide fault trap, code=0
Stopped at      ixgbe_encap+0x177:      divl    %ecx,%eax

ddb{2}> trace
ixgbe_encap(801241b0,fd80b80db400) at ixgbe_encap+0x177
ixgbe_start(80121800) at ixgbe_start+0xcf
ifq_serialize(80121800,801218e0) at ifq_serialize+0xfd
taskq_thread(8002f000) at taskq_thread+0x100
end trace frame: 0x0, count: -4

ddb{2}> ps
   PID     TID   PPID   UID  S       FLAGS  WAIT      COMMAND
 99854  209089   9079     0  3    0x100082  kqread    ssh
 15809  442526      1     0  3    0x100080  kqread    ssh
 97949   86242      1     0  3    0x100080  kqread    ssh
  9079  146443  53981     0  3        0x82  piperd    perl
 53981  227769   5893     0  3    0x10008a  sigsusp   ksh
  5893  451861   1468     0  3        0x9a  kqread    sshd
 94168  490141      1     0  3    0x100083  ttyin     getty
 75652  427583      1     0  3    0x100098  kqread    cron
 39299  237287      1    99  3   0x1100090  kqread    sndiod
 47108  360525      1   110  3    0x100090  kqread    sndiod
 38566  177187  19960    95  3   0x1100092  kqread    smtpd
 23522  195537  19960   103  3   0x1100092  kqread    smtpd
 80420  195512  19960    95  3   0x1100092  kqread    smtpd
  2367   42113  19960    95  3    0x100092  kqread    smtpd
  7393  283209  19960    95  3   0x1100092  kqread    smtpd
  7653  297707  19960    95  3   0x1100092  kqread    smtpd
 19960  437487      1     0  3    0x100080  kqread    smtpd
 71374  338452  30627    89  3   0x1100092  kqread    relayd
 16907  408885  30627    89  3   0x1100092  kqread    relayd
 35081  284379  30627    89  3   0x1100092  kqread    relayd
 25277  284381  30627    89  3   0x1100092  kqread    relayd
 30678  463036  30627    89  3   0x1100092  kqread    relayd
 83004  265034  30627    89  3   0x1100092  kqread    relayd
 95286  431522  30627    89  3   0x1100092  kqread    relayd
 59217  310908  30627    89  3   0x1100092  kqread    relayd
 30627  478072      1     0  3        0x80  kqread    relayd
 11795  299233  20858    91  3        0x92  kqread    snmpd_metrics
 20858   85096      1     0  3    0x100080  kqread    snmpd
 37930  377730      1    91  3   0x1100092  kqread    snmpd
  1468   62101      1     0  3        0x88  kqread    sshd
 24611   87454      0     0  3     0x14280  nfsidl    nfsio
 65040  198675      0     0  3     0x14280  nfsidl    nfsio
 24289  234081      0     0  3     0x14280  nfsidl    nfsio
  8301  276844      0     0  3     0x14280  nfsidl    nfsio
 93209   90851      1     0  3    0x100080  kqread    ntpd
 61881   51205  99207    83  3    0x100092  kqread    ntpd
 99207  296079      1    83  3   0x1100092  kqread    ntpd
 19281  369312  38531    74  3   0x1100092  bpf       pflogd
 38531  252705      1     0  3        0x80  netio     pflogd
 42140  503330   9467    73  3   0x1100090  kqread    syslogd
  9467  506976      1     0  3    0x100082  netio     syslogd
 86346  356807      1     0  3    0x100080  kqread    resolvd
  8248  456268  86437    77  3    0x100092  kqread    dhcpleased
 81843  247582  86437    77  3    0x100092  kqread    dhcpleased
 86437  362320      1     0  3        0x80  kqread    dhcpleased
 52151  379618   4048   115  3    0x100092  kqread    slaacd
 61436   14534   4048   115  3    0x100092  kqread    slaacd
  4048  130235      1     0  3    0x100080  kqread    slaacd
 20154   61454      0     0  3     0x14200  bored     smr
 91942  510513      0     0  3     0x14200  pgzero    zerothread
 42079  231044      0     0  3     0x14200  aiodoned  aiodoned
 91303   99546      0     0  3     0x14200  syncer    update
  2249   13527      0     0  3     0x14200  cleaner   cleaner
 84329   74364      0     0  3     0x14200  reaper    reaper
 16246  208290      0     0  3     0x14200  pgdaemon  pagedaemon
 87196  379876      0     0  3     0x14200  usbtsk    usbtask
 79540  107697  0
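
The trace points at the divl instruction in ixgbe_encap(), which is
consistent with an mss of 0 reaching the TSO descriptor setup.  A
defensive guard could look like this; purely an illustrative sketch,
not the committed fix:

	/* hypothetical check before the TSO context setup */
	if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
	    m->m_pkthdr.ph_mss == 0)
		goto drop;	/* never divide by a zero mss */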

syn cache tcp checksum

2023-05-22 Thread Alexander Bluhm
Hi,

I noticed that the netstat counter for tcp packets sent
software-checksummed was never bumped.  Turns out that our syn cache
does the checksum calculation by hand, instead of using the
established mechanism in ip output.

Just set the flag M_TCP_CSUM_OUT and let in_proto_cksum_out() do
the work later.
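
For context, a simplified sketch of what in_proto_cksum_out() then
does for such packets (details differ slightly in the tree):

	if (ISSET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT) &&
	    !in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4)) {
		tcpstat_inc(tcps_outswcsum);	/* the counter that was missing */
		in_delayed_cksum(m);
		CLR(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
	}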

While there, remove redundant code.  The unhandled-af case is already
handled in the first switch statement of the function.

ok?

bluhm

Index: netinet/tcp_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.387
diff -u -p -r1.387 tcp_input.c
--- netinet/tcp_input.c 14 Mar 2023 00:24:05 -  1.387
+++ netinet/tcp_input.c 22 May 2023 19:48:25 -
@@ -3990,14 +3990,11 @@ syn_cache_respond(struct syn_cache *sc, 
ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
ip6->ip6_nxt = IPPROTO_TCP;
-   /* ip6_plen will be updated in ip6_output() */
th = (struct tcphdr *)(ip6 + 1);
th->th_dport = sc->sc_src.sin6.sin6_port;
th->th_sport = sc->sc_dst.sin6.sin6_port;
break;
 #endif
-   default:
-   unhandled_af(sc->sc_src.sa.sa_family);
}
 
th->th_seq = htonl(sc->sc_iss);
@@ -4096,21 +4093,7 @@ syn_cache_respond(struct syn_cache *sc, 
}
 #endif /* TCP_SIGNATURE */
 
-   /* Compute the packet's checksum. */
-   switch (sc->sc_src.sa.sa_family) {
-   case AF_INET:
-   ip->ip_len = htons(tlen - hlen);
-   th->th_sum = 0;
-   th->th_sum = in_cksum(m, tlen);
-   break;
-#ifdef INET6
-   case AF_INET6:
-   ip6->ip6_plen = htons(tlen - hlen);
-   th->th_sum = 0;
-   th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
-   break;
-#endif
-   }
+   SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
 
/* use IPsec policy and ttl from listening socket, on SYN ACK */
inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
@@ -4125,34 +4108,22 @@ syn_cache_respond(struct syn_cache *sc, 
ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
if (inp != NULL)
ip->ip_tos = inp->inp_ip.ip_tos;
-   break;
-#ifdef INET6
-   case AF_INET6:
-   ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
-   ip6->ip6_vfc |= IPV6_VERSION;
-   ip6->ip6_plen = htons(tlen - hlen);
-   /* ip6_hlim will be initialized afterwards */
-   /* leave flowlabel = 0, it is legal and require no state mgmt */
-   break;
-#endif
-   }
 
-   switch (sc->sc_src.sa.sa_family) {
-   case AF_INET:
error = ip_output(m, sc->sc_ipopts, >sc_route4,
(ip_mtudisc ? IP_MTUDISC : 0),  NULL, inp, 0);
break;
 #ifdef INET6
case AF_INET6:
+   ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+   ip6->ip6_vfc |= IPV6_VERSION;
+   /* ip6_plen will be updated in ip6_output() */
ip6->ip6_hlim = in6_selecthlim(inp);
+   /* leave flowlabel = 0, it is legal and require no state mgmt */
 
error = ip6_output(m, NULL /*XXX*/, >sc_route6, 0,
NULL, NULL);
break;
 #endif
-   default:
-   error = EAFNOSUPPORT;
-   break;
}
return (error);
 }



Re: Fix wrong interface mtu in tcp_mss

2023-05-21 Thread Alexander Bluhm
On Sat, May 20, 2023 at 02:46:27PM +0200, Claudio Jeker wrote:
> On Fri, May 19, 2023 at 07:58:47PM +0200, Jan Klemkow wrote:
> > Hi,
> > 
> > We use the wrong interface and mtu in tcp_mss() to calculate the mss if
> > the destination address is a local address.  In ip_output() we
> > use the correct interface and its mtu.
> > 
> > This limits the mss to 1448 if the mtu of the interface is 1500,
> > instead of using a local 32k mss.
> > 
> > The bigger issue is: local bulk traffic with the current TSO
> > implementation is broken.  tcp_output() creates TSO packets with an mss
> > smaller than 32k and ip_output() calls if_output instead of
> > tcp_if_output_tso() because it fits into the mtu check of lo0.
> > 
> > This diff takes the same logic to pick the interface in tcp_mss() as is
> > done in ip_output() and fixes both issues.
> > 
> > ok?
> 
> I'm fine with this going in since it emulates the same behaviour as
> ip_output. For the curious ip6_output() seems to have the same workaround.

OK bluhm@

> Now in an ideal world the other issue exposed by this mtu mismatch should
> also be fixed. We had similar issues with TCP over loopback on multiple
> occasions. So maybe it is time to fix this for good.

Please wait before committing your diff.  My "tcp tso loopback
checksum" on tech@ should be commited and tested first, so we see
whether that fixes the underlying problem.

> Note: In an ideal world lo(4) would do TSO and LRO since it bounces the
> packet right back at us.

I think we should try to implement lo(4) checksum offloading.  In
an ideal world, packets would be marked as carrying a correct checksum
over loopback instead of calculating it twice.
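
A sketch of the receive side of such an offload; the flag names exist
today, the lo(4) hookup is hypothetical:

	/* the packet never left the host, so claim valid checksums
	   on input instead of verifying them a second time */
	m->m_pkthdr.csum_flags |= M_IPV4_CSUM_IN_OK |
	    M_TCP_CSUM_IN_OK | M_UDP_CSUM_IN_OK;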

Then sending large packets with TSO and receiving them with LRO would
also be good.  It helps testing the feature.  But the performance
improvement might be small as the MTU is already 32768 on loopback.

bluhm

> > Index: netinet/tcp_input.c
> > ===
> > RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> > retrieving revision 1.387
> > diff -u -p -r1.387 tcp_input.c
> > --- netinet/tcp_input.c 14 Mar 2023 00:24:05 -  1.387
> > +++ netinet/tcp_input.c 19 May 2023 17:22:47 -
> > @@ -2805,7 +2805,11 @@ tcp_mss(struct tcpcb *tp, int offer)
> > if (rt == NULL)
> > goto out;
> >  
> > -   ifp = if_get(rt->rt_ifidx);
> > +   if (ISSET(rt->rt_flags, RTF_LOCAL))
> > +   ifp = if_get(rtable_loindex(inp->inp_rtableid));
> > +   else
> > +   ifp = if_get(rt->rt_ifidx);
> > +
> > if (ifp == NULL)
> > goto out;
> >  
> > 
> 
> -- 
> :wq Claudio



tcp tso loopback checksum

2023-05-21 Thread Alexander Bluhm
Hi,

When sending TCP packets with software TSO to the local address of
a physical interface, the TCP checksum was miscalculated.

This bug was triggered on the loopback interface, when sending to the
local interface address of a physical interface.  Due to another
bug, the smaller MTU of the physical interface was used.  Then the
packet was sent without TSO chopping as it did fit the larger MTU
of the loopback interface.  Although the loopback interface does
not support hardware TSO, the modified TCP pseudo header checksum
for hardware TSO checksum offloading was calculated.
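
The difference matters because the pseudo header checksum for hardware
TSO deliberately omits the length, which the NIC fills in per segment.
Roughly, based on the functions touched by the diff below:

	/* hardware TSO: no length, the NIC completes it per segment */
	csum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htonl(ip->ip_p));

	/* software path: the TCP length is part of the pseudo header */
	csum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htonl(ntohs(ip->ip_len) - offset + ip->ip_p));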

Please test with and without hardware TSO.

ok?

bluhm

Index: netinet/ip_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.387
diff -u -p -r1.387 ip_output.c
--- netinet/ip_output.c 15 May 2023 16:34:56 -  1.387
+++ netinet/ip_output.c 21 May 2023 20:20:03 -
@@ -1882,7 +1882,8 @@ in_proto_cksum_out(struct mbuf *m, struc
u_int16_t csum = 0, offset;
 
offset = ip->ip_hl << 2;
-   if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
+   if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && ifp != NULL &&
+   ISSET(ifp->if_capabilities, IFCAP_TSOv4)) {
csum = in_cksum_phdr(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl(ip->ip_p));
} else if (ISSET(m->m_pkthdr.csum_flags,
Index: netinet6/ip6_output.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v
retrieving revision 1.276
diff -u -p -r1.276 ip6_output.c
--- netinet6/ip6_output.c   15 May 2023 16:34:57 -  1.276
+++ netinet6/ip6_output.c   21 May 2023 20:20:18 -
@@ -2710,7 +2710,8 @@ in6_proto_cksum_out(struct mbuf *m, stru
u_int16_t csum;
 
offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, );
-   if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
+   if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && ifp != NULL &&
+   ISSET(ifp->if_capabilities, IFCAP_TSOv6)) {
csum = in6_cksum_phdr(>ip6_src, >ip6_dst,
htonl(0), htonl(nxt));
} else {


