Re: pf: remove unused include files

2022-05-17 Thread Alexander Bluhm
On Tue, May 17, 2022 at 06:40:12PM +, Miod Vallat wrote:
> sys/net/pf.c r1.968 added a call to m_print() #ifdef DDB in a
> troublesome situation.
> 
> Once the root cause of the problem was fixed, the DDB-specific code path
> was removed in r1.970, but the added includes were kept, although
> nothing in pf.c depends on DDB anymore.

OK bluhm@

> Index: pf.c
> ===
> RCS file: /OpenBSD/src/sys/net/pf.c,v
> retrieving revision 1.1129
> diff -u -p -r1.1129 pf.c
> --- pf.c  5 May 2022 16:44:22 -   1.1129
> +++ pf.c  17 May 2022 18:38:34 -
> @@ -103,11 +103,6 @@
>  struct pfsync_deferral;
>  #endif /* NPFSYNC > 0 */
>  
> -#ifdef DDB
> -#include 
> -#include 
> -#endif
> -
>  /*
>   * Global variables
>   */



Re: have in6_pcbselsrc copy the selected ip to the caller instead of a reference

2022-05-15 Thread Alexander Bluhm
On Sun, May 15, 2022 at 08:19:47PM +1000, David Gwynne wrote:
> this is basically the same as what i did for in_pcbselsrc, and
> completely mechanical. im too tired to figure out if there's a smarter
> way to do it.
> 
> lightly tested, and more eyes are welcome because of the tiredness
> thing.

OK bluhm@
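
For illustration, a minimal sketch of the copy semantics (hypothetical
helper names, not the committed code): the callee fills caller-owned
storage while the data is still protected, so no pointer into an
ifaddr can outlive the thing it points to.

	int
	select_source(struct in6_addr *in6src)	/* caller-owned buffer */
	{
		struct in6_addr *internal;

		internal = lookup_source_internal();	/* assumed helper */
		if (internal == NULL)
			return (EADDRNOTAVAIL);
		*in6src = *internal;	/* copy, not a reference */
		return (0);
	}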

> Index: in6_pcb.c
> ===
> RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v
> retrieving revision 1.117
> diff -u -p -r1.117 in6_pcb.c
> --- in6_pcb.c 14 Apr 2022 14:10:22 -  1.117
> +++ in6_pcb.c 15 May 2022 09:53:53 -
> @@ -235,7 +235,7 @@ in6_pcbaddrisavail(struct inpcb *inp, st
>  int
>  in6_pcbconnect(struct inpcb *inp, struct mbuf *nam)
>  {
> - struct in6_addr *in6a = NULL;
> + struct in6_addr in6a;
>   struct sockaddr_in6 *sin6;
>   int error;
>   struct sockaddr_in6 tmp;
> @@ -273,7 +273,8 @@ in6_pcbconnect(struct inpcb *inp, struct
>   inp->inp_ipv6.ip6_hlim = (u_int8_t)in6_selecthlim(inp);
>  
>   if (in6_pcbhashlookup(inp->inp_table, &sin6->sin6_addr, sin6->sin6_port,
> - IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ? in6a : &inp->inp_laddr6,
> + IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ?
> +   &in6a : &inp->inp_laddr6,
>   inp->inp_lport, inp->inp_rtableid) != NULL) {
>   return (EADDRINUSE);
>   }
> @@ -286,13 +287,13 @@ in6_pcbconnect(struct inpcb *inp, struct
>   if (error)
>   return (error);
>   if (in6_pcbhashlookup(inp->inp_table, &sin6->sin6_addr,
> - sin6->sin6_port, in6a, inp->inp_lport,
> + sin6->sin6_port, &in6a, inp->inp_lport,
>   inp->inp_rtableid) != NULL) {
>   inp->inp_lport = 0;
>   return (EADDRINUSE);
>   }
>   }
> - inp->inp_laddr6 = *in6a;
> + inp->inp_laddr6 = in6a;
>   }
>   inp->inp_faddr6 = sin6->sin6_addr;
>   inp->inp_fport = sin6->sin6_port;
> Index: in6_src.c
> ===
> RCS file: /cvs/src/sys/netinet6/in6_src.c,v
> retrieving revision 1.86
> diff -u -p -r1.86 in6_src.c
> --- in6_src.c 22 Feb 2022 01:15:02 -  1.86
> +++ in6_src.c 15 May 2022 09:53:53 -
> @@ -91,7 +91,7 @@ int in6_selectif(struct sockaddr_in6 *, 
>   * the values set at pcb level can be overridden via cmsg.
>   */
>  int
> -in6_pcbselsrc(struct in6_addr **in6src, struct sockaddr_in6 *dstsock,
> +in6_pcbselsrc(struct in6_addr *in6src, struct sockaddr_in6 *dstsock,
>  struct inpcb *inp, struct ip6_pktopts *opts)
>  {
>   struct ip6_moptions *mopts = inp->inp_moptions6;
> @@ -138,7 +138,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>  
>   pi->ipi6_addr = sa6.sin6_addr; /* XXX: this overrides pi */
>  
> - *in6src = &pi->ipi6_addr;
> + *in6src = pi->ipi6_addr;
>   return (0);
>   }
>  
> @@ -147,7 +147,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>* is already bound, use the bound address.
>*/
>   if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) {
> - *in6src = laddr;
> + *in6src = *laddr;
>   return (0);
>   }
>  
> @@ -167,7 +167,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>   if (ia6 == NULL)
>   return (EADDRNOTAVAIL);
>  
> - *in6src = &ia6->ia_addr.sin6_addr;
> + *in6src = ia6->ia_addr.sin6_addr;
>   return (0);
>   }
>  
> @@ -229,7 +229,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>   struct ifaddr *ifa;
>   if ((ifa = ifa_ifwithaddr(ip6_source, rtableid)) !=
>   NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) {
> - *in6src = &satosin6(ip6_source)->sin6_addr;
> + *in6src = satosin6(ip6_source)->sin6_addr;
>   return (0);
>   }
>   }
> @@ -238,7 +238,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>   if (ia6 == NULL)
>   return (EHOSTUNREACH);  /* no route */
>  
> - *in6src = &ia6->ia_addr.sin6_addr;
> + *in6src = ia6->ia_addr.sin6_addr;
>   return (0);
>  }
>  
> @@ -249,7 +249,7 @@ in6_pcbselsrc(struct in6_addr **in6src, 
>   * an entry to the caller for later use.
>   */
>  int
> -in6_selectsrc(struct in6_addr **in6src, struct sockaddr_in6 *dstsock,
> +in6_selectsrc(struct in6_addr *in6src, struct sockaddr_in6 *dstsock,
>  struct ip6_moptions *mopts, unsigned int rtableid)
>  {
>   struct ifnet *ifp = NULL;
> @@ -279,7 +279,7 @@ in6_selectsrc(struct in6_addr **in6src, 
>   if (ia6 == NULL)
>   return (EADDRNOTAVAIL);
>  
> - *in6src = &ia6->ia_addr.sin6_addr;
> + *in6src = ia6->ia_addr.sin6_addr;
>   return (0);
> 

OpenBSD Errata: May 16, 2022 (kqueue asn1 pppoe)

2022-05-15 Thread Alexander Bluhm
Errata patches for kqueue in the kernel and asn1 in libcrypto have 
been released for OpenBSD 7.1.  Errata patches for pppoe in the 
kernel have been released for OpenBSD 7.1 and 7.0.

Binary updates for the amd64, i386 and arm64 platforms are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata71.html
  https://www.openbsd.org/errata70.html



Re: Lock kernel in nfsrv_rcv()

2022-05-13 Thread Alexander Bluhm
On Fri, May 13, 2022 at 04:05:15PM +, Visa Hankala wrote:
> The NFS subsystem is not MP-safe yet. Take this into account
> in the NFS server socket upcall by locking the kernel.
> 
> This might help with the NFS server hanging that was seen recently
> as a result of the now-reverted selwakeup()-to-KNOTE() conversion.
> Unfortunately, I have not been able to confirm this myself.
> 
> OK?

OK bluhm@

> Index: nfs/nfs_socket.c
> ===
> RCS file: src/sys/nfs/nfs_socket.c,v
> retrieving revision 1.140
> diff -u -p -r1.140 nfs_socket.c
> --- nfs/nfs_socket.c  17 Mar 2022 14:23:34 -  1.140
> +++ nfs/nfs_socket.c  13 May 2022 15:38:48 -
> @@ -1561,8 +1561,10 @@ nfsrv_rcv(struct socket *so, caddr_t arg
>   struct uio auio;
>   int flags, error;
>  
> + KERNEL_LOCK();
> +
>   if ((slp->ns_flag & SLP_VALID) == 0)
> - return;
> + goto out;
>  
>   /* Defer soreceive() to an nfsd. */
>   if (waitflag == M_DONTWAIT) {
> @@ -1644,6 +1646,9 @@ dorecs:
>   if (waitflag == M_DONTWAIT &&
>   (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
>   nfsrv_wakenfsd(slp);
> +
> +out:
> + KERNEL_UNLOCK();
>  }
>  
>  /*



Re: Unlock umask(2)

2022-05-12 Thread Alexander Bluhm
On Wed, May 11, 2022 at 11:20:15AM +0300, Vitaliy Makkoveev wrote:
> sys_umask() only modifies `fd_cmask', which modification is already
> protected by `fd_lock' rwlock(9).

I did run full regress on amd64.

OK bluhm@

> Index: sys/kern/syscalls.master
> ===
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.223
> diff -u -p -r1.223 syscalls.master
> --- sys/kern/syscalls.master  24 Feb 2022 07:41:51 -  1.223
> +++ sys/kern/syscalls.master  11 May 2022 08:14:59 -
> @@ -146,7 +146,7 @@
>   char *buf, size_t count); }
>  59   STD { int sys_execve(const char *path, \
>   char * const *argp, char * const *envp); }
> -60   STD { mode_t sys_umask(mode_t newmask); }
> +60   STD NOLOCK  { mode_t sys_umask(mode_t newmask); }
>  61   STD { int sys_chroot(const char *path); }
>  62   STD { int sys_getfsstat(struct statfs *buf, size_t bufsize, \
>   int flags); }



Re: Unlock umask(2)

2022-05-12 Thread Alexander Bluhm
On Wed, May 11, 2022 at 11:41:18PM +0300, Vitaliy Makkoveev wrote:
> > Both of the resizes should happen first, in case there is fallout in
> > userland which isn't visible yet.
> 
> No problem. 

OK bluhm@

> Index: sys/sys/filedesc.h
> ===
> RCS file: /cvs/src/sys/sys/filedesc.h,v
> retrieving revision 1.45
> diff -u -p -r1.45 filedesc.h
> --- sys/sys/filedesc.h4 Jul 2020 08:06:08 -   1.45
> +++ sys/sys/filedesc.h11 May 2022 20:14:48 -
> @@ -79,8 +79,8 @@ struct filedesc {
>   u_int   *fd_lomap;  /* [f] bitmap of free fds */
>   int fd_lastfile;/* [f] high-water mark of fd_ofiles */
>   int fd_freefile;/* [f] approx. next free file */
> - u_short fd_cmask;   /* [f/w] mask for file creation */
> - u_short fd_refcnt;  /* [K] reference count */
> + mode_t  fd_cmask;   /* [f/w] mask for file creation */
> + u_int   fd_refcnt;  /* [K] reference count */
>   struct rwlock fd_lock;  /* lock for the file descs */
>   struct mutex fd_fplock; /* lock for reading fd_ofiles without
>* fd_lock */



Re: Unlock umask(2)

2022-05-11 Thread Alexander Bluhm
On Wed, May 11, 2022 at 11:20:15AM +0300, Vitaliy Makkoveev wrote:
> sys_umask() only modifies `fd_cmask', which modification is already
> protected by `fd_lock' rwlock(9).

I found this in sys/filedesc.h

u_short fd_cmask;   /* [f/w] mask for file creation */
u_short fd_refcnt;  /* [K] reference count */

We have two short variables that are protected by different locks.
I think 16 bit values cannot be modified independently on all
architectures under MP.

When one CPU modifies the lower 16 bit and another CPU writes to
the higher 16 bit the result in the full 32 bit is not defined.
This is at least my understanding.

I have seen problems in real life with two shorts when one 16 bit
part was changed without spl protection and the other 16 bits were
written by interrupt.

Should we convert them to u_int?
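
A minimal illustration of the concern (simplified struct, not the
kernel code): both 16 bit fields share one aligned 32 bit word, so an
architecture without independent 16 bit stores must read-modify-write
the whole word.

	struct half_words {
		u_short	cmask;	/* protected by lock A */
		u_short	refcnt;	/* protected by lock B */
	};
	/* sizeof(struct half_words) == 4: both halves live in one
	 * 32 bit word, storing one half may rewrite the other */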

bluhm

> Index: sys/kern/syscalls.master
> ===
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.223
> diff -u -p -r1.223 syscalls.master
> --- sys/kern/syscalls.master  24 Feb 2022 07:41:51 -  1.223
> +++ sys/kern/syscalls.master  11 May 2022 08:14:59 -
> @@ -146,7 +146,7 @@
>   char *buf, size_t count); }
>  59   STD { int sys_execve(const char *path, \
>   char * const *argp, char * const *envp); }
> -60   STD { mode_t sys_umask(mode_t newmask); }
> +60   STD NOLOCK  { mode_t sys_umask(mode_t newmask); }
>  61   STD { int sys_chroot(const char *path); }
>  62   STD { int sys_getfsstat(struct statfs *buf, size_t bufsize, \
>   int flags); }



Re: [External] : Re: move memory allocation in pfr_add_tables() outside of NET_LOCK()/PF_LOCK()

2022-05-10 Thread Alexander Bluhm
On Tue, May 10, 2022 at 12:40:39AM +0200, Alexandr Nedvedicky wrote:
> updated diff is below.

Although this makes the code more complex, I see no simple solution.
We must get the sleeps out of the pf lock.  As seen from the mail
on bugs@, sleeping in pf is not a good thing.  Also mbuhl@ is hunting
some sleeping bugs with copyin and copyout from syzkaller.  So this
is the right direction.

OK bluhm@

> 8<---8<---8<--8<
> diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
> index 8315b115474..1f036e1368f 100644
> --- a/sys/net/pf_ioctl.c
> +++ b/sys/net/pf_ioctl.c
> @@ -2217,12 +2217,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
>   error = ENODEV;
>   goto fail;
>   }
> - NET_LOCK();
> - PF_LOCK();
>   error = pfr_add_tables(io->pfrio_buffer, io->pfrio_size,
>   &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
> - PF_UNLOCK();
> - NET_UNLOCK();
>   break;
>   }
>  
> diff --git a/sys/net/pf_table.c b/sys/net/pf_table.c
> index fb23bcabe04..d0e42ca62ba 100644
> --- a/sys/net/pf_table.c
> +++ b/sys/net/pf_table.c
> @@ -189,6 +189,7 @@ void   pfr_clstats_ktable(struct pfr_ktable *, time_t, int);
>  struct pfr_ktable*pfr_create_ktable(struct pfr_table *, time_t, int,
>   int);
>  void  pfr_destroy_ktables(struct pfr_ktableworkq *, int);
> +void  pfr_destroy_ktables_aux(struct pfr_ktableworkq *);
>  void  pfr_destroy_ktable(struct pfr_ktable *, int);
>  int   pfr_ktable_compare(struct pfr_ktable *,
>   struct pfr_ktable *);
> @@ -1493,14 +1494,16 @@ pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags)
>  int
>  pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
>  {
> - struct pfr_ktableworkq   addq, changeq;
> - struct pfr_ktable   *p, *q, *r, key;
> + struct pfr_ktableworkq   addq, changeq, auxq;
> + struct pfr_ktable   *p, *q, *r, *n, *w, key;
>   int  i, rv, xadd = 0;
>   time_t   tzero = gettime();
>  
>   ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
>   SLIST_INIT(&addq);
>   SLIST_INIT(&changeq);
> + SLIST_INIT(&auxq);
> + /* pre-allocate all memory outside of locks */
>   for (i = 0; i < size; i++) {
>   YIELD(flags & PFR_FLAG_USERIOCTL);
>   if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t), flags))
> @@ -1509,65 +1512,149 @@ pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
>   flags & PFR_FLAG_USERIOCTL))
>   senderr(EINVAL);
>   key.pfrkt_flags |= PFR_TFLAG_ACTIVE;
> - p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
> + p = pfr_create_ktable(&key.pfrkt_t, tzero, 0,
> + !(flags & PFR_FLAG_USERIOCTL));
> + if (p == NULL)
> + senderr(ENOMEM);
> +
> + /*
> +  * Note: we also pre-allocate a root table here. We keep it
> +  * at ->pfrkt_root, which we must not forget about.
> +  */
> + key.pfrkt_flags = 0;
> + memset(key.pfrkt_anchor, 0, sizeof(key.pfrkt_anchor));
> + p->pfrkt_root = pfr_create_ktable(&key.pfrkt_t, 0, 0,
> + !(flags & PFR_FLAG_USERIOCTL));
> + if (p->pfrkt_root == NULL) {
> + pfr_destroy_ktable(p, 0);
> + senderr(ENOMEM);
> + }
> +
> + SLIST_FOREACH(q, &auxq, pfrkt_workq) {
> + if (!pfr_ktable_compare(p, q)) {
> + /*
> +  * We need no lock here, because `p` is empty,
> +  * there are no rules or shadow tables
> +  * attached.
> +  */
> + pfr_destroy_ktable(p->pfrkt_root, 0);
> + p->pfrkt_root = NULL;
> + pfr_destroy_ktable(p, 0);
> + p = NULL;
> + break;
> + }
> + }
> + if (q != NULL)
> + continue;
> +
> + SLIST_INSERT_HEAD(&auxq, p, pfrkt_workq);
> + }
> +
> + /*
> +  * auxq contains freshly allocated tables with no dups.
> +  * also note there are no rulesets attached, because
> +  * the attach operation requires PF_LOCK().
> +  */
> + NET_LOCK();
> + PF_LOCK();
> + SLIST_FOREACH_SAFE(n, &auxq, pfrkt_workq, w) {
> + p = RB_FIND(pfr_ktablehead, &pfr_ktables, n);
>   if (p == NULL) {
> - p = pfr_create_ktable(&key.pfrkt_t, tzero, 1,
> - !(flags & PFR_FLAG_USERIOCTL));
> -  

Re: [External] : Re: move memory allocation in pfr_add_tables() outside of NET_LOCK()/PF_LOCK()

2022-05-09 Thread Alexander Bluhm
On Mon, May 09, 2022 at 11:11:03PM +0200, Alexandr Nedvedicky wrote:
> > ... and then we insert a destroyed p
> 
> yes. you are right. new diff addresses that with change as follows:
> 
> @@ -1542,9 +1542,8 @@ pfr_add_tables(struct pfr_table ...)
>   pfr_destroy_ktable(p, 0);
>   break;
>   }
> +   SLIST_INSERT_HEAD(&auxq, p, pfrkt_workq);
This inserts p each time you run over q list.
>   }
> -
> -   SLIST_INSERT_HEAD(&auxq, p, pfrkt_workq);
>   }

Should we do it like this?  It is similar to your solution at the
other loop.

SLIST_FOREACH(q, &auxq, pfrkt_workq) {
if (!pfr_ktable_compare(p, q)) {
/*
 * We need no lock here, because `p` is empty,
 * there are no rules or shadow tables
 * attached.
 */
pfr_destroy_ktable(p->pfrkt_root, 0);
p->pfrkt_root = NULL;
pfr_destroy_ktable(p, 0);
break;
}
}
if (q != NULL)
continue;
SLIST_INSERT_HEAD(&auxq, p, pfrkt_workq);


> > I compared the old and new code to see if it is equivalent.
> > Before the condtion looked like this.
> 
> very good point. I think this what needs to be done:
> 
> @@ -1558,7 +1558,8 @@ pfr_add_tables(struct pfr_table *tbl, ...)
>   if (p == NULL) {
>   SLIST_REMOVE(&auxq, n, pfr_ktable, pfrkt_workq);
>   SLIST_INSERT_HEAD(&addq, n, pfrkt_workq);
> -   } else if (!(flags & PFR_FLAG_DUMMY)) {
> +   } else if (!(flags & PFR_FLAG_DUMMY) &&
I guess PFR_FLAG_DUMMY check is an optimization.
> +   !(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
Indent should be one tab to the left.
>   p->pfrkt_nflags = (p->pfrkt_flags &
>   ~PFR_TFLAG_USRMASK) | key.pfrkt_flags;
>   SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq);

Old code had this to avoid duplicate entries.  Do we need it?
SLIST_FOREACH(q, &changeq, pfrkt_workq)
if (!pfr_ktable_compare(&key, q))
goto _skip;

> > This continue goes to the r list, but I think you want to continue p list.
> > >   }
> > >   }
> 
> yes, exactly. we want to continue with outer loop if we break from inner
> one. this is what I want to do:
> 
> @@ -1617,9 +1618,12 @@ pfr_add_tables(struct pfr_table *tbl, ...)
>   p->pfrkt_root = r;
>   SLIST_INSERT_HEAD(, q,
>   pfrkt_workq);
> -   continue;
> +   break;
>   }
>   }
> +   if (r != SLIST_END())
Could you use if (r != NULL)?  No one uses the SLIST_END macro.
> +   continue;
> +

bluhm



Re: [External] : Re: pf.conf(5) clarify ICMP sloppy state handling

2022-05-09 Thread Alexander Bluhm
On Mon, May 09, 2022 at 10:08:24PM +0100, Stuart Henderson wrote:
> This is helpful, but because it's so surprising that "pass proto icmp"
> doesn't pass all icmp traffic, I think it would help to mention it where
> "proto icmp" is described too.
> 
> Also, the top of the text about "sloppy" just talks about the sloppy
> TCP connection tracker, I think perhaps it would be better to lead
> with something that suggests it has multiple functions for different
> protocols?

OK bluhm@

> Index: man5/pf.conf.5
> ===
> RCS file: /cvs/src/share/man/man5/pf.conf.5,v
> retrieving revision 1.594
> diff -u -p -r1.594 pf.conf.5
> --- man5/pf.conf.59 May 2022 20:29:23 -   1.594
> +++ man5/pf.conf.59 May 2022 21:05:48 -
> @@ -594,6 +594,13 @@ or
>  .Pc
>  must match.
>  .Pp
> +ICMP responses are not permitted unless they either match an
> +existing request, or unless
> +.Cm no state
> +or
> +.Cm keep state (sloppy)
> +is specified.
> +.Pp
>  .It Cm label Ar string
>  Adds a label to the rule, which can be used to identify the rule.
>  For instance,
> @@ -2177,7 +2184,7 @@ States created by this rule are exported
>  .Xr pflow 4
>  interface.
>  .It Cm sloppy
> -Uses a sloppy TCP connection tracker that does not check sequence
> +For TCP, uses a sloppy connection tracker that does not check sequence
>  numbers at all, which makes insertion and ICMP teardown attacks way
>  easier.
>  This is intended to be used in situations where one does not see all
> @@ -2186,7 +2193,8 @@ It cannot be used with
>  .Cm modulate state
>  or
>  .Cm synproxy state .
> -With this option ICMP replies can create states.
> +For ICMP, this option allows states to be created from replies,
> +not just requests.
>  .It Ar timeout seconds
>  Changes the
>  .Ar timeout



Re: move memory allocation in pfr_add_tables() outside of NET_LOCK()/PF_LOCK()

2022-05-09 Thread Alexander Bluhm
On Sun, May 08, 2022 at 06:15:49PM +0200, Alexandr Nedvedicky wrote:
> OK ?

3 comments inline

> 8<---8<---8<--8<
> diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
> index 8315b115474..1f036e1368f 100644
> --- a/sys/net/pf_ioctl.c
> +++ b/sys/net/pf_ioctl.c
> @@ -2217,12 +2217,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
>   error = ENODEV;
>   goto fail;
>   }
> - NET_LOCK();
> - PF_LOCK();
>   error = pfr_add_tables(io->pfrio_buffer, io->pfrio_size,
>   &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
> - PF_UNLOCK();
> - NET_UNLOCK();
>   break;
>   }
>  
> diff --git a/sys/net/pf_table.c b/sys/net/pf_table.c
> index fb23bcabe04..79fd3e0447b 100644
> --- a/sys/net/pf_table.c
> +++ b/sys/net/pf_table.c
> @@ -189,6 +189,7 @@ void   pfr_clstats_ktable(struct pfr_ktable *, time_t, int);
>  struct pfr_ktable*pfr_create_ktable(struct pfr_table *, time_t, int,
>   int);
>  void  pfr_destroy_ktables(struct pfr_ktableworkq *, int);
> +void  pfr_destroy_ktables_aux(struct pfr_ktableworkq *);
>  void  pfr_destroy_ktable(struct pfr_ktable *, int);
>  int   pfr_ktable_compare(struct pfr_ktable *,
>   struct pfr_ktable *);
> @@ -1493,14 +1494,16 @@ pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags)
>  int
>  pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
>  {
> - struct pfr_ktableworkq   addq, changeq;
> - struct pfr_ktable   *p, *q, *r, key;
> + struct pfr_ktableworkq   addq, changeq, auxq;
> + struct pfr_ktable   *p, *q, *r, *n, *w, key;
>   int  i, rv, xadd = 0;
>   time_t   tzero = gettime();
>  
>   ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
>   SLIST_INIT(&addq);
>   SLIST_INIT(&changeq);
> + SLIST_INIT(&auxq);
> + /* pre-allocate all memory outside of locks */
>   for (i = 0; i < size; i++) {
>   YIELD(flags & PFR_FLAG_USERIOCTL);
>   if (COPYIN(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t), flags))
> @@ -1509,65 +1512,142 @@ pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
>   flags & PFR_FLAG_USERIOCTL))
>   senderr(EINVAL);
>   key.pfrkt_flags |= PFR_TFLAG_ACTIVE;
> - p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
> + p = pfr_create_ktable(&key.pfrkt_t, tzero, 0,
> + !(flags & PFR_FLAG_USERIOCTL));
> + if (p == NULL)
> + senderr(ENOMEM);
> +
> + /*
> +  * Note: we also pre-allocate a root table here. We keep it
> +  * at ->pfrkt_root, which we must not forget about.
> +  */
> + key.pfrkt_flags = 0;
> + memset(key.pfrkt_anchor, 0, sizeof(key.pfrkt_anchor));
> + p->pfrkt_root = pfr_create_ktable(&key.pfrkt_t, 0, 0,
> + !(flags & PFR_FLAG_USERIOCTL));
> + if (p->pfrkt_root == NULL) {
> + pfr_destroy_ktable(p, 0);
> + senderr(ENOMEM);
> + }
> +
> + SLIST_FOREACH(q, &auxq, pfrkt_workq) {
> + if (!pfr_ktable_compare(p, q)) {
> + /*
> +  * We need no lock here, because `p` is empty,
> +  * there are no rules or shadow tables
> +  * attached.
> +  */
> + pfr_destroy_ktable(p->pfrkt_root, 0);
> + p->pfrkt_root = NULL;
> + pfr_destroy_ktable(p, 0);
> + break;
This break...
> + }
> + }
... end here
> +
> 
... and then we insert a destroyed p
> + SLIST_INSERT_HEAD(&auxq, p, pfrkt_workq);
> + }
> +
> + /*
> +  * auxq contains freshly allocated tables with no dups.
> +  * also note there are no rulesets attached, because
> +  * the attach operation requires PF_LOCK().
> +  */
> + NET_LOCK();
> + PF_LOCK();
> + SLIST_FOREACH_SAFE(n, &auxq, pfrkt_workq, w) {
> + p = RB_FIND(pfr_ktablehead, &pfr_ktables, n);
>   if (p == NULL) {
> - p = pfr_create_ktable(&key.pfrkt_t, tzero, 1,
> - !(flags & PFR_FLAG_USERIOCTL));
> - if (p == NULL)
> - senderr(ENOMEM);
> - SLIST_FOREACH(q, &addq, pfrkt_workq) {
> - if (!pfr_ktable_compare(p, q)) {
> - pfr_destroy_ktable(p, 0);
> - goto _skip;
> - 

Re: [External] : net lock priority

2022-05-09 Thread Alexander Bluhm
On Sun, May 08, 2022 at 10:54:01PM +0200, Alexandr Nedvedicky wrote:
> what bothers me is the situation where there are
> more than one reader. The line 350 is executed by
> the first reader which drops the lock. So the process
> woken up by wakeup(rwl) are going to find out the
> lock is still occupied by remaining readers.

wakeup() activates all sleepers.  They should check and sleep again.
Maybe a little bit of wasted resources, but I don't see a severe
problem.
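
A simplified sketch of that recheck-and-sleep behavior (assumed
predicate, not the actual kern_rwlock.c code):

	while (lock_still_held(rwl))	/* assumed helper */
		tsleep_nsec(rwl, PLOCK, "rwlock", INFSLP);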

I did a little digging in history.  In rev 1.3 of kern_rwlock.c
we had case RW_READ: need_wait = RWLOCK_WRLOCK | RWLOCK_WRWANT;
and the comment "Let writers through before obtaining read lock."

The comment
 * RW_READ  RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment
 *  with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting.
is still there, just the RWLOCK_WRWANT got lost from the condition.

So I think we should get back the original reader writer priority.

Regarding the race in rw_do_exit() that sashan@ found I also found
a comment in rev 1.7.

/*
 * Potential MP race here. If the owner had WRWANT set we cleared
 * it and a reader can sneak in before a writer. Do we care?
 */

I do not want to change anything to that behavior now.  There is
no easy fix and I did not see the problem during testing.  But we
can put the comment back to clarify the situation.

ok?

bluhm

Index: kern/kern_rwlock.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_rwlock.c
--- kern/kern_rwlock.c  8 Feb 2021 08:18:45 -   1.47
+++ kern/kern_rwlock.c  9 May 2022 07:36:35 -
@@ -81,7 +81,7 @@ static const struct rwlock_op {
},
{   /* RW_READ */
RWLOCK_READ_INCR,
-   RWLOCK_WRLOCK,
+   RWLOCK_WRLOCK | RWLOCK_WRWANT,
RWLOCK_WAIT,
0,
PLOCK
@@ -103,7 +103,7 @@ rw_enter_read(struct rwlock *rwl)
 {
unsigned long owner = rwl->rwl_owner;
 
-   if (__predict_false((owner & RWLOCK_WRLOCK) ||
+   if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
rw_enter(rwl, RW_READ);
else {
@@ -340,6 +340,11 @@ rw_do_exit(struct rwlock *rwl, unsigned 
 
do {
owner = rwl->rwl_owner;
+   /*
+* Potential MP race here.  If the owner had WRWANT set we
+* cleared it and a reader can sneak in before a writer.
+* Do we care?
+*/
if (wrlock)
set = 0;
else



Re: [External] : net lock priority

2022-05-08 Thread Alexander Bluhm
On Sun, May 08, 2022 at 09:19:23PM +0200, Alexander Bluhm wrote:
> I will run my tests with the diff below.

With the third chunk, reboot hangs in the vmmaplk while reordering
libraries.  So this needs more thought.

> Index: kern/kern_rwlock.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v
> retrieving revision 1.47
> diff -u -p -r1.47 kern_rwlock.c
> --- kern/kern_rwlock.c8 Feb 2021 08:18:45 -   1.47
> +++ kern/kern_rwlock.c8 May 2022 18:55:52 -
> @@ -81,7 +81,7 @@ static const struct rwlock_op {
>   },
>   {   /* RW_READ */
>   RWLOCK_READ_INCR,
> - RWLOCK_WRLOCK,
> + RWLOCK_WRLOCK | RWLOCK_WRWANT,
>   RWLOCK_WAIT,
>   0,
>   PLOCK
> @@ -103,7 +103,7 @@ rw_enter_read(struct rwlock *rwl)
>  {
>   unsigned long owner = rwl->rwl_owner;
>  
> - if (__predict_false((owner & RWLOCK_WRLOCK) ||
> + if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
>   rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
>   rw_enter(rwl, RW_READ);
>   else {
> @@ -343,8 +343,7 @@ rw_do_exit(struct rwlock *rwl, unsigned 
>   if (wrlock)
>   set = 0;
>   else
> - set = (owner - RWLOCK_READ_INCR) &
> - ~(RWLOCK_WAIT|RWLOCK_WRWANT);
> + set = (owner - RWLOCK_READ_INCR) & ~RWLOCK_WAIT;
>   } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
>  
>   if (owner & RWLOCK_WAIT)



Re: [External] : Re: pf.conf(5) clarify ICMP sloppy state handling

2022-05-08 Thread Alexander Bluhm
On Sun, May 08, 2022 at 09:58:47PM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> On Sun, May 08, 2022 at 08:06:57PM +0200, Alexander Bluhm wrote:
> > On Sun, May 08, 2022 at 06:37:57PM +0200, Alexandr Nedvedicky wrote:
> > > this tiny update to pf.conf(5) has been prompted here [1] on
> > > pf mailing list. By default only ICMP queries are allowed
> > > to create state in pf(4). The sloppy option relaxes that
> > > so also ICMP replies can create a state. I think this should
> > > be also mentioned in pf.conf(5)
> > >
> > > OK to my suggestion below?
> > 
> > I would make it a bit shorter.  pf.conf(5) is very long already.
> > 
> > With this option ICMP replies can create states.
> > 
> > Does this describe everything?
> 
> yes, it does. I Like it. Updated diff below.

OK bluhm@

> 8<---8<---8<--8<
> diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
> index fe4b117994a..e4af2a37c5e 100644
> --- a/share/man/man5/pf.conf.5
> +++ b/share/man/man5/pf.conf.5
> @@ -2186,6 +2186,7 @@ It cannot be used with
>  .Cm modulate state
>  or
>  .Cm synproxy state .
> +With this option ICMP replies can create states.
>  .It Ar timeout seconds
>  Changes the
>  .Ar timeout



Re: [External] : net lock priority

2022-05-08 Thread Alexander Bluhm
On Sun, May 08, 2022 at 07:55:44PM +0200, Alexandr Nedvedicky wrote:
> my question is why do we reset RWLOCK_WAIT and RWLOCK_WRWANT flags?

This is a very good question.

> I think those flags should be reset the last reader gone. Perhaps
> the else branch for reader requires this:
> 
>   else {
>   set = (owner - RWLOCK_READ_INCR) &
>   ~(RWLOCK_WAIT|RWLOCK_WRWANT)
>   if (set != 0)
>   set |= (owner & RWLOCK_MASK);
>   }

Why should a reader change RWLOCK_WRWANT at all?  The writer sets
and clears it.  This code was moved to this place in rev 1.8.

Before rw_exit_read() had this comment:

/*
 * Potential MP race here. If the owner had WRWANT set we cleared
 * it and a reader can sneak in before a writer. Do we care?
 */

I will run my tests with the diff below.

bluhm

Index: kern/kern_rwlock.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_rwlock.c
--- kern/kern_rwlock.c  8 Feb 2021 08:18:45 -   1.47
+++ kern/kern_rwlock.c  8 May 2022 18:55:52 -
@@ -81,7 +81,7 @@ static const struct rwlock_op {
},
{   /* RW_READ */
RWLOCK_READ_INCR,
-   RWLOCK_WRLOCK,
+   RWLOCK_WRLOCK | RWLOCK_WRWANT,
RWLOCK_WAIT,
0,
PLOCK
@@ -103,7 +103,7 @@ rw_enter_read(struct rwlock *rwl)
 {
unsigned long owner = rwl->rwl_owner;
 
-   if (__predict_false((owner & RWLOCK_WRLOCK) ||
+   if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
rw_enter(rwl, RW_READ);
else {
@@ -343,8 +343,7 @@ rw_do_exit(struct rwlock *rwl, unsigned 
if (wrlock)
set = 0;
else
-   set = (owner - RWLOCK_READ_INCR) &
-   ~(RWLOCK_WAIT|RWLOCK_WRWANT);
+   set = (owner - RWLOCK_READ_INCR) & ~RWLOCK_WAIT;
} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
 
if (owner & RWLOCK_WAIT)



Re: pf.conf(5) clarify ICMP sloppy state handling

2022-05-08 Thread Alexander Bluhm
On Sun, May 08, 2022 at 06:37:57PM +0200, Alexandr Nedvedicky wrote:
> this tiny update to pf.conf(5) has been prompted here [1] on
> pf mailing list. By default only ICMP queries are allowed
> to create state in pf(4). The sloppy option relaxes that
> so also ICMP replies can create a state. I think this should
> be also mentioned in pf.conf(5)
> 
> OK to my suggestion below?

I would make it a bit shorter.  pf.conf(5) is very long already.

With this option ICMP replies can create states.

Does this describe everything?

> 8<---8<---8<--8<
> diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
> index fe4b117994a..7389d231fe2 100644
> --- a/share/man/man5/pf.conf.5
> +++ b/share/man/man5/pf.conf.5
> @@ -2186,6 +2186,9 @@ It cannot be used with
>  .Cm modulate state
>  or
>  .Cm synproxy state .
> +The option also relaxes handling of ICMP such that also ICMP replies
> +are allowed to create state.
> +By default ICMP queries only are allowed to create state.
>  .It Ar timeout seconds
>  Changes the
>  .Ar timeout



Re: divert packet kernel lock

2022-05-06 Thread Alexander Bluhm
On Fri, May 06, 2022 at 10:16:35PM +0200, Mark Kettenis wrote:
> > Date: Fri, 6 May 2022 14:48:59 +0200
> > From: Alexander Bluhm 
> > 
> > On Thu, May 05, 2022 at 11:10:54PM +0200, Mark Kettenis wrote:
> > > > Date: Thu, 5 May 2022 22:41:01 +0200
> > > > From: Alexander Bluhm 
> > > >
> > > > Hi,
> > > >
> > > > The easiest way to make divert_packet() MP safe for now, is to
> > > > protect sbappendaddr() with kernel lock.
> > >
> > > All other invocations of sbappendaddr() run with the kernel lock held?
> > 
> > No.  Only this place takes kernel lock.
> > 
> > > If so, maybe that should be asserted inside sbappendaddr()?
> > 
> > This is only a temporary hack.  The clean solution would be a socket
> > mutex.  I have marked it with XXXSMP.  Maybe this place is a
> > good start to implement and test such a lock.
> > 
> > > If not, I don't understand how this would help...
> > 
> > All other places call sbappendaddr() with exclusive net lock.
> > divert_packet() holds the shared net lock, so it cannot run in
> > parallel with the other callers.
> > 
> > What is left is protection between multiple divert_packet() running
> > and calling sbappendaddr().  For that kernel lock helps.
> > 
> > Of course that is a dirty hack.  But we have a race in the commited
> > codebase that I want to plug quickly.  A proper solution needs more
> > thought.
> 
> Ouch.  I suppose using the kernel lock here makes sense since you're
> going to take it after the call anyway.
> 
> Maybe change the comment to state that in other places sbappendaddr()
> is always called with an exclusive net lock and therefore can't run
> while we're holding a shared net lock?

Is this better to understand?

Index: netinet/ip_divert.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.67
diff -u -p -r1.67 ip_divert.c
--- netinet/ip_divert.c 5 May 2022 16:44:22 -   1.67
+++ netinet/ip_divert.c 6 May 2022 20:45:43 -
@@ -222,11 +222,19 @@ divert_packet(struct mbuf *m, int dir, u
}
 
so = inp->inp_socket;
+   /*
+* XXXSMP sbappendaddr() is not MP safe and this function is called
+* from pf with shared netlock.  To call only one sbappendaddr() from
+* divert_packet(), protect it with kernel lock.  All other places
+* call sbappendaddr() with exclusive net lock.  This blocks
+* divert_packet() as we have the shared lock.
+*/
+   KERNEL_LOCK();
if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
+   KERNEL_UNLOCK();
divstat_inc(divs_fullsock);
goto bad;
}
-   KERNEL_LOCK();
sorwakeup(inp->inp_socket);
KERNEL_UNLOCK();
 
Index: netinet6/ip6_divert.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v
retrieving revision 1.66
diff -u -p -r1.66 ip6_divert.c
--- netinet6/ip6_divert.c   5 May 2022 16:44:22 -   1.66
+++ netinet6/ip6_divert.c   6 May 2022 20:45:11 -
@@ -228,11 +228,19 @@ divert6_packet(struct mbuf *m, int dir, 
}
 
so = inp->inp_socket;
+   /*
+* XXXSMP sbappendaddr() is not MP safe and this function is called
+* from pf with shared netlock.  To call only one sbappendaddr() from
+* divert_packet(), protect it with kernel lock.  All other places
+* call sbappendaddr() with exclusive net lock.  This blocks
+* divert_packet() as we have the shared lock.
+*/
+   KERNEL_LOCK();
if (sbappendaddr(so, &so->so_rcv, sin6tosa(&sin6), m, NULL) == 0) {
+   KERNEL_UNLOCK();
div6stat_inc(div6s_fullsock);
goto bad;
}
-   KERNEL_LOCK();
sorwakeup(inp->inp_socket);
KERNEL_UNLOCK();
 



net lock priority

2022-05-06 Thread Alexander Bluhm
Hi,

When creating network load by forwarding packets, SSH gets unusable
and ping time gets above 10 seconds.

Problem is that while multiple forwarding threads are running with
shared net lock, the exclusive lock cannot be acquired.  This is
unfair.

Diff below prevents that a read lock is granted when another thread
is waiting for the exclusive lock.  With that ping time stays under
300 ms.
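
In code terms the diff below boils down to this admission check for
readers (simplified sketch, not the committed function):

	int
	reader_may_enter(unsigned long owner)
	{
		/* back off while a writer holds the lock or waits for it */
		return ((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) == 0);
	}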

Does this read write lock prio change make sense?

bluhm

Index: kern/kern_rwlock.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.47
diff -u -p -r1.47 kern_rwlock.c
--- kern/kern_rwlock.c  8 Feb 2021 08:18:45 -   1.47
+++ kern/kern_rwlock.c  6 May 2022 12:08:01 -
@@ -81,7 +81,7 @@ static const struct rwlock_op {
},
{   /* RW_READ */
RWLOCK_READ_INCR,
-   RWLOCK_WRLOCK,
+   RWLOCK_WRLOCK | RWLOCK_WRWANT,
RWLOCK_WAIT,
0,
PLOCK
@@ -103,7 +103,7 @@ rw_enter_read(struct rwlock *rwl)
 {
unsigned long owner = rwl->rwl_owner;
 
-   if (__predict_false((owner & RWLOCK_WRLOCK) ||
+   if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
rw_cas(>rwl_owner, owner, owner + RWLOCK_READ_INCR)))
rw_enter(rwl, RW_READ);
else {



Re: divert packet kernel lock

2022-05-06 Thread Alexander Bluhm
On Thu, May 05, 2022 at 11:10:54PM +0200, Mark Kettenis wrote:
> > Date: Thu, 5 May 2022 22:41:01 +0200
> > From: Alexander Bluhm 
> > 
> > Hi,
> > 
> > The easiest way to make divert_packet() MP safe for now, is to
> > protect sbappendaddr() with kernel lock.
> 
> All other invocations of sbappendaddr() run with the kernel lock held?

No.  Only this place takes kernel lock.

> If so, maybe that should be asserted inside sbappendaddr()?

This is only a temporary hack.  The clean solution would be a socket
mutex.  I have marked it with XXXSMP.  Maybe this place is a
good start to implement and test such a lock.

> If not, I don't understand how this would help...

All other places call sbappendaddr() with exclusive net lock.
divert_packet() holds the shared net lock, so it cannot run in
parallel with the other callers.

What is left is protection between multiple divert_packet() running
and calling sbappendaddr().  For that kernel lock helps.

Of course that is a dirty hack.  But we have a race in the commited
codebase that I want to plug quickly.  A proper solution needs more
thought.

> > Index: netinet/ip_divert.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
> > retrieving revision 1.67
> > diff -u -p -r1.67 ip_divert.c
> > --- netinet/ip_divert.c 5 May 2022 16:44:22 -   1.67
> > +++ netinet/ip_divert.c 5 May 2022 20:36:23 -
> > @@ -222,11 +222,18 @@ divert_packet(struct mbuf *m, int dir, u
> > }
> >  
> > so = inp->inp_socket;
> > +   /*
> > +* XXXSMP sbappendaddr() is not MP safe and this function is called
> > +* from pf with shared netlock.  To run only one sbappendaddr()
> > +* protect it with kernel lock.  Socket buffer access from system
> > +* call is protected with exclusive net lock.
> > +*/
> > +   KERNEL_LOCK();
> > if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
> > +   KERNEL_UNLOCK();
> > divstat_inc(divs_fullsock);
> > goto bad;
> > }
> > -   KERNEL_LOCK();
> > sorwakeup(inp->inp_socket);
> > KERNEL_UNLOCK();
> >  
> > Index: netinet6/ip6_divert.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v
> > retrieving revision 1.66
> > diff -u -p -r1.66 ip6_divert.c
> > --- netinet6/ip6_divert.c   5 May 2022 16:44:22 -   1.66
> > +++ netinet6/ip6_divert.c   5 May 2022 20:36:23 -
> > @@ -228,11 +228,18 @@ divert6_packet(struct mbuf *m, int dir, 
> > }
> >  
> > so = inp->inp_socket;
> > +   /*
> > +* XXXSMP sbappendaddr() is not MP safe and this function is called
> > +* from pf with shared netlock.  To run only one sbappendaddr()
> > +* protect it with kernel lock.  Socket buffer access from system
> > +* call is protected with exclusive net lock.
> > +*/
> > +   KERNEL_LOCK();
> > if (sbappendaddr(so, &so->so_rcv, sin6tosa(&sin6), m, NULL) == 0) {
> > +   KERNEL_UNLOCK();
> > div6stat_inc(div6s_fullsock);
> > goto bad;
> > }
> > -   KERNEL_LOCK();
> > sorwakeup(inp->inp_socket);
> > KERNEL_UNLOCK();
> >  
> > 
> > 



divert packet kernel lock

2022-05-05 Thread Alexander Bluhm
Hi,

The easiest way to make divert_packet() MP safe for now, is to
protect sbappendaddr() with kernel lock.

ok?

bluhm

Index: netinet/ip_divert.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.67
diff -u -p -r1.67 ip_divert.c
--- netinet/ip_divert.c 5 May 2022 16:44:22 -   1.67
+++ netinet/ip_divert.c 5 May 2022 20:36:23 -
@@ -222,11 +222,18 @@ divert_packet(struct mbuf *m, int dir, u
}
 
so = inp->inp_socket;
+   /*
+* XXXSMP sbappendaddr() is not MP safe and this function is called
+* from pf with shared netlock.  To run only one sbappendaddr()
+* protect it with kernel lock.  Socket buffer access from system
+* call is protected with exclusive net lock.
+*/
+   KERNEL_LOCK();
if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
+   KERNEL_UNLOCK();
divstat_inc(divs_fullsock);
goto bad;
}
-   KERNEL_LOCK();
sorwakeup(inp->inp_socket);
KERNEL_UNLOCK();
 
Index: netinet6/ip6_divert.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_divert.c,v
retrieving revision 1.66
diff -u -p -r1.66 ip6_divert.c
--- netinet6/ip6_divert.c   5 May 2022 16:44:22 -   1.66
+++ netinet6/ip6_divert.c   5 May 2022 20:36:23 -
@@ -228,11 +228,18 @@ divert6_packet(struct mbuf *m, int dir, 
}
 
so = inp->inp_socket;
+   /*
+* XXXSMP sbappendaddr() is not MP safe and this function is called
+* from pf with shared netlock.  To run only one sbappendaddr()
+* protect it with kernel lock.  Socket buffer access from system
+* call is protected with exclusive net lock.
+*/
+   KERNEL_LOCK();
if (sbappendaddr(so, &so->so_rcv, sin6tosa(&sin6), m, NULL) == 0) {
+   KERNEL_UNLOCK();
div6stat_inc(div6s_fullsock);
goto bad;
}
-   KERNEL_LOCK();
sorwakeup(inp->inp_socket);
KERNEL_UNLOCK();
 



Re: Use static allocation for rt_timer_queue

2022-05-05 Thread Alexander Bluhm
On Thu, May 05, 2022 at 03:08:39PM +0200, Claudio Jeker wrote:
> In total there are 6 rt_timer_queues in our kernel. 3 IPv4 and 3 IPv6.
> That number may be increased to 8 if arp and nd would use these timers as
> well. Because of this allocation the queue heads via pool(9) is overkill.
> 
> Switch rt_timer_queue_create to rt_timer_queue_init which just sets up the
> struct and links it to the rt_timer_queue.

OK bluhm@

> Index: net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.409
> diff -u -p -r1.409 route.c
> --- net/route.c   4 May 2022 16:52:10 -   1.409
> +++ net/route.c   5 May 2022 12:38:16 -
> @@ -150,7 +150,6 @@ int   ifatrash;   /* ifas not in ifp list
>  
>  struct pool  rtentry_pool;   /* pool for rtentry structures */
>  struct pool  rttimer_pool;   /* pool for rttimer structures */
> -struct pool  rttimer_queue_pool; /* pool for rttimer_queue structures */
>  
>  int  rt_setgwroute(struct rtentry *, u_int);
>  void rt_putgwroute(struct rtentry *);
> @@ -1393,8 +1392,6 @@ rt_timer_init(void)
>  
>   pool_init(&rttimer_pool, sizeof(struct rttimer), 0,
>   IPL_MPFLOOR, 0, "rttmr", NULL);
> - pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
> - IPL_MPFLOOR, 0, "rttmrq", NULL);
>  
>   mtx_init(&rttimer_mtx, IPL_MPFLOOR);
>   LIST_INIT(&rttimer_queue_head);
> @@ -1402,13 +1399,10 @@ rt_timer_init(void)
>   timeout_add_sec(&rt_timer_timeout, 1);
>  }
>  
> -struct rttimer_queue *
> -rt_timer_queue_create(int timeout, void (*func)(struct rtentry *, u_int))
> +void
> +rt_timer_queue_init(struct rttimer_queue *rtq, int timeout,
> +void (*func)(struct rtentry *, u_int))
>  {
> - struct rttimer_queue*rtq;
> -
> - rtq = pool_get(&rttimer_queue_pool, PR_WAITOK | PR_ZERO);
> -
>   rtq->rtq_timeout = timeout;
>   rtq->rtq_count = 0;
>   rtq->rtq_func = func;
> @@ -1417,8 +1411,6 @@ rt_timer_queue_create(int timeout, void 
>   mtx_enter(&rttimer_mtx);
>   LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
>   mtx_leave(&rttimer_mtx);
> -
> - return (rtq);
>  }
>  
>  void
> Index: net/route.h
> ===
> RCS file: /cvs/src/sys/net/route.h,v
> retrieving revision 1.193
> diff -u -p -r1.193 route.h
> --- net/route.h   4 May 2022 16:52:10 -   1.193
> +++ net/route.h   5 May 2022 12:39:15 -
> @@ -457,16 +457,16 @@ void rtm_proposal(struct ifnet *, struc
>  int   rt_setgate(struct rtentry *, struct sockaddr *, u_int);
>  struct rtentry *rt_getll(struct rtentry *);
>  
> -void  rt_timer_init(void);
> -int   rt_timer_add(struct rtentry *,
> - struct rttimer_queue *, u_int);
> -void  rt_timer_remove_all(struct rtentry *);
> -struct rttimer_queue *rt_timer_queue_create(int,
> - void(*)(struct rtentry *, u_int));
> -void  rt_timer_queue_change(struct rttimer_queue *, int);
> -void  rt_timer_queue_flush(struct rttimer_queue *);
> -unsigned long rt_timer_queue_count(struct rttimer_queue *);
> -void  rt_timer_timer(void *);
> +void rt_timer_init(void);
> +int  rt_timer_add(struct rtentry *,
> + struct rttimer_queue *, u_int);
> +void rt_timer_remove_all(struct rtentry *);
> +void rt_timer_queue_init(struct rttimer_queue *, int,
> + void(*)(struct rtentry *, u_int));
> +void rt_timer_queue_change(struct rttimer_queue *, int);
> +void rt_timer_queue_flush(struct rttimer_queue *);
> +unsigned longrt_timer_queue_count(struct rttimer_queue *);
> +void rt_timer_timer(void *);
>  
>  int   rt_mpls_set(struct rtentry *, struct sockaddr *, uint8_t);
>  void  rt_mpls_clear(struct rtentry *);
> Index: netinet/ip_icmp.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
> retrieving revision 1.190
> diff -u -p -r1.190 ip_icmp.c
> --- netinet/ip_icmp.c 4 May 2022 16:52:10 -   1.190
> +++ netinet/ip_icmp.c 5 May 2022 12:49:51 -
> @@ -120,8 +120,8 @@ int   icmp_redirtimeout = 10 * 60;
>  static int icmperrpps_count = 0;
>  static struct timeval icmperrppslim_last;
>  
> -struct rttimer_queue *ip_mtudisc_timeout_q;
> -struct rttimer_queue *icmp_redirect_timeout_q;
> +struct rttimer_queue ip_mtudisc_timeout_q;
> +struct rttimer_queue icmp_redirect_timeout_q;
>  struct cpumem *icmpcounters;
>  
>  const struct sysctl_bounded_args icmpctl_vars[] =  {
> @@ -141,9 +141,9 @@ int icmp_sysctl_icmpstat(void *, size_t 
>  void
>  icmp_init(void)
>  {
> - ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout,
> + rt_timer_queue_init(&ip_mtudisc_timeout_q, ip_mtudisc_timeout,
>   &icmp_mtudisc_timeout);
> - 

OpenBSD Errata: May 5, 2022 (ipsec)

2022-05-05 Thread Alexander Bluhm
Errata patch for kernel IPsec has been released for OpenBSD 7.1.

Binary updates for the amd64, i386 and arm64 platforms are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata71.html



Re: kbd set error message

2022-05-05 Thread Alexander Bluhm
anyone?

On Sun, Apr 17, 2022 at 08:20:40PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> After fixing the kbd -l error handling, kbd set needs the same diff.
> While there, shorten long lines and avoid v--; v++; logic.
> 
> $ ./kbd de
> kbd: /dev/wskbd0: Permission denied
> 
> ok?
> 
> bluhm
> 
> Index: sbin/kbd/kbd_wscons.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sbin/kbd/kbd_wscons.c,v
> retrieving revision 1.35
> diff -u -p -r1.35 kbd_wscons.c
> --- sbin/kbd/kbd_wscons.c 17 Apr 2022 17:33:50 -  1.35
> +++ sbin/kbd/kbd_wscons.c 17 Apr 2022 17:39:56 -
> @@ -232,7 +232,7 @@ void
>  kbd_set(char *name, int verbose)
>  {
>   char  buf[LINE_MAX], *c, *b, device[sizeof "/dev/wskbd00"];
> - int map = 0, v, i, fd;
> + int map = 0, v, i, fd, error = 0;
>   struct nameint *n;
>  
>   c = name;
> @@ -271,19 +271,29 @@ kbd_set(char *name, int verbose)
>   fd = open(device, O_WRONLY);
>   if (fd == -1)
>   fd = open(device, O_RDONLY);
> - if (fd >= 0) {
> + if (fd == -1) {
> + /* remember the first error number */
> + if (error == 0)
> + error = errno;
> + } else {
> + /* at least one success, do not print error */
> + error = -1;
> +
>   if (ioctl(fd, WSKBDIO_SETENCODING, &map) == -1) {
> - if (errno == EINVAL) {
> - fprintf(stderr,
> - "%s: unsupported encoding %s on %s\n",
> - __progname, name, device);
> - } else
> - err(1, "WSKBDIO_SETENCODING: %s", device);
> - v--;
> - }
> - v++;
> + if (errno != EINVAL)
> + err(1, "WSKBDIO_SETENCODING %s",
> + device);
> + fprintf(stderr,
> + "%s: unsupported encoding %s on %s\n",
> + __progname, name, device);
> + } else
> + v++;
>   close(fd);
>   }
> + }
> + if (error > 0) {
> + errno = error;
> + err(1, "/dev/wskbd0");
>   }
>  
>   if (verbose && v > 0)



Re: Reserved address behavior (alternate broadcast and 240/4)

2022-05-05 Thread Alexander Bluhm
On Wed, May 04, 2022 at 06:48:14PM -0700, Seth David Schoen wrote:
> This has some odd consequences.  For instance, if an OpenBSD system
> has an interface numbered with an address in 240/4, it can initiate
> and receive TCP connections using that address, and it can ping other
> hosts using that address, but it won't respond to pings from other
> hosts.  This patch cleans this up:

As forwarding and icmp reflect is the only place where IN_EXPERIMENTAL()
is used, I cannot see a downside of this patch.

OK bluhm@

> Index: in.c
> ===
> RCS file: /cvs/src/sys/netinet/in.c,v
> retrieving revision 1.173
> diff -u -p -r1.173 in.c
> --- in.c  28 Mar 2022 16:31:26 -  1.173
> +++ in.c  5 May 2022 01:05:04 -
> @@ -103,7 +103,7 @@ in_canforward(struct in_addr in)
>  {
>   u_int32_t net;
>  
> - if (IN_EXPERIMENTAL(in.s_addr) || IN_MULTICAST(in.s_addr))
> + if (IN_MULTICAST(in.s_addr))
>   return (0);
>   if (IN_CLASSA(in.s_addr)) {
>   net = in.s_addr & IN_CLASSA_NET;



divert packet mutex

2022-05-04 Thread Alexander Bluhm
Hi,

I missed that pf divert packet is not MP safe yet.  The problem is
that divert_packet() is called from pf with shared net lock and
sbappendaddr() needs exclusive net lock.

The direct call from pf in IP layer to divert in protocol layer is
not nice.  I not sure how to address that.

As a first step clean up divert_packet():
- the function never returns an error
- call variables so and sin like everywhere else
- use goto bad for error handling
- fix error counter
- introduce mutex and refcounting for inp like in the other pcb
  functions

Divert packet is still not MP safe, I will fix it later.

ok?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1128
diff -u -p -r1.1128 pf.c
--- net/pf.c  3 May 2022 13:32:47 -   1.1128
+++ net/pf.c  4 May 2022 20:16:40 -
@@ -7403,13 +7403,13 @@ done:
case PF_DIVERT:
switch (pd.af) {
case AF_INET:
-   if (!divert_packet(pd.m, pd.dir, r->divert.port))
-   pd.m = NULL;
+   divert_packet(pd.m, pd.dir, r->divert.port);
+   pd.m = NULL;
break;
 #ifdef INET6
case AF_INET6:
-   if (!divert6_packet(pd.m, pd.dir, r->divert.port))
-   pd.m = NULL;
+   divert6_packet(pd.m, pd.dir, r->divert.port);
+   pd.m = NULL;
break;
 #endif /* INET6 */
}
Index: netinet/ip_divert.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_divert.c,v
retrieving revision 1.66
diff -u -p -r1.66 ip_divert.c
--- netinet/ip_divert.c 25 Feb 2022 23:51:03 -  1.66
+++ netinet/ip_divert.c 4 May 2022 18:17:49 -
@@ -171,30 +171,37 @@ fail:
return (error ? error : EINVAL);
 }
 
-int
+void
 divert_packet(struct mbuf *m, int dir, u_int16_t divert_port)
 {
-   struct inpcb *inp;
-   struct socket *sa = NULL;
-   struct sockaddr_in addr;
+   struct inpcb *inp = NULL;
+   struct socket *so;
+   struct sockaddr_in sin;
 
-   inp = NULL;
divstat_inc(divs_ipackets);
 
if (m->m_len < sizeof(struct ip) &&
(m = m_pullup(m, sizeof(struct ip))) == NULL) {
divstat_inc(divs_errors);
-   return (0);
+   goto bad;
}
 
+   mtx_enter(&divbtable.inpt_mtx);
TAILQ_FOREACH(inp, &divbtable.inpt_queue, inp_queue) {
-   if (inp->inp_lport == divert_port)
-   break;
+   if (inp->inp_lport != divert_port)
+   continue;
+   in_pcbref(inp);
+   break;
+   }
+   mtx_leave(&divbtable.inpt_mtx);
+   if (inp == NULL) {
+   divstat_inc(divs_noport);
+   goto bad;
}
 
-   memset(&addr, 0, sizeof(addr));
-   addr.sin_family = AF_INET;
-   addr.sin_len = sizeof(addr);
+   memset(&sin, 0, sizeof(sin));
+   sin.sin_family = AF_INET;
+   sin.sin_len = sizeof(sin);
 
if (dir == PF_IN) {
struct ifaddr *ifa;
@@ -202,37 +209,34 @@ divert_packet(struct mbuf *m, int dir, u
 
ifp = if_get(m->m_pkthdr.ph_ifidx);
if (ifp == NULL) {
-   m_freem(m);
-   return (0);
+   divstat_inc(divs_errors);
+   goto bad;
}
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
-   addr.sin_addr.s_addr = satosin(
-   ifa->ifa_addr)->sin_addr.s_addr;
+   sin.sin_addr = satosin(ifa->ifa_addr)->sin_addr;
break;
}
if_put(ifp);
}
 
-   if (inp) {
-   sa = inp->inp_socket;
-   if (sbappendaddr(sa, &sa->so_rcv, sintosa(&addr), m, NULL) == 0) {
-   divstat_inc(divs_fullsock);
-   m_freem(m);
-   return (0);
-   } else {
-   KERNEL_LOCK();
-   sorwakeup(inp->inp_socket);
-   KERNEL_UNLOCK();
-   }
-   }
-
-   if (sa == NULL) {
-   divstat_inc(divs_noport);
-   m_freem(m);
+   so = inp->inp_socket;
+   if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
+   divstat_inc(divs_fullsock);
+   goto bad;
}
-   return (0);
+   KERNEL_LOCK();
+   sorwakeup(inp->inp_socket);
+   KERNEL_UNLOCK();
+
+   in_pcbunref(inp);
+   return;
+
+ bad:
+   if (inp != NULL)
+   in_pcbunref(inp);
+   m_freem(m);
 }
 

Re: [External] : Re: another syzkaller problem in pf

2022-05-04 Thread Alexander Bluhm
On Wed, May 04, 2022 at 02:21:11PM +0200, Alexandr Nedvedicky wrote:
> I'm not sure flipping a flag is a right change. In general we don't want
> to hold NET_LOCK()/PF_LOCK() while waiting for memory.

- We must not wait for memory when in the packet processing hot path.
  Drop the packet instead.

- We should not wait for memory when holding pf lock.  Then we can
  replace rw lock with a mutex or something else later.

- Waiting for memory with net lock is unavoidable.  We have to
  wait when coming from system call.  Doing preallocation may be
  possible in some cases but code may get too complicated elsewhere.

If getting the allocation out of the locks is doable here, it could
be the best solution.
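
A generic sketch of that pattern (hypothetical names, not the pf
code): do all sleeping allocations up front, take the locks only to
link the preallocated items.

	SLIST_INIT(&allocq);
	for (i = 0; i < n; i++) {
		/* may sleep, no lock held yet */
		item = pool_get(&item_pool, PR_WAITOK | PR_ZERO);
		SLIST_INSERT_HEAD(&allocq, item, entry);
	}
	NET_LOCK();
	PF_LOCK();
	/* link items from allocq into global state, no sleeping here */
	PF_UNLOCK();
	NET_UNLOCK();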

bluhm



Re: ratecheck mutex

2022-05-04 Thread Alexander Bluhm
On Wed, May 04, 2022 at 04:42:21AM -0500, Scott Cheloha wrote:
> > On May 3, 2022, at 17:16, Alexander Bluhm  wrote:
> > 
> > Hi,
> > 
> > We have one comment that locking for ratecheck(9) is missing.  In
> > all other places locking status of the struct timeval *lasttime
> > is unclear.
> > 
> > The easiest fix is a global mutex for all lasttime in ratecheck().
> > This covers the usual usecase of the function.
> 
> Why not declare a struct ratecheck with
> a per-struct mutex?

Because that diff is more work.  It is the cleaner solution, but
touches a lot of code.

> It seems odd to be heading toward more
> parallel processing in e.g. the networking
> stack and introduce a global point of
> contention.

I don't expect contention on the rate limit.

Make things stable, run in parallel, tweak performance.
That's why I have chosen this aproach.

But if it is too dirty, I can create the larger diff.

bluhm



ratecheck mutex

2022-05-03 Thread Alexander Bluhm
Hi,

We have one comment that locking for ratecheck(9) is missing.  In
all other places locking status of the struct timeval *lasttime
is unclear.

The easiest fix is a global mutex for all lasttime in ratecheck().
This covers the usual usecase of the function.

Same for ppsratecheck(9), lasttime and curpps are protected.
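
For reference, a typical caller looks like this (hedged example, not
taken from the tree); with the diff below the shared lasttime needs no
caller-side locking:

	static struct timeval last;
	static const struct timeval interval = { 10, 0 };

	if (ratecheck(&last, &interval))
		printf("at most one of these log lines per 10 seconds\n");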

Remove a useless #if 1 while there.

ok?

bluhm


Index: kern/kern_malloc.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.146
diff -u -p -r1.146 kern_malloc.c
--- kern/kern_malloc.c  16 May 2021 15:10:20 -  1.146
+++ kern/kern_malloc.c  3 May 2022 21:51:21 -
@@ -188,7 +188,6 @@ malloc(size_t size, int type, int flags)
if (size > 65535 * PAGE_SIZE) {
if (flags & M_CANFAIL) {
 #ifndef SMALL_KERNEL
-   /* XXX lock */
if (ratecheck(&malloc_lasterr, &malloc_errintvl))
printf("malloc(): allocation too large, "
"type = %d, size = %lu\n", type, size);
Index: kern/kern_time.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.154
diff -u -p -r1.154 kern_time.c
--- kern/kern_time.c18 Jun 2021 15:59:14 -  1.154
+++ kern/kern_time.c3 May 2022 21:51:21 -
@@ -782,11 +782,13 @@ itimerdecr(struct itimerspec *itp, long 
 int
 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
 {
+   static struct mutex mtx = MUTEX_INITIALIZER(IPL_HIGH);
struct timeval tv, delta;
int rv = 0;
 
getmicrouptime(&tv);
 
+   mtx_enter(&mtx);
timersub(&tv, lasttime, &delta);
 
/*
@@ -798,6 +800,7 @@ ratecheck(struct timeval *lasttime, cons
*lasttime = tv;
rv = 1;
}
+   mtx_leave(&mtx);
 
return (rv);
 }
@@ -808,11 +811,13 @@ ratecheck(struct timeval *lasttime, cons
 int
 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
 {
+   static struct mutex mtx = MUTEX_INITIALIZER(IPL_HIGH);
struct timeval tv, delta;
int rv;
 
microuptime(&tv);
 
+   mtx_enter(&mtx);
timersub(&tv, lasttime, &delta);
 
/*
@@ -837,20 +842,11 @@ ppsratecheck(struct timeval *lasttime, i
else
rv = 0;
 
-#if 1 /*DIAGNOSTIC?*/
/* be careful about wrap-around */
if (*curpps + 1 > *curpps)
*curpps = *curpps + 1;
-#else
-   /*
-* assume that there's not too many calls to this function.
-* not sure if the assumption holds, as it depends on *caller's*
-* behavior, not the behavior of this function.
-* IMHO it is wrong to make assumption on the caller's behavior,
-* so the above #if is #if 1, not #ifdef DIAGNOSTIC.
-*/
-   *curpps = *curpps + 1;
-#endif
+
+   mtx_leave(&mtx);
 
return (rv);
 }



Re: another syzkaller problem in pf

2022-05-03 Thread Alexander Bluhm
On Tue, May 03, 2022 at 07:42:34PM +0200, Moritz Buhl wrote:
> commit 4b3977248902c22d96aaebdb5784840debc2631c
> Author: mikeb 
> Date:   Mon Nov 24 13:22:09 2008 +
> 
> Fix splasserts seen in pr 5987 by propagating a flag that discribes
> whether we're called from the interrupt context to the functions
> performing allocations.

In those days pf was protected by the kernel lock and spl.  Both are
released when sleeping.  Now we have netlock and pflock.  These are
rwlocks and not released during sleep.  So this old race should not
exist anymore.

> And we are not in interrupt context.

Yes, it is ioctl(2).  I think we should always malloc with M_WAITOK
when in syscall.  Otherwise userland would have to cope with randomly
failing syscalls.

> If this is sound, then the only reason why pfr_destroy_ktable was called
> is that pool_get is called with PR_NOWAIT.  And then the following diff
> would help.

The code is too complex to be sure what the reason of the syzkaller
panic is.  Sleep in malloc is correct anyway and may improve the
situation.

Functions with argument values 0 or 1 are hard to read.  It would
be much nicer to pass M_WAITOK or M_NOWAIT.  And the variable name
"intr" does not make sense anymore.  pf does not run in interrupt
context.  Call it "mflags" like in pfi_kif_alloc().  Or "wait" like
in other functions.

Could you clean that up as well?
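
To illustrate the readability argument (the second form is the
suggested cleanup, not committed code):

	kt = pfr_attach_table(ruleset, tblname, 0);		/* what is 0? */
	kt = pfr_attach_table(ruleset, tblname, M_WAITOK);	/* obvious */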

bluhm

> Index: pf_if.c
> ===
> RCS file: /cvs/src/sys/net/pf_if.c,v
> retrieving revision 1.104
> diff -u -p -r1.104 pf_if.c
> --- pf_if.c   29 Apr 2022 09:55:43 -  1.104
> +++ pf_if.c   3 May 2022 16:01:23 -
> @@ -464,7 +464,7 @@ pfi_dynaddr_setup(struct pf_addr_wrap *a
>   goto _bad;
>   }
>  
> - if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname, 1)) == NULL) {
> + if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname, 0)) == NULL) {
>   rv = 1;
>   goto _bad;
>   }



Re: rttimer move callback to the queue struct

2022-05-03 Thread Alexander Bluhm
On Tue, May 03, 2022 at 11:30:13AM +0200, Claudio Jeker wrote:
> Next stage of rttimer cleanup. Move the callback from the rttimer to the
> rttimer_queue struct. The callback is always the same for a queue so there
> is no need to define it on every call.
> 
> On top of that replace rt_timer_queue_destroy() with
> rt_timer_queue_flush(). With this queues can no longer be removed but that
> is not a problem. My next step is to actually replace
> rt_timer_queue_create() with an initalizer.

OK bluhm@
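
For illustration, usage after this change looks like the following
sketch (example_timeout and exampleq are made-up names; rt and
rtableid come from the caller, error handling omitted):

	void	example_timeout(struct rtentry *, u_int);
	struct rttimer_queue *exampleq;

	/* the callback is bound to the queue exactly once ... */
	exampleq = rt_timer_queue_create(60, example_timeout);

	/* ... so each rt_timer_add() call loses an argument */
	rt_timer_add(rt, exampleq, rtableid);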

> Index: net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.408
> diff -u -p -r1.408 route.c
> --- net/route.c   30 Apr 2022 07:20:35 -  1.408
> +++ net/route.c   2 May 2022 10:16:58 -
> @@ -1365,8 +1365,8 @@ struct mutexrttimer_mtx;
>  LIST_HEAD(, rttimer_queue)   rttimer_queue_head; /* [T] */
>  
>  #define RTTIMER_CALLOUT(r)   {   \
> - if (r->rtt_func != NULL) {  \
> - (*r->rtt_func)(r->rtt_rt, r->rtt_tableid);  \
> + if (r->rtt_queue->rtq_func != NULL) {   \
> + (*r->rtt_queue->rtq_func)(r->rtt_rt, r->rtt_tableid);   \
>   } else {\
>   struct ifnet *ifp;  \
>   \
> @@ -1403,7 +1403,7 @@ rt_timer_init(void)
>  }
>  
>  struct rttimer_queue *
> -rt_timer_queue_create(int timeout)
> +rt_timer_queue_create(int timeout, void (*func)(struct rtentry *, u_int))
>  {
>   struct rttimer_queue*rtq;
>  
> @@ -1411,6 +1411,7 @@ rt_timer_queue_create(int timeout)
>  
>   rtq->rtq_timeout = timeout;
>   rtq->rtq_count = 0;
> + rtq->rtq_func = func;
>   TAILQ_INIT(&rtq->rtq_head);
>  
>   mtx_enter(&rttimer_mtx);
> @@ -1429,7 +1430,7 @@ rt_timer_queue_change(struct rttimer_que
>  }
>  
>  void
> -rt_timer_queue_destroy(struct rttimer_queue *rtq)
> +rt_timer_queue_flush(struct rttimer_queue *rtq)
>  {
>   struct rttimer  *r;
>   TAILQ_HEAD(, rttimer)rttlist;
> @@ -1445,7 +1446,6 @@ rt_timer_queue_destroy(struct rttimer_qu
>   KASSERT(rtq->rtq_count > 0);
>   rtq->rtq_count--;
>   }
> - LIST_REMOVE(rtq, rtq_link);
>   mtx_leave(&rttimer_mtx);
>  
>   while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
> @@ -1453,7 +1453,6 @@ rt_timer_queue_destroy(struct rttimer_qu
>   RTTIMER_CALLOUT(r);
>   pool_put(&rttimer_pool, r);
>   }
> - pool_put(&rttimer_queue_pool, rtq);
>  }
>  
>  unsigned long
> @@ -1486,8 +1485,7 @@ rt_timer_remove_all(struct rtentry *rt)
>  }
>  
>  int
> -rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *, u_int),
> - struct rttimer_queue *queue, u_int rtableid)
> +rt_timer_add(struct rtentry *rt, struct rttimer_queue *queue, u_int rtableid)
>  {
>   struct rttimer  *r, *rnew;
>   time_t   current_time;
> @@ -1500,7 +1498,6 @@ rt_timer_add(struct rtentry *rt, void (*
>  
>   rnew->rtt_rt = rt;
>   rnew->rtt_time = current_time;
> - rnew->rtt_func = func;
>   rnew->rtt_queue = queue;
>   rnew->rtt_tableid = rtableid;
>  
> @@ -1511,7 +1508,7 @@ rt_timer_add(struct rtentry *rt, void (*
>* we add a new one.
>*/
>   LIST_FOREACH(r, &rt->rt_timer, rtt_link) {
> - if (r->rtt_func == func) {
> + if (r->rtt_queue == queue) {
>   LIST_REMOVE(r, rtt_link);
>   TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
>   KASSERT(r->rtt_queue->rtq_count > 0);
> Index: net/route.h
> ===
> RCS file: /cvs/src/sys/net/route.h,v
> retrieving revision 1.192
> diff -u -p -r1.192 route.h
> --- net/route.h   30 Apr 2022 07:20:35 -  1.192
> +++ net/route.h   30 Apr 2022 08:09:54 -
> @@ -410,8 +410,6 @@ struct rttimer {
>   LIST_ENTRY(rttimer) rtt_link;   /* [T] timers per rtentry */
>   struct rttimer_queue*rtt_queue; /* [T] back pointer to queue */
>   struct rtentry  *rtt_rt;/* [I] back pointer to route */
> - void(*rtt_func) /* [I] callback */
> - (struct rtentry *, u_int);
>   time_t  rtt_time;   /* [I] when timer registered */
>   u_int   rtt_tableid;/* [I] rtable id of rtt_rt */
>  };
> @@ -419,6 +417,8 @@ struct rttimer {
>  struct rttimer_queue {
>   TAILQ_HEAD(, rttimer)   rtq_head;   /* [T] */
>   LIST_ENTRY(rttimer_queue)   rtq_link;   /* [T] */
> + void(*rtq_func) /* [I] callback */
> + (struct rtentry *, u_int);
>   

Re: [External] : Re: add sanity checks to IGMP/MLD

2022-05-03 Thread Alexander Bluhm
On Tue, May 03, 2022 at 10:10:23AM +0200, Alexandr Nedvedicky wrote:
> updated diff is below.
> thanks for taking a look at it.

OK bluhm@

> 8<---8<---8<--8<
> diff --git a/sys/net/pf.c b/sys/net/pf.c
> index f15e1ead8c0..bf9593952ec 100644
> --- a/sys/net/pf.c
> +++ b/sys/net/pf.c
> @@ -6456,8 +6456,15 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, 
> u_short *reason)
>   pd->off += hlen;
>   pd->proto = h->ip_p;
>   /* IGMP packets have router alert options, allow them */
> - if (pd->proto == IPPROTO_IGMP)
> + if (pd->proto == IPPROTO_IGMP) {
> + /* According to RFC 1112 ttl must be set to 1. */
> + if (h->ip_ttl != 1) {
> + DPFPRINTF(LOG_NOTICE, "TTL in IGMP must be 1");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + }
>   CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + }
>   /* stop walking over non initial fragments */
>   if ((h->ip_off & htons(IP_OFFMASK)) != 0)
>   return (PF_PASS);
> @@ -6698,6 +6705,19 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr 
> *h, u_short *reason)
>   case MLD_LISTENER_REPORT:
>   case MLD_LISTENER_DONE:
>   case MLDV2_LISTENER_REPORT:
> + /*
> +  * According to RFC 2710 all MLD messages are
> +  * sent with hop-limit (ttl) set to 1, and link
> +  * local source address.  If either one is
> +  * missing then MLD message is invalid and
> +  * should be discarded.
> +  */
> + if ((h->ip6_hlim != 1) ||
> + !IN6_IS_ADDR_LINKLOCAL(&h->ip6_src)) {
> + DPFPRINTF(LOG_NOTICE, "invalid MLD");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + }
>   CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
>   break;
>   }



Re: [External] : Re: add sanity checks to IGMP/MLD

2022-05-03 Thread Alexander Bluhm
On Tue, May 03, 2022 at 12:26:52AM +0200, Alexandr Nedvedicky wrote:
> OK ? or should I also drop a check for link-local source address
> in IPv6?

The link-local check makes sense.

> 8<---8<---8<--8<
> diff --git a/sys/net/pf.c b/sys/net/pf.c
> index f15e1ead8c0..2187d895749 100644
> --- a/sys/net/pf.c
> +++ b/sys/net/pf.c
> @@ -6456,8 +6456,15 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, 
> u_short *reason)
>   pd->off += hlen;
>   pd->proto = h->ip_p;
>   /* IGMP packets have router alert options, allow them */
> - if (pd->proto == IPPROTO_IGMP)
> - CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + if (pd->proto == IPPROTO_IGMP) {
> + /* According to RFC 1112 ttl must be set to 1. */
> + if (h->ip_ttl != 1) {
> + DPFPRINTF(LOG_NOTICE, "TTL in IGMP must be 1");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + } else
> + CLR(pd->badopts, PF_OPT_ROUTER_ALERT);

You return in the if block.  No need for else.

> + }
>   /* stop walking over non initial fragments */
>   if ((h->ip_off & htons(IP_OFFMASK)) != 0)
>   return (PF_PASS);
> @@ -6698,7 +6705,21 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr 
> *h, u_short *reason)
>   case MLD_LISTENER_REPORT:
>   case MLD_LISTENER_DONE:
>   case MLDV2_LISTENER_REPORT:
> - CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + /*
> +  * According to RFC 2710 all MLD messages are
> +  * sent with hop-limit (ttl) set to 1, and link
> +  * local source address.  If either one is
> +  * missing then MLD message is invalid and
> +  * should be discarded.
> +  */
> + if ((h->ip6_hlim == 1) &&
> + IN6_IS_ADDR_LINKLOCAL(&h->ip6_src))
> + CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + else {
> + DPFPRINTF(LOG_NOTICE, "invalid MLD");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + }

Can you turn around the logic?

if (something bad)
return
clear badopts
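
Concretely, with the logic inverted the MLD case reads like this
(matching the final version of the diff in the other thread):

	/* drop invalid MLD first, then accept the router alert */
	if ((h->ip6_hlim != 1) ||
	    !IN6_IS_ADDR_LINKLOCAL(&h->ip6_src)) {
		DPFPRINTF(LOG_NOTICE, "invalid MLD");
		REASON_SET(reason, PFRES_IPOPTIONS);
		return (PF_DROP);
	}
	CLR(pd->badopts, PF_OPT_ROUTER_ALERT);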

> 
>   break;
>   }
>   return (PF_PASS);



Re: kstat(1): implement wait with setitimer(2)

2022-05-03 Thread Alexander Bluhm
On Mon, May 02, 2022 at 08:39:07PM -0500, Scott Cheloha wrote:
> On Sat, Apr 30, 2022 at 01:27:44AM +0200, Alexander Bluhm wrote:
> > otherwise diff looks good to me
> 
> Still look good?

OK bluhm@

> Index: kstat.c
> ===
> RCS file: /cvs/src/usr.bin/kstat/kstat.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 kstat.c
> --- kstat.c   22 Apr 2022 00:29:20 -  1.9
> +++ kstat.c   3 May 2022 01:37:36 -
> @@ -15,6 +15,8 @@
>   */
>  
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -104,6 +106,7 @@ kstat_cmp(const struct kstat_entry *ea, 
>  RBT_PROTOTYPE(kstat_tree, kstat_entry, entry, kstat_cmp);
>  RBT_GENERATE(kstat_tree, kstat_entry, entry, kstat_cmp);
>  
> +static void handle_alrm(int);
>  static struct kstat_filter *
>   kstat_filter_parse(char *);
>  static int   kstat_filter_entry(struct kstat_filters *,
> @@ -134,16 +137,17 @@ main(int argc, char *argv[])
>   int fd;
>   const char *errstr;
>   int ch;
> - struct timespec interval = { 0, 0 };
> + struct itimerval itv;
> + sigset_t empty, mask;
>   int i;
> + unsigned int wait = 0;
>  
>   while ((ch = getopt(argc, argv, "w:")) != -1) {
>   switch (ch) {
>   case 'w':
> - interval.tv_sec = strtonum(optarg, 1, 100000000,
> -     &errstr);
> + wait = strtonum(optarg, 1, UINT_MAX, &errstr);
>   if (errstr != NULL)
> - errx(1, "wait %s: %s", optarg, errstr);
> + errx(1, "wait is %s: %s", errstr, optarg);
>   break;
>   default:
>   usage();
> @@ -168,12 +172,25 @@ main(int argc, char *argv[])
>   kstat_list(&kt, fd, version, &kfs);
>   kstat_print(&kt);
>  
> - if (interval.tv_sec == 0)
> + if (wait == 0)
>   return (0);
>  
> - for (;;) {
> - nanosleep(&interval, NULL);
> + if (signal(SIGALRM, handle_alrm) == SIG_ERR)
> + err(1, "signal");
> + sigemptyset(&empty);
> + sigemptyset(&mask);
> + sigaddset(&mask, SIGALRM);
> + if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
> + err(1, "sigprocmask");
> +
> + itv.it_value.tv_sec = wait;
> + itv.it_value.tv_usec = 0;
> + itv.it_interval = itv.it_value;
> + if (setitimer(ITIMER_REAL, &itv, NULL) == -1)
> + err(1, "setitimer");
>  
> + for (;;) {
> + sigsuspend(&empty);
>   kstat_read(&kt, fd);
>   kstat_print(&kt);
>   }
> @@ -547,4 +564,9 @@ kstat_read(struct kstat_tree *kt, int fd
>   if (ioctl(fd, KSTATIOC_FIND_ID, ksreq) == -1)
>   err(1, "update id %llu", ksreq->ks_id);
>   }
> +}
> +
> +static void
> +handle_alrm(int signo)
> +{
>  }



Re: add sanity checks to IGMP/MLD

2022-05-02 Thread Alexander Bluhm
On Mon, May 02, 2022 at 11:30:58PM +0200, Alexandr Nedvedicky wrote:
> hello,
> 
> bluhm@ has committed a fix [1] which makes pf accept IGMP/MLD messages.
> If I remember correct pf(4) was dropping those messages because
> of Router Alert IP option being present. The IP option is mandatory
> for IGMP/MLD according to RFCs.
> 
> For both protocol versions (IPv4, IPv6) standards say:
> 
> TTL/hop-limit in IP header must be set to 1
> 
> Router Alert option/extension header must be present
> 
> in case of IPv6 the MLD messages must be sent from link-local address.
> 
> 
> diff below adds exactly those checks.
> 
> OK?

Checking that the TTL equals 1 is a good thing.  We should prevent
anyone from forwarding such packets.

The router alert is a hint to routers on the way to look at these
packets.  If it is missing, no harm is done.  Maybe some multicast
does not work.  But there is no security argument to filter these.

I have seen IGMP packets without router alert.  Our stack has fixed
that in OpenBSD 5.6.  Don't believe what is written in RFCs.


revision 1.40
date: 2014/05/12 09:15:00;  author: mpi;  state: Exp;  lines: +28 -5;
Includes a router alert option (RAO) in IGMP packets.   Without this
option, required by the RFC2236, some L3 switches do not examine the
packets.

Based on FreeBSD's r14622 via Florian Riehm on tech@. ok bluhm@, jca@


bluhm

> [1] https://marc.info/?l=openbsd-tech=165109904223362=2
> 
> [2] https://datatracker.ietf.org/doc/html/rfc2710
> https://datatracker.ietf.org/doc/html/rfc2236
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/pf.c b/sys/net/pf.c
> index f15e1ead8c0..9b107751d95 100644
> --- a/sys/net/pf.c
> +++ b/sys/net/pf.c
> @@ -6456,8 +6456,21 @@ pf_walk_header(struct pf_pdesc *pd, struct ip *h, 
> u_short *reason)
>   pd->off += hlen;
>   pd->proto = h->ip_p;
>   /* IGMP packets have router alert options, allow them */
> - if (pd->proto == IPPROTO_IGMP)
> - CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + if (pd->proto == IPPROTO_IGMP) {
> + /*
> +  * If router alert option is missing or ttl is not 1, then we
> +  * deal invalid IGMP packet. According to RFC 1112 ttl must be
> +  * set to 1. Also IP header must carry router alert option
> +  * as specified in RFC 2236.
> +  */
> + if (ISSET(pd->badopts, PF_OPT_ROUTER_ALERT) && (h->ip_ttl == 1))
> + CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + else {
> + DPFPRINTF(LOG_NOTICE, "invalid IGMP");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + }
> + }
>   /* stop walking over non initial fragments */
>   if ((h->ip_off & htons(IP_OFFMASK)) != 0)
>   return (PF_PASS);
> @@ -6698,7 +6711,22 @@ pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr 
> *h, u_short *reason)
>   case MLD_LISTENER_REPORT:
>   case MLD_LISTENER_DONE:
>   case MLDV2_LISTENER_REPORT:
> - CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + /*
> +  * According to RFC 2710 all MLD messages are
> +  * sent with with router alert header and hop
> +  * limit (ttl) set to 1, and link local source
> +  * address.  If either one is missing then MLD
> +  * message is invalid and should be discarded.
> +  */
> + if (ISSET(pd->badopts, PF_OPT_ROUTER_ALERT) &&
> + (h->ip6_hlim == 1) &&
> + IN6_IS_ADDR_LINKLOCAL(&h->ip6_src))
> + CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
> + else {
> + DPFPRINTF(LOG_NOTICE, "invalid MLD");
> + REASON_SET(reason, PFRES_IPOPTIONS);
> + return (PF_DROP);
> + }
>   break;
>   }
>   return (PF_PASS);



Re: install btrace scripts

2022-04-30 Thread Alexander Bluhm
On Sat, Apr 30, 2022 at 02:03:28PM -0600, Theo de Raadt wrote:
> >On 2022-04-30, Alexander Bluhm wrote:
> >> Hi,
> >> 
> >> Can we install the btrace scripts to /usr/share/btrace/ ?  The
> >> directory already exists, only the Makefile is not linked to the
> >> build.
> >> 
> >> And I would like to use #! to make them executable.
> >
> >It's weird to have exec files in share?
> 
> yes, it is weird.  i am not convinced that btrace is a natural
> interpreter, nor that anyone would put this into your $PATH, at
> which point you are using an absolute path, and it seems unlikely
> anyone would even notice these are executable, so they would use the
> pattern:
> 
> >I think it's not very hard to type "btrace script" vs script.

Now without #! and with man page.
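
The scripts are then meant to be run by giving btrace(8) the full
path, for example:

	# btrace /usr/share/btrace/kprofile.bt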

Index: share/Makefile
===
RCS file: /data/mirror/openbsd/cvs/src/share/Makefile,v
retrieving revision 1.18
diff -u -p -r1.18 Makefile
--- share/Makefile  9 Oct 2014 03:46:15 -   1.18
+++ share/Makefile  30 Apr 2022 19:22:32 -
@@ -1,5 +1,5 @@
 #  $OpenBSD: Makefile,v 1.18 2014/10/09 03:46:15 tedu Exp $
 
-SUBDIR=dict locale man misc mk snmp tabset termtypes zoneinfo
+SUBDIR=btrace dict locale man misc mk snmp tabset termtypes zoneinfo
 
 .include <bsd.subdir.mk>
Index: share/btrace/kprofile.bt
===
RCS file: /data/mirror/openbsd/cvs/src/share/btrace/kprofile.bt,v
retrieving revision 1.1
diff -u -p -r1.1 kprofile.bt
--- share/btrace/kprofile.bt23 Oct 2021 19:37:35 -  1.1
+++ share/btrace/kprofile.bt30 Apr 2022 21:32:03 -
@@ -1,3 +1,5 @@
+/* $OpenBSD$   */
+
 /*
  * kprofile.bt Kernel profiling (stack sampling) at 100Hz.
  *
Index: share/btrace/runqlat.bt
===
RCS file: /data/mirror/openbsd/cvs/src/share/btrace/runqlat.bt,v
retrieving revision 1.2
diff -u -p -r1.2 runqlat.bt
--- share/btrace/runqlat.bt 24 Oct 2021 08:42:38 -  1.2
+++ share/btrace/runqlat.bt 30 Apr 2022 21:32:06 -
@@ -1,3 +1,5 @@
+/* $OpenBSD$   */
+
 /*
  * runqlat.bt  Measure run queue latency (aka scheduler latency). OpenBSD.
  *
Index: usr.sbin/btrace/btrace.8
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/btrace/btrace.8,v
retrieving revision 1.6
diff -u -p -r1.6 btrace.8
--- usr.sbin/btrace/btrace.88 Sep 2021 13:29:51 -   1.6
+++ usr.sbin/btrace/btrace.830 Apr 2022 21:38:41 -
@@ -59,6 +59,13 @@ Multiple
 options increase the verbosity.
 The maximum is 2.
 .El
+.Sh FILES
+.Bl -tag -width "/usr/share/btrace/" -compact
+.It Pa /usr/share/btrace/*
+collection of useful
+.Nm
+programs
+.El
 .Sh EXIT STATUS
 .Ex -std 
 .Sh SEE ALSO



install btrace scripts

2022-04-30 Thread Alexander Bluhm
Hi,

Can we install the btrace scripts to /usr/share/btrace/ ?  The
directory already exists, only the Makefile is not linked to the
build.

And I would like to use #! to make them executable.

ok?

bluhm

Index: share/Makefile
===
RCS file: /data/mirror/openbsd/cvs/src/share/Makefile,v
retrieving revision 1.18
diff -u -p -r1.18 Makefile
--- share/Makefile  9 Oct 2014 03:46:15 -   1.18
+++ share/Makefile  30 Apr 2022 19:22:32 -
@@ -1,5 +1,5 @@
 #  $OpenBSD: Makefile,v 1.18 2014/10/09 03:46:15 tedu Exp $
 
-SUBDIR=dict locale man misc mk snmp tabset termtypes zoneinfo
+SUBDIR=btrace dict locale man misc mk snmp tabset termtypes zoneinfo
 
 .include <bsd.subdir.mk>
Index: share/btrace/Makefile
===
RCS file: /data/mirror/openbsd/cvs/src/share/btrace/Makefile,v
retrieving revision 1.1
diff -u -p -r1.1 Makefile
--- share/btrace/Makefile   23 Oct 2021 19:37:35 -  1.1
+++ share/btrace/Makefile   30 Apr 2022 19:18:25 -
@@ -4,7 +4,7 @@ FILES=  kprofile.bt runqlat.bt
 NOOBJ= noobj
 
 install:
-   ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} -m 444 ${FILES} \
+   ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} -m 555 ${FILES} \
${DESTDIR}${BINDIR}/btrace
 
 .include <bsd.prog.mk>
Index: share/btrace/kprofile.bt
===
RCS file: /data/mirror/openbsd/cvs/src/share/btrace/kprofile.bt,v
retrieving revision 1.1
diff -u -p -r1.1 kprofile.bt
--- share/btrace/kprofile.bt23 Oct 2021 19:37:35 -  1.1
+++ share/btrace/kprofile.bt30 Apr 2022 19:20:32 -
@@ -1,3 +1,6 @@
+#!/usr/sbin/btrace
+/* $OpenBSD$   */
+
 /*
  * kprofile.bt Kernel profiling (stack sampling) at 100Hz.
  *
Index: share/btrace/runqlat.bt
===
RCS file: /data/mirror/openbsd/cvs/src/share/btrace/runqlat.bt,v
retrieving revision 1.2
diff -u -p -r1.2 runqlat.bt
--- share/btrace/runqlat.bt 24 Oct 2021 08:42:38 -  1.2
+++ share/btrace/runqlat.bt 30 Apr 2022 19:20:38 -
@@ -1,3 +1,6 @@
+#!/usr/sbin/btrace
+/* $OpenBSD$   */
+
 /*
  * runqlat.bt  Measure run queue latency (aka scheduler latency). OpenBSD.
  *



Re: simplify rttimer api

2022-04-29 Thread Alexander Bluhm
On Fri, Apr 29, 2022 at 11:58:52AM +0200, Claudio Jeker wrote:
> The callback currently uses struct rttimer as an argument but the code
> only needs the rtt_tableid element from there. Change the callbacks to
> be of the form void (*rtt_callback)(struct rtentry *r, u_int rtableid)
> 
> Also change the default rttimer callback (in case the function is NULL)
> to only handle routes with RTF_HOST and RTF_DYNAMIC set. By doing this
> two rttimer queues can be switched to a NULL callback. The other option
> would be to require always a callback. Right now nothing uses the default
> so it should be removed or made useful.
> 
> As a next step I plan to move the callback to struct rttimer_queue since
> all rt_timer_add calls use the same callback.

OK bluhm@
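
For illustration, a callback with the new signature only receives the
route and its rtable id.  A minimal sketch, mirroring the default
RTTIMER_CALLOUT case from the diff below (example_timeout is a
made-up name):

	void
	example_timeout(struct rtentry *rt, u_int rtableid)
	{
		struct ifnet *ifp;

		ifp = if_get(rt->rt_ifidx);
		if (ifp != NULL &&
		    (rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) ==
		    (RTF_DYNAMIC|RTF_HOST))
			rtdeletemsg(rt, ifp, rtableid);
		if_put(ifp);
	}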

> Index: net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.407
> diff -u -p -r1.407 route.c
> --- net/route.c   28 Apr 2022 17:47:41 -  1.407
> +++ net/route.c   29 Apr 2022 09:42:09 -
> @@ -1366,12 +1366,14 @@ LIST_HEAD(, rttimer_queue)rttimer_queue
>  
>  #define RTTIMER_CALLOUT(r)   {   \
>   if (r->rtt_func != NULL) {  \
> - (*r->rtt_func)(r->rtt_rt, r);   \
> + (*r->rtt_func)(r->rtt_rt, r->rtt_tableid);  \
>   } else {\
>   struct ifnet *ifp;  \
>   \
>   ifp = if_get(r->rtt_rt->rt_ifidx);  \
> - if (ifp != NULL)\
> + if (ifp != NULL &&  \
> + (r->rtt_rt->rt_flags & (RTF_DYNAMIC|RTF_HOST)) ==   \
> + (RTF_DYNAMIC|RTF_HOST)) \
>   rtdeletemsg(r->rtt_rt, ifp, r->rtt_tableid);\
>   if_put(ifp);\
>   }   \
> @@ -1484,8 +1486,8 @@ rt_timer_remove_all(struct rtentry *rt)
>  }
>  
>  int
> -rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *,
> -struct rttimer *), struct rttimer_queue *queue, u_int rtableid)
> +rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *, u_int),
> + struct rttimer_queue *queue, u_int rtableid)
>  {
>   struct rttimer  *r, *rnew;
>   time_t   current_time;
> Index: net/route.h
> ===
> RCS file: /cvs/src/sys/net/route.h,v
> retrieving revision 1.191
> diff -u -p -r1.191 route.h
> --- net/route.h   28 Apr 2022 17:47:41 -  1.191
> +++ net/route.h   29 Apr 2022 09:30:27 -
> @@ -411,7 +411,7 @@ struct rttimer {
>   struct rttimer_queue*rtt_queue; /* [T] back pointer to queue */
>   struct rtentry  *rtt_rt;/* [I] back pointer to route */
>   void(*rtt_func) /* [I] callback */
> - (struct rtentry *, struct rttimer *);
> + (struct rtentry *, u_int);
>   time_t  rtt_time;   /* [I] when timer registered */
>   u_int   rtt_tableid;/* [I] rtable id of rtt_rt */
>  };
> @@ -459,7 +459,7 @@ struct rtentry *rt_getll(struct rtentry 
>  
>  void  rt_timer_init(void);
>  int   rt_timer_add(struct rtentry *,
> - void(*)(struct rtentry *, struct rttimer *),
> + void(*)(struct rtentry *, u_int),
>   struct rttimer_queue *, u_int);
>  void  rt_timer_remove_all(struct rtentry *);
>  struct rttimer_queue *rt_timer_queue_create(int);
> Index: netinet/ip_icmp.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
> retrieving revision 1.188
> diff -u -p -r1.188 ip_icmp.c
> --- netinet/ip_icmp.c 20 Apr 2022 09:38:26 -  1.188
> +++ netinet/ip_icmp.c 29 Apr 2022 09:41:25 -
> @@ -132,9 +132,8 @@ const struct sysctl_bounded_args icmpctl
>  };
>  
>  
> -void icmp_mtudisc_timeout(struct rtentry *, struct rttimer *);
> +void icmp_mtudisc_timeout(struct rtentry *, u_int);
>  int icmp_ratelimit(const struct in_addr *, const int, const int);
> -void icmp_redirect_timeout(struct rtentry *, struct rttimer *);
>  int icmp_input_if(struct ifnet *, struct mbuf **, int *, int, int);
>  int icmp_sysctl_icmpstat(void *, size_t *, void *);
>  
> @@ -634,8 +633,8 @@ reflect:
>   rtredirect(sintosa(&sdst), sintosa(&sgw),
>   sintosa(&ssrc), &newrt, m->m_pkthdr.ph_rtableid);
>   if (newrt != NULL && icmp_redirtimeout > 0) {

Re: kstat(1): implement wait with setitimer(2)

2022-04-29 Thread Alexander Bluhm
On Thu, Apr 28, 2022 at 08:54:02PM -0500, Scott Cheloha wrote:
> On Thu, Sep 17, 2020 at 06:29:48PM -0500, Scott Cheloha wrote:
> > [...]
> > 
> > Using nanosleep(2) to print the stats periodically causes the period
> > to drift.  If you use setitimer(2) it won't drift.
> > 
> > ok?
> 
> 19 month bump and rebase.
> 
> I have updated the patch according to input from kn@.
> 
> Once again, using nanosleep(2) here to print the stats periodically is
> flawed.  The period will drift.  Using setitimer(2)/sigsuspend(2) is
> better.
> 
> While here:
> 
> - We don't need the hundred million second upper bound anymore.  Just
>   cap the wait at UINT_MAX seconds.
> 
> - Use the idiomatic strtonum(3) error message format, it works here.
> 
> ok?

I would prefer to block the alarm signal with sigprocmask(2) and
only catch it during sigsuspend(2).  Although the timeout should
only happen while we sleep, blocking signals while we don't expect
them gives me a better feeling.
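
The resulting pattern, as a standalone sketch (handle_alrm() is the
empty handler from the diff):

	sigset_t empty, mask;

	sigemptyset(&empty);
	sigemptyset(&mask);
	sigaddset(&mask, SIGALRM);
	/* keep SIGALRM blocked except while actually waiting */
	if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
		err(1, "sigprocmask");
	for (;;) {
		sigsuspend(&empty);	/* unblock and wait atomically */
		/* ... periodic work ... */
	}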

Please check the error code of signal(3).

otherwise diff looks good to me

> Index: kstat.c
> ===
> RCS file: /cvs/src/usr.bin/kstat/kstat.c,v
> retrieving revision 1.9
> diff -u -p -r1.9 kstat.c
> --- kstat.c   22 Apr 2022 00:29:20 -  1.9
> +++ kstat.c   29 Apr 2022 01:43:31 -
> @@ -15,6 +15,8 @@
>   */
>  
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -104,6 +106,7 @@ kstat_cmp(const struct kstat_entry *ea, 
>  RBT_PROTOTYPE(kstat_tree, kstat_entry, entry, kstat_cmp);
>  RBT_GENERATE(kstat_tree, kstat_entry, entry, kstat_cmp);
>  
> +static void handle_alrm(int);
>  static struct kstat_filter *
>   kstat_filter_parse(char *);
>  static int   kstat_filter_entry(struct kstat_filters *,
> @@ -134,16 +137,17 @@ main(int argc, char *argv[])
>   int fd;
>   const char *errstr;
>   int ch;
> - struct timespec interval = { 0, 0 };
> + struct itimerval itv;
> + unsigned int wait = 0;
> + sigset_t empty;
>   int i;
>  
>   while ((ch = getopt(argc, argv, "w:")) != -1) {
>   switch (ch) {
>   case 'w':
> - interval.tv_sec = strtonum(optarg, 1, 100000000,
> -     &errstr);
> + wait = strtonum(optarg, 1, UINT_MAX, &errstr);
>   if (errstr != NULL)
> - errx(1, "wait %s: %s", optarg, errstr);
> + errx(1, "wait is %s: %s", errstr, optarg);
>   break;
>   default:
>   usage();
> @@ -165,15 +169,21 @@ main(int argc, char *argv[])
>   if (ioctl(fd, KSTATIOC_VERSION, ) == -1)
>   err(1, "kstat version");
>  
> - kstat_list(&kt, fd, version, &kfs);
> - kstat_print(&kt);
> -
> - if (interval.tv_sec == 0)
> + if (wait == 0) {
> + kstat_list(&kt, fd, version, &kfs);
> + kstat_print(&kt);
>   return (0);
> + }
>  
> + sigemptyset(&empty);
> + signal(SIGALRM, handle_alrm);
> + itv.it_value.tv_sec = wait;
> + itv.it_value.tv_usec = 0;
> + itv.it_interval = itv.it_value;
> + if (setitimer(ITIMER_REAL, &itv, NULL) == -1)
> + err(1, "setitimer");
>   for (;;) {
> - nanosleep(&interval, NULL);
> -
> + sigsuspend(&empty);
>   kstat_read(&kt, fd);
>   kstat_print(&kt);
>   }
> @@ -547,4 +557,9 @@ kstat_read(struct kstat_tree *kt, int fd
>   if (ioctl(fd, KSTATIOC_FIND_ID, ksreq) == -1)
>   err(1, "update id %llu", ksreq->ks_id);
>   }
> +}
> +
> +static void
> +handle_alrm(int signo)
> +{
>  }



Re: [External] : Re: pf igmp icmp6 multicast router alert

2022-04-28 Thread Alexander Bluhm
On Thu, Apr 28, 2022 at 08:15:20AM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> On Thu, Apr 28, 2022 at 12:36:40AM +0200, Alexander Bluhm wrote:
> > On Wed, Apr 27, 2022 at 11:47:45PM +0200, Alexander Bluhm wrote:
> > > New diff:
> > > - make off and end relative to opts array
> > > - check length of IPv4 options
> > > - fix call to pf_walk_option
> > > - add case IP6OPT_PADN
> > > - add case MLDV2_LISTENER_REPORT
> > 
> > - pf_pull_hdr() before pf_walk_option6() was missing
> > 
> > ok?
> 
> diff reads OK to me as far as I can tell.
> 
> 
> OK sashan

Thanks.  regress/sys/netinet6/frag6 found a small issue.  If the
icmp6 header is fragmented, we cannot pull the icmp6 header.  I had
to copy the fragment check to the beginning of case IPPROTO_ICMPV6.

This chunk is new:
+   case IPPROTO_ICMPV6:
+   /* fragments may be short, ignore inner header then */
+   if (pd->fragoff != 0 && end < pd->off + sizeof(icmp6)) {
+   pd->off = pd->fragoff;
+   pd->proto = IPPROTO_FRAGMENT;
+   return (PF_PASS);
+   }

Although it is questionable if we should allow fragmented header
chains, I don't want to change behavior here.  If I recall correctly
newer RFCs forbid fragmented header chains.  But I had implemented
this code before the IPv6 standards discovered the security
implications.

I am currently running a full regress.

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1126
diff -u -p -r1.1126 pf.c
--- net/pf.c17 Mar 2022 18:27:55 -  1.1126
+++ net/pf.c28 Apr 2022 20:13:00 -
@@ -227,6 +227,8 @@ u_int16_tpf_calc_mss(struct pf_addr *
 static __inline int pf_set_rt_ifp(struct pf_state *, struct pf_addr *,
sa_family_t, struct pf_src_node **);
 struct pf_divert   *pf_get_divert(struct mbuf *);
+int pf_walk_option(struct pf_pdesc *, struct ip *,
+   int, int, u_short *);
 int pf_walk_header(struct pf_pdesc *, struct ip *,
u_short *);
 int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
@@ -3956,7 +3958,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
rtable_l2(ctx.act.rtableid) != pd->rdomain)
pd->destchg = 1;
 
-   if (r->action == PF_PASS && pd->badopts && ! r->allow_opts) {
+   if (r->action == PF_PASS && pd->badopts != 0 && ! r->allow_opts) {
REASON_SET(&reason, PFRES_IPOPTIONS);
 #if NPFLOG > 0
pd->pflog |= PF_LOG_FORCE;
@@ -6382,6 +6384,55 @@ pf_get_divert(struct mbuf *m)
 }
 
 int
+pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
+u_short *reason)
+{
+   uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
+
+   KASSERT(end - off <= sizeof(opts));
+   m_copydata(pd->m, off, end - off, opts);
+   end -= off;
+   off = 0;
+
+   while (off < end) {
+   type = opts[off];
+   if (type == IPOPT_EOL)
+   break;
+   if (type == IPOPT_NOP) {
+   off++;
+   continue;
+   }
+   if (off + 2 > end) {
+   DPFPRINTF(LOG_NOTICE, "IP length opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   length = opts[off + 1];
+   if (length < 2) {
+   DPFPRINTF(LOG_NOTICE, "IP short opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   if (off + length > end) {
+   DPFPRINTF(LOG_NOTICE, "IP long opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   switch (type) {
+   case IPOPT_RA:
+   SET(pd->badopts, PF_OPT_ROUTER_ALERT);
+   break;
+   default:
+   SET(pd->badopts, PF_OPT_OTHER);
+   break;
+   }
+   off += length;
+   }
+
+   return (PF_PASS);
+}
+
+int
 pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
 {
struct ip6_ext   ext;
@@ -6393,11 +6444,20 @@ pf_walk_header(struct pf_pdesc *pd, stru
REASON_SET(reason, PFRES_SHORT);
return (PF_DROP);
}
-   if (hlen !

Re: router timer mutex

2022-04-28 Thread Alexander Bluhm
I still need an ok for this diff.  It is the final step before we
can run IP forwarding in parallel.
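
The idea, reduced to a sketch (the full diff is quoted below):

	struct rttimer *r;
	TAILQ_HEAD(, rttimer) rttlist;

	TAILQ_INIT(&rttlist);
	mtx_enter(&rttimer_mtx);
	/* unlink under the mutex, collect on a local list */
	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
	}
	mtx_leave(&rttimer_mtx);

	/* run callbacks and free without holding the mutex */
	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
		TAILQ_REMOVE(&rttlist, r, rtt_next);
		RTTIMER_CALLOUT(r);
		pool_put(&rttimer_pool, r);
	}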

bluhm

On Thu, Apr 21, 2022 at 05:44:17PM +0200, Alexander Bluhm wrote:
> On Wed, Apr 20, 2022 at 08:12:51PM +0200, Alexander Bluhm wrote:
> > mvs@ reminded me of a crash I have seen in December.  Route timers
> > are not MP safe, but I think this can be fixed with a mutex.  The
> > idea is to protect the global lists with a mutex and move the rttimer
> > into a temporary list.  Then the callback and pool put can be called
> > later without mutex.
> 
> I have a tiny update to the diff.
> 
> - Global locks are documented with capital letter, so use [T].
> 
> - rt_timer_add() grabbed the mutex twice, first remove then add.
>   Better exchange in one critical section.  pool_get before and
>   pool_put after.
> 
> ok?
> 
> Index: net/route.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
> retrieving revision 1.406
> diff -u -p -r1.406 route.c
> --- net/route.c   20 Apr 2022 17:58:22 -  1.406
> +++ net/route.c   21 Apr 2022 13:31:52 -
> @@ -1361,7 +1361,8 @@ rt_ifa_purge_walker(struct rtentry *rt, 
>   * for multiple queues for efficiency's sake...
>   */
>  
> -LIST_HEAD(, rttimer_queue)   rttimer_queue_head;
> +struct mutex rttimer_mtx;
> +LIST_HEAD(, rttimer_queue)   rttimer_queue_head; /* [T] */
>  
>  #define RTTIMER_CALLOUT(r)   {   \
>   if (r->rtt_func != NULL) {  \
> @@ -1393,6 +1394,7 @@ rt_timer_init(void)
>   pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
>   IPL_MPFLOOR, 0, "rttmrq", NULL);
>  
> + mtx_init(&rttimer_mtx, IPL_MPFLOOR);
>   LIST_INIT(&rttimer_queue_head);
>   timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
>   timeout_add_sec(&rt_timer_timeout, 1);
> @@ -1408,7 +1410,10 @@ rt_timer_queue_create(int timeout)
>   rtq->rtq_timeout = timeout;
>   rtq->rtq_count = 0;
>   TAILQ_INIT(&rtq->rtq_head);
> +
> + mtx_enter(&rttimer_mtx);
>   LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
> + mtx_leave(&rttimer_mtx);
>  
>   return (rtq);
>  }
> @@ -1416,28 +1421,36 @@ rt_timer_queue_create(int timeout)
>  void
>  rt_timer_queue_change(struct rttimer_queue *rtq, int timeout)
>  {
> + mtx_enter(&rttimer_mtx);
>   rtq->rtq_timeout = timeout;
> + mtx_leave(&rttimer_mtx);
>  }
>  
>  void
>  rt_timer_queue_destroy(struct rttimer_queue *rtq)
>  {
> - struct rttimer  *r;
> + struct rttimer  *r;
> + TAILQ_HEAD(, rttimer)rttlist;
>  
>   NET_ASSERT_LOCKED();
>  
> + TAILQ_INIT(&rttlist);
> + mtx_enter(&rttimer_mtx);
>   while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
>   LIST_REMOVE(r, rtt_link);
>   TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
> + TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
> + KASSERT(rtq->rtq_count > 0);
> + rtq->rtq_count--;
> + }
> + LIST_REMOVE(rtq, rtq_link);
> + mtx_leave(&rttimer_mtx);
> +
> + while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
> + TAILQ_REMOVE(&rttlist, r, rtt_next);
>   RTTIMER_CALLOUT(r);
>   pool_put(&rttimer_pool, r);
> - if (rtq->rtq_count > 0)
> - rtq->rtq_count--;
> - else
> - printf("rt_timer_queue_destroy: rtq_count reached 0\n");
>   }
> -
> - LIST_REMOVE(rtq, rtq_link);
>   pool_put(&rttimer_queue_pool, rtq);
>  }
>  
> @@ -1450,15 +1463,22 @@ rt_timer_queue_count(struct rttimer_queu
>  void
>  rt_timer_remove_all(struct rtentry *rt)
>  {
> - struct rttimer  *r;
> + struct rttimer  *r;
> + TAILQ_HEAD(, rttimer)rttlist;
>  
> + TAILQ_INIT(&rttlist);
> + mtx_enter(&rttimer_mtx);
>   while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
>   LIST_REMOVE(r, rtt_link);
>   TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
> - if (r->rtt_queue->rtq_count > 0)
> - r->rtt_queue->rtq_count--;
> - else
> - printf("rt_timer_remove_all: rtq_count reached 0\n");
> + TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
> + KASSERT(r->rtt_queue->rtq_count > 0);
> + r->rtt_queue->rtq_count--;
> + }
> + mtx_leave(&rttimer_mtx);
> +
> + while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
> + TAILQ_REMOVE(&rttlist, r, rtt_next);
>   pool_put(&rttimer_pool, r);
>   }
>  }
> @@ -1467,12 

Re: cleanup multicast rttimer queues

2022-04-28 Thread Alexander Bluhm
On Wed, Apr 27, 2022 at 12:10:59PM +0200, Claudio Jeker wrote:
> There is no need to have a rttimer queue per rdomain. The rttimer itself
> is rdomain aware and so this just make everything more complicated for no
> gain.
> 
> This diff just drops back to a single queue and initializes the queues in
> ip_init() and the IPv6 counterpart. I have no mrouter setup to test this
> but it compiles and I see no reason why it would not work.

I have tested it with regress/sys/netinet/mcast and
regress/sys/netinet6/mcast6.  They include a minimal multicast router.

OK bluhm@ with two nits.

Could you put these declarations into header files?
Should these globals have an ip_ prefix?

>  #ifdef MROUTING
>  extern int ip_mrtproto;
> +extern struct rttimer_queue *mrouterq;
>  #endif

> +#ifdef MROUTING
> +extern struct rttimer_queue *mrouter6q;
> +#endif



Re: pf igmp icmp6 multicast router alert

2022-04-27 Thread Alexander Bluhm
On Wed, Apr 27, 2022 at 11:47:45PM +0200, Alexander Bluhm wrote:
> New diff:
> - make off and end relative to opts array
> - check length of IPv4 options
> - fix call to pf_walk_option
> - add case IP6OPT_PADN
> - add case MLDV2_LISTENER_REPORT

- pf_pull_hdr() before pf_walk_option6() was missing

ok?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1126
diff -u -p -r1.1126 pf.c
--- net/pf.c17 Mar 2022 18:27:55 -  1.1126
+++ net/pf.c27 Apr 2022 22:28:38 -
@@ -227,6 +227,8 @@ u_int16_tpf_calc_mss(struct pf_addr *
 static __inline int pf_set_rt_ifp(struct pf_state *, struct pf_addr *,
sa_family_t, struct pf_src_node **);
 struct pf_divert   *pf_get_divert(struct mbuf *);
+int pf_walk_option(struct pf_pdesc *, struct ip *,
+   int, int, u_short *);
 int pf_walk_header(struct pf_pdesc *, struct ip *,
u_short *);
 int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
@@ -3956,7 +3958,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
rtable_l2(ctx.act.rtableid) != pd->rdomain)
pd->destchg = 1;
 
-   if (r->action == PF_PASS && pd->badopts && ! r->allow_opts) {
+   if (r->action == PF_PASS && pd->badopts != 0 && ! r->allow_opts) {
REASON_SET(&reason, PFRES_IPOPTIONS);
 #if NPFLOG > 0
pd->pflog |= PF_LOG_FORCE;
@@ -6382,6 +6384,55 @@ pf_get_divert(struct mbuf *m)
 }
 
 int
+pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
+u_short *reason)
+{
+   uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
+
+   KASSERT(end - off <= sizeof(opts));
+   m_copydata(pd->m, off, end - off, opts);
+   end -= off;
+   off = 0;
+
+   while (off < end) {
+   type = opts[off];
+   if (type == IPOPT_EOL)
+   break;
+   if (type == IPOPT_NOP) {
+   off++;
+   continue;
+   }
+   if (off + 2 > end) {
+   DPFPRINTF(LOG_NOTICE, "IP length opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   length = opts[off + 1];
+   if (length < 2) {
+   DPFPRINTF(LOG_NOTICE, "IP short opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   if (off + length > end) {
+   DPFPRINTF(LOG_NOTICE, "IP long opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   switch (type) {
+   case IPOPT_RA:
+   SET(pd->badopts, PF_OPT_ROUTER_ALERT);
+   break;
+   default:
+   SET(pd->badopts, PF_OPT_OTHER);
+   break;
+   }
+   off += length;
+   }
+
+   return (PF_PASS);
+}
+
+int
 pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
 {
struct ip6_ext   ext;
@@ -6393,11 +6444,20 @@ pf_walk_header(struct pf_pdesc *pd, stru
REASON_SET(reason, PFRES_SHORT);
return (PF_DROP);
}
-   if (hlen != sizeof(struct ip))
-   pd->badopts++;
+   if (hlen != sizeof(struct ip)) {
+   if (pf_walk_option(pd, h, pd->off + sizeof(struct ip),
+   pd->off + hlen, reason) != PF_PASS)
+   return (PF_DROP);
+   /* header options which contain only padding is fishy */
+   if (pd->badopts == 0)
+   SET(pd->badopts, PF_OPT_OTHER);
+   }
end = pd->off + ntohs(h->ip_len);
pd->off += hlen;
pd->proto = h->ip_p;
+   /* IGMP packets have router alert options, allow them */
+   if (pd->proto == IPPROTO_IGMP)
+   CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
/* stop walking over non initial fragments */
if ((h->ip_off & htons(IP_OFFMASK)) != 0)
return (PF_PASS);
@@ -6455,7 +6515,10 @@ pf_walk_option6(struct pf_pdesc *pd, str
return (PF_DROP);
}
switch (opt.ip6o_type) {
+   case IP6OPT_PADN:
+   break;
case IP6OPT_JUMBO:
+   SET(pd->badopts, PF_OPT_JUMBO);
if (pd->jumbolen != 0) {
   

Re: [External] : Re: pf igmp icmp6 multicast router alert

2022-04-27 Thread Alexander Bluhm
On Fri, Apr 22, 2022 at 07:40:17PM +0200, Alexandr Nedvedicky wrote:
> > +   case IPPROTO_ICMPV6:
> > +   if (!pf_pull_hdr(pd->m, pd->off, &icmp6, sizeof(icmp6),
> > +   NULL, reason, AF_INET6)) {
> > +   DPFPRINTF(LOG_NOTICE, "IPv6 short icmp6hdr");
> > +   return (PF_DROP);
> > +   }
> > +   /* ICMP multicast packets have router alert options */
> > +   switch (icmp6.icmp6_type) {
> > +   case MLD_LISTENER_QUERY:
> > +   case MLD_LISTENER_REPORT:
> > +   case MLD_LISTENER_DONE:
> 
> I wonder if we should have a similar check we have for IPv4 address,
> where we require a multicast address. for example in case of
> MLD_LISTENER_QUERY the packet destination address should be fe80::/10.
> I need to look at RFCs more closely first. Just asking in case someone 
> else
> knows from top of the head.

Where do we check multicast address for IPv4?  At this point we
are just comparing protocol and IP options.  I would not make it
more complex, so I will not add multicast address checks here.

bluhm



Re: pf igmp icmp6 multicast router alert

2022-04-27 Thread Alexander Bluhm
On Fri, Apr 22, 2022 at 09:03:45PM +0200, Otto Moerbeek wrote:
> On Fri, Apr 22, 2022 at 05:59:18PM +0200, Alexander Bluhm wrote:
> 
> > On Thu, Apr 21, 2022 at 09:10:02PM +0200, Alexander Bluhm wrote:
> > > The option I have ever seen in the wild is router alert.  So it may
> > > be better to allow IGMP and ICMP6 multicast if router alert is the
> > > only option in the packet.
> > 
> > This diff implements exactly that.  I have only compile tested it.
> > If we decide that is the way to go, I will adapt my pf regress.
> 
> A quick test shows that this still blocks these:
> 
> 21:00:35.009640 fe80::aef8:ccff:feca:428c > ff02::1: HBH icmp6: multicast 
> listener query v2 [|icmp6] [hlim 1]
This is v2

New diff:
- make off and end relative to opts array
- check length of IPv4 options
- fix call to pf_walk_option
- add case IP6OPT_PADN
- add case MLDV2_LISTENER_REPORT

I have written some regression tests that deal with pf IP options.

ok?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1126
diff -u -p -r1.1126 pf.c
--- net/pf.c17 Mar 2022 18:27:55 -  1.1126
+++ net/pf.c27 Apr 2022 21:33:36 -
@@ -227,6 +227,8 @@ u_int16_tpf_calc_mss(struct pf_addr *
 static __inline int pf_set_rt_ifp(struct pf_state *, struct pf_addr *,
sa_family_t, struct pf_src_node **);
 struct pf_divert   *pf_get_divert(struct mbuf *);
+int pf_walk_option(struct pf_pdesc *, struct ip *,
+   int, int, u_short *);
 int pf_walk_header(struct pf_pdesc *, struct ip *,
u_short *);
 int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
@@ -3956,7 +3958,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
rtable_l2(ctx.act.rtableid) != pd->rdomain)
pd->destchg = 1;
 
-   if (r->action == PF_PASS && pd->badopts && ! r->allow_opts) {
+   if (r->action == PF_PASS && pd->badopts != 0 && ! r->allow_opts) {
REASON_SET(&reason, PFRES_IPOPTIONS);
 #if NPFLOG > 0
pd->pflog |= PF_LOG_FORCE;
@@ -6382,6 +6384,55 @@ pf_get_divert(struct mbuf *m)
 }
 
 int
+pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
+u_short *reason)
+{
+   uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
+
+   KASSERT(end - off <= sizeof(opts));
+   m_copydata(pd->m, off, end - off, opts);
+   end -= off;
+   off = 0;
+
+   while (off < end) {
+   type = opts[off];
+   if (type == IPOPT_EOL)
+   break;
+   if (type == IPOPT_NOP) {
+   off++;
+   continue;
+   }
+   if (off + 2 > end) {
+   DPFPRINTF(LOG_NOTICE, "IP length opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   length = opts[off + 1];
+   if (length < 2) {
+   DPFPRINTF(LOG_NOTICE, "IP short opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   if (off + length > end) {
+   DPFPRINTF(LOG_NOTICE, "IP long opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   switch (type) {
+   case IPOPT_RA:
+   SET(pd->badopts, PF_OPT_ROUTER_ALERT);
+   break;
+   default:
+   SET(pd->badopts, PF_OPT_OTHER);
+   break;
+   }
+   off += length;
+   }
+
+   return (PF_PASS);
+}
+
+int
 pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
 {
struct ip6_ext   ext;
@@ -6393,11 +6444,20 @@ pf_walk_header(struct pf_pdesc *pd, stru
REASON_SET(reason, PFRES_SHORT);
return (PF_DROP);
}
-   if (hlen != sizeof(struct ip))
-   pd->badopts++;
+   if (hlen != sizeof(struct ip)) {
+   if (pf_walk_option(pd, h, pd->off + sizeof(struct ip),
+   pd->off + hlen, reason) != PF_PASS)
+   return (PF_DROP);
+   /* header options which contain only padding is fishy */
+   if (pd->badopts == 0)
+   SET(pd->badopts, PF_OPT_OTHER);
+   }
end = pd->off + ntohs(h->ip_len);
pd->off += hlen;
pd->proto = h->ip_p;
+   /* IGMP packets have router aler

Re: vers.c: make kernel date in UTC

2022-04-25 Thread Alexander Bluhm
On Sat, Apr 23, 2022 at 09:20:06AM +0200, Sebastien Marie wrote:
> Would such a diff to force UTC timezone in the kernel build date be acceptable?

I always convert Canada Mountain time to UTC in my head to estimate
whether a commit may be in a snapshot used in regress testing.

Having everything in UTC would make my life easier.

bluhm

> diff 62198fa5a9d005ca1c651b3df2c33ce50d333b27 /home/semarie/repos/openbsd/src
> blob - ab97ce4c59639a6b357120b9a953c3584211aea2
> file + sys/conf/newvers.sh
> --- sys/conf/newvers.sh
> +++ sys/conf/newvers.sh
> @@ -40,7 +40,7 @@ then
>  fi
>  
>  touch version
> -v=`cat version` u=`logname` d=${PWD%/obj} h=`hostname` t=`date`
> +v=`cat version` u=`logname` d=${PWD%/obj} h=`hostname` t=`date -z UTC`
>  id=`basename "${d}"`
>  
>  # additional things which need version number upgrades:



Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-04-25 Thread Alexander Bluhm
On Mon, Apr 25, 2022 at 02:52:08PM +, Visa Hankala wrote:
> 
> The patch uses membar_sync(), and not membar_enter(), after the loop
> in refcnt_finalize() because subsequent memory operations should hinge
> on the load of r_refs.
> 
> membar_enter() is usable when the reference point is a store.
> 
> > The other issue I have with the diff is that it documentations the
> > memory ordering in terms of acquire and release which is not what we
> > do in other places such as the membar_enter(9) man page.  Maybe this
> > should explicitly call out the memory ordering like what the Linux
> > comment does.
> 
> I have updated the documentation, though I am not sure if the outcome
> is an improvement.

OK bluhm@
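
In practice the guarantee reads like this sketch (obj and its members
are hypothetical):

	/* writer: update the object, then drop the reference */
	obj->o_state = new_state;	/* ordered before the release */
	if (refcnt_rele(&obj->o_refcnt)) {
		/* last reference: all prior updates are visible here */
		obj_free(obj);
	}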

> Index: share/man/man9/refcnt_init.9
> ===
> RCS file: src/share/man/man9/refcnt_init.9,v
> retrieving revision 1.2
> diff -u -p -r1.2 refcnt_init.9
> --- share/man/man9/refcnt_init.9  16 Mar 2022 14:13:01 -  1.2
> +++ share/man/man9/refcnt_init.9  25 Apr 2022 14:34:05 -
> @@ -74,6 +74,17 @@ There may only be one caller to
>  per refcnt
>  .Fa r .
>  .Pp
> +.Fn refcnt_rele ,
> +.Fn refcnt_rele_wake
> +and
> +.Fn refcnt_finalize
> +order prior memory loads and stores before the release of the reference.
> +The functions enforce control dependency so that after the final reference
> +has been released, subsequent loads and stores happen after the release.
> +These ensure that concurrent accesses cease before the object's destructor
> +runs and that the destructor sees all updates done during the lifetime
> +of the object.
> +.Pp
>  .Fn refcnt_shared
>  tests if the object has multiple references.
>  .Pp
> Index: sys/kern/kern_synch.c
> ===
> RCS file: src/sys/kern/kern_synch.c,v
> retrieving revision 1.185
> diff -u -p -r1.185 kern_synch.c
> --- sys/kern/kern_synch.c 18 Mar 2022 15:32:06 -  1.185
> +++ sys/kern/kern_synch.c 25 Apr 2022 14:34:05 -
> @@ -822,9 +822,14 @@ refcnt_rele(struct refcnt *r)
>  {
>   u_int refs;
>  
> + membar_exit_before_atomic();
>   refs = atomic_dec_int_nv(&r->r_refs);
>   KASSERT(refs != ~0);
> - return (refs == 0);
> + if (refs == 0) {
> + membar_enter_after_atomic();
> + return (1);
> + }
> + return (0);
>  }
>  
>  void
> @@ -840,6 +845,7 @@ refcnt_finalize(struct refcnt *r, const 
>   struct sleep_state sls;
>   u_int refs;
>  
> + membar_exit_before_atomic();
>   refs = atomic_dec_int_nv(&r->r_refs);
>   KASSERT(refs != ~0);
>   while (refs) {
> @@ -847,6 +853,8 @@ refcnt_finalize(struct refcnt *r, const 
>   refs = atomic_load_int(&r->r_refs);
>   sleep_finish(&sls, refs);
>   }
> + /* Order subsequent loads and stores after refs == 0 load. */
> + membar_sync();
>  }
>  
>  int



Re: pf igmp icmp6 multicast router alert

2022-04-22 Thread Alexander Bluhm
On Thu, Apr 21, 2022 at 09:10:02PM +0200, Alexander Bluhm wrote:
> The only option I have ever seen in the wild is router alert.  So it may
> be better to allow IGMP and ICMP6 multicast if router alert is the
> only option in the packet.

This diff implements exactly that.  I have only compile tested it.
If we decide that is the way to go, I will adapt my pf regress.

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1126
diff -u -p -r1.1126 pf.c
--- net/pf.c17 Mar 2022 18:27:55 -  1.1126
+++ net/pf.c22 Apr 2022 15:52:36 -
@@ -227,6 +227,8 @@ u_int16_tpf_calc_mss(struct pf_addr *
 static __inline int pf_set_rt_ifp(struct pf_state *, struct pf_addr *,
sa_family_t, struct pf_src_node **);
 struct pf_divert   *pf_get_divert(struct mbuf *);
+int pf_walk_option(struct pf_pdesc *, struct ip *,
+   int, int, u_short *);
 int pf_walk_header(struct pf_pdesc *, struct ip *,
u_short *);
 int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
@@ -3956,7 +3958,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
rtable_l2(ctx.act.rtableid) != pd->rdomain)
pd->destchg = 1;
 
-   if (r->action == PF_PASS && pd->badopts && ! r->allow_opts) {
+   if (r->action == PF_PASS && pd->badopts != 0 && ! r->allow_opts) {
REASON_SET(&reason, PFRES_IPOPTIONS);
 #if NPFLOG > 0
pd->pflog |= PF_LOG_FORCE;
@@ -6382,6 +6384,43 @@ pf_get_divert(struct mbuf *m)
 }
 
 int
+pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
+u_short *reason)
+{
+   uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
+
+   KASSERT(end - off <= sizeof(opts));
+   m_copydata(pd->m, off, end - off, opts);
+
+   while (off < end) {
+   type = opts[off - sizeof(struct ip)];
+   if (type == IPOPT_EOL)
+   break;
+   if (type == IPOPT_NOP) {
+   off++;
+   continue;
+   }
+   length = opts[off - sizeof(struct ip)];
+   if (off + length > end) {
+   DPFPRINTF(LOG_NOTICE, "IP long opt");
+   REASON_SET(reason, PFRES_IPOPTIONS);
+   return (PF_DROP);
+   }
+   switch (type) {
+   case IPOPT_RA:
+   SET(pd->badopts, PF_OPT_ROUTER_ALERT);
+   break;
+   default:
+   SET(pd->badopts, PF_OPT_OTHER);
+   break;
+   }
+   off += length;
+   }
+
+   return (PF_PASS);
+}
+
+int
 pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
 {
struct ip6_ext   ext;
@@ -6393,11 +6432,18 @@ pf_walk_header(struct pf_pdesc *pd, stru
REASON_SET(reason, PFRES_SHORT);
return (PF_DROP);
}
-   if (hlen != sizeof(struct ip))
-   pd->badopts++;
+   if (hlen != sizeof(struct ip)) {
+   pf_walk_option(pd, h, sizeof(*h), hlen, reason);
+   /* header options which contain only padding is fishy */
+   if (pd->badopts == 0)
+   SET(pd->badopts, PF_OPT_OTHER);
+   }
end = pd->off + ntohs(h->ip_len);
pd->off += hlen;
pd->proto = h->ip_p;
+   /* IGMP packets have router alert options, allow them */
+   if (pd->proto == IPPROTO_IGMP)
+   CLR(pd->badopts, PF_OPT_ROUTER_ALERT);
/* stop walking over non initial fragments */
if ((h->ip_off & htons(IP_OFFMASK)) != 0)
return (PF_PASS);
@@ -6456,6 +6502,7 @@ pf_walk_option6(struct pf_pdesc *pd, str
}
switch (opt.ip6o_type) {
case IP6OPT_JUMBO:
+   SET(pd->badopts, PF_OPT_JUMBO);
if (pd->jumbolen != 0) {
DPFPRINTF(LOG_NOTICE, "IPv6 multiple jumbo");
REASON_SET(reason, PFRES_IPOPTIONS);
@@ -6480,7 +6527,11 @@ pf_walk_option6(struct pf_pdesc *pd, str
return (PF_DROP);
}
break;
+   case IP6OPT_ROUTER_ALERT:
+   SET(pd->badopts, PF_OPT_ROUTER_ALERT);
+   break;
default:
+   SET(pd->badopts, PF_OPT_OTHER);
break;
}
off += sizeof(opt) + opt.ip6o_len;
@@ -6494,6 +6545,7 @@ pf_walk_header6(s

OpenBSD Errata: April 22, 2022 (wifi)

2022-04-22 Thread Alexander Bluhm
Errata patch for wireless drivers in the kernel has been released
for OpenBSD 7.1.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata71.html



Re: refcount btrace

2022-04-21 Thread Alexander Bluhm
I still think it is worth having refcount debugging in the generic
kernel dt(4).  Having tools is easier than first adding printf to
hunt a bug.  I see no downside.
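
As a usage sketch (the index constant is illustrative only; the real
wiring is in the diff below): a subsystem initializes its refcnt with
the index of its static probe, and every take and rele then fires the
probe for dt(4) and btrace(8):

	/* hypothetical index of the refcnt:inpcb probe */
	refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);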

ok?

bluhm

Index: dev/dt/dt_prov_static.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.13
diff -u -p -r1.13 dt_prov_static.c
--- dev/dt/dt_prov_static.c 17 Mar 2022 14:53:59 -  1.13
+++ dev/dt/dt_prov_static.c 21 Apr 2022 21:06:03 -
@@ -87,6 +87,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
 DT_STATIC_PROBE0(smr, wakeup);
 DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
 
+/*
+ * reference counting
+ */
+DT_STATIC_PROBE0(refcnt, none);
+DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
+DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
 
 /*
  * List of all static probes
@@ -127,15 +133,24 @@ struct dt_probe *const dtps_static[] = {
&_DT_STATIC_P(smr, barrier_exit),
&_DT_STATIC_P(smr, wakeup),
&_DT_STATIC_P(smr, thread),
+   /* refcnt */
+   &_DT_STATIC_P(refcnt, none),
+   &_DT_STATIC_P(refcnt, inpcb),
+   &_DT_STATIC_P(refcnt, tdb),
 };
 
+struct dt_probe *const *dtps_index_refcnt;
+
 int
 dt_prov_static_init(void)
 {
int i;
 
-   for (i = 0; i < nitems(dtps_static); i++)
+   for (i = 0; i < nitems(dtps_static); i++) {
+   if (dtps_static[i] == &_DT_STATIC_P(refcnt, none))
+   dtps_index_refcnt = &dtps_static[i];
dt_dev_register_probe(dtps_static[i]);
+   }
 
return i;
 }
Index: dev/dt/dtvar.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 dtvar.h
--- dev/dt/dtvar.h  27 Feb 2022 10:14:01 -  1.13
+++ dev/dt/dtvar.h  21 Apr 2022 21:06:03 -
@@ -313,11 +313,30 @@ extern volatile uint32_t  dt_tracing; /* 
#define	DT_STATIC_ENTER(func, name, args...) do {			\
extern struct dt_probe _DT_STATIC_P(func, name);\
struct dt_probe *dtp = &_DT_STATIC_P(func, name);   \
-   struct dt_provider *dtpv = dtp->dtp_prov;   \
\
if (__predict_false(dt_tracing) &&  \
__predict_false(dtp->dtp_recording)) {  \
+   struct dt_provider *dtpv = dtp->dtp_prov;   \
+   \
dtpv->dtpv_enter(dtpv, dtp, args);  \
+   }   \
+} while (0)
+
+#define _DT_INDEX_P(func)  (dtps_index_##func)
+
+#define DT_INDEX_ENTER(func, index, args...) do {  \
+   extern struct dt_probe **_DT_INDEX_P(func); \
+   \
+   if (__predict_false(dt_tracing) &&  \
+   __predict_false(index > 0) &&   \
+   __predict_true(_DT_INDEX_P(func) != NULL)) {\
+   struct dt_probe *dtp = _DT_INDEX_P(func)[index];\
+   \
+   if(__predict_false(dtp->dtp_recording)) {   \
+   struct dt_provider *dtpv = dtp->dtp_prov;   \
+   \
+   dtpv->dtpv_enter(dtpv, dtp, args);  \
+   }   \
}   \
 } while (0)
 
Index: kern/kern_synch.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.185
diff -u -p -r1.185 kern_synch.c
--- kern/kern_synch.c   18 Mar 2022 15:32:06 -  1.185
+++ kern/kern_synch.c   21 Apr 2022 21:06:03 -
@@ -804,7 +804,15 @@ sys___thrwakeup(struct proc *p, void *v,
 void
 refcnt_init(struct refcnt *r)
 {
+   refcnt_init_trace(r, 0);
+}
+
+void
+refcnt_init_trace(struct refcnt *r, int idx)
+{
+   r->r_traceidx = idx;
	atomic_store_int(&r->r_refs, 1);
+   TRACEINDEX(refcnt, r->r_traceidx, r, 0, +1);
 }
 
 void
@@ -814,6 +822,7 @@ refcnt_take(struct refcnt *r)
 
	refs = atomic_inc_int_nv(&r->r_refs);
KASSERT(refs != 0);
+   TRACEINDEX(refcnt, r->r_traceidx, r, refs - 1, +1);
(void)refs;
 }
 
@@ -824,6 +833,7 @@ refcnt_rele(struct refcnt *r)
 
	refs = atomic_dec_int_nv(&r->r_refs);
KASSERT(refs != ~0);
+   TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1);

Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-04-21 Thread Alexander Bluhm
On Mon, Apr 18, 2022 at 08:33:06AM +, Visa Hankala wrote:
> I think the sanest solution is to add the release and acquire barriers
> in refcnt_rele().

Getting memory barriers right is too complicated for developers
doing MP stuff.  The existing locking and refcount primitives have
to implement that functionality.  I am on visa@'s side and would
prefer a memory barrier in refcount API instead of searching for
races in MP code.

Better waste some CPU cycles in some cases than having strange
behavior due to missing barriers in other cases.

bluhm



Re: pf igmp icmp6 multicast router alert

2022-04-21 Thread Alexander Bluhm
On Thu, Apr 21, 2022 at 08:56:07PM +0200, Otto Moerbeek wrote:
> > Currently it allows all options.  Should I make it specific to
> > router alert with IGMP or ICMP6?
> 
> To me it looks like the icmp6 case already is limited to MLD?

The question is the other way around.  My current diff allows any
option with ICMP6 MLD.  Do we want to restrict the option to router
alert?

In our ip6.h we have:
#define IP6OPT_JUMBO		0xC2	/* 11 0 00010 = 194 */
#define IP6OPT_NSAP_ADDR	0xC3	/* 11 0 00011 */
#define IP6OPT_TUNNEL_LIMIT	0x04	/* 00 0 00100 */
#define IP6OPT_ROUTER_ALERT	0x05	/* 00 0 00101 (RFC3542, recommended) */

And who knows what other options have been designed.

In ip.h I see these:
#define IPOPT_RR	7	/* record packet route */
#define IPOPT_TS	68	/* timestamp */
#define IPOPT_SECURITY	130	/* provide s,c,h,tcc */
#define IPOPT_LSRR	131	/* loose source route */
#define IPOPT_SATID	136	/* satnet id */
#define IPOPT_SSRR	137	/* strict source route */
#define IPOPT_RA	148	/* router alert */

The only option I have ever seen in the wild is router alert.  So it may
be better to allow IGMP and ICMP6 multicast if router alert is the
only option in the packet.

bluhm



pf igmp icmp6 multicast router alert

2022-04-21 Thread Alexander Bluhm
Hi,

IGMP and ICMP6 for multicast packets have router alert options.
By default pf drops all IP packets with options.  Usually people
ask what is wrong until someone points out that they have to use a
pf rule with allow-opts.

As this is normal behavior and our kernel generates such packets,
the pf default is bad.
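
Until then the workaround is a pf.conf(5) rule along these lines:

	pass proto igmp allow-opts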

Diff is untested, but otto@ and florian@ could try it.

Currently it allows all options.  Should I make it specific to
router alert with IGMP or ICMP6?

bluhm

Index: net/pf.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1126
diff -u -p -r1.1126 pf.c
--- net/pf.c17 Mar 2022 18:27:55 -  1.1126
+++ net/pf.c21 Apr 2022 16:30:18 -
@@ -6398,6 +6398,9 @@ pf_walk_header(struct pf_pdesc *pd, stru
end = pd->off + ntohs(h->ip_len);
pd->off += hlen;
pd->proto = h->ip_p;
+   /* IGMP packets have router alert options, allow them */
+   if (pd->proto == IPPROTO_IGMP)
+   pd->badopts = 0;
/* stop walking over non initial fragments */
if ((h->ip_off & htons(IP_OFFMASK)) != 0)
return (PF_PASS);
@@ -6494,6 +6497,7 @@ pf_walk_header6(struct pf_pdesc *pd, str
 {
struct ip6_frag  frag;
struct ip6_ext   ext;
+   struct icmp6_hdr icmp6;
struct ip6_rthdr rthdr;
	u_int32_t		 end;
int  hdr_cnt, fraghdr_cnt = 0, rthdr_cnt = 0;
@@ -6607,9 +6611,23 @@ pf_walk_header6(struct pf_pdesc *pd, str
pd->off += (ext.ip6e_len + 1) * 8;
pd->proto = ext.ip6e_nxt;
break;
+   case IPPROTO_ICMPV6:
+			if (!pf_pull_hdr(pd->m, pd->off, &icmp6, sizeof(icmp6),
+   NULL, reason, AF_INET6)) {
+   DPFPRINTF(LOG_NOTICE, "IPv6 short icmp6hdr");
+   return (PF_DROP);
+   }
+   /* ICMP multicast packets have router alert options */
+   switch (icmp6.icmp6_type) {
+   case MLD_LISTENER_QUERY:
+   case MLD_LISTENER_REPORT:
+   case MLD_LISTENER_DONE:
+   pd->badopts = 0;
+   break;
+   }
+   /* FALLTHROUGH */
case IPPROTO_TCP:
case IPPROTO_UDP:
-   case IPPROTO_ICMPV6:
/* fragments may be short, ignore inner header then */
if (pd->fragoff != 0 && end < pd->off +
(pd->proto == IPPROTO_TCP ? sizeof(struct tcphdr) :



Re: router timer mutex

2022-04-21 Thread Alexander Bluhm
On Wed, Apr 20, 2022 at 08:12:51PM +0200, Alexander Bluhm wrote:
> mvs@ reminded me of a crash I have seen in December.  Route timers
> are not MP safe, but I think this can be fixed with a mutex.  The
> idea is to protect the global lists with a mutex and move the rttimer
> into a temporary list.  Then the callback and pool put can be called
> later without mutex.

I have a tiny update to the diff.

- Global locks are documented with capital letter, so use [T].

- rt_timer_add() grabbed the mutex twice, first remove then add.
  Better exchange in one critical section.  pool_get before and
  pool_put after.

ok?

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.406
diff -u -p -r1.406 route.c
--- net/route.c 20 Apr 2022 17:58:22 -  1.406
+++ net/route.c 21 Apr 2022 13:31:52 -
@@ -1361,7 +1361,8 @@ rt_ifa_purge_walker(struct rtentry *rt, 
  * for multiple queues for efficiency's sake...
  */
 
-LIST_HEAD(, rttimer_queue) rttimer_queue_head;
+struct mutex   rttimer_mtx;
+LIST_HEAD(, rttimer_queue) rttimer_queue_head; /* [T] */
 
 #define RTTIMER_CALLOUT(r) {   \
if (r->rtt_func != NULL) {  \
@@ -1393,6 +1394,7 @@ rt_timer_init(void)
	pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
	    IPL_MPFLOOR, 0, "rttmrq", NULL);

+	mtx_init(&rttimer_mtx, IPL_MPFLOOR);
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
	timeout_add_sec(&rt_timer_timeout, 1);
@@ -1408,7 +1410,10 @@ rt_timer_queue_create(int timeout)
rtq->rtq_timeout = timeout;
rtq->rtq_count = 0;
TAILQ_INIT(>rtq_head);
+
+	mtx_enter(&rttimer_mtx);
	LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
+	mtx_leave(&rttimer_mtx);
 
return (rtq);
 }
@@ -1416,28 +1421,36 @@ rt_timer_queue_create(int timeout)
 void
 rt_timer_queue_change(struct rttimer_queue *rtq, int timeout)
 {
+	mtx_enter(&rttimer_mtx);
	rtq->rtq_timeout = timeout;
+	mtx_leave(&rttimer_mtx);
 }
 
 void
 rt_timer_queue_destroy(struct rttimer_queue *rtq)
 {
-	struct rttimer	*r;
+	struct rttimer		*r;
+	TAILQ_HEAD(, rttimer)	 rttlist;

	NET_ASSERT_LOCKED();

+	TAILQ_INIT(&rttlist);
+	mtx_enter(&rttimer_mtx);
	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
+		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
+		KASSERT(rtq->rtq_count > 0);
+		rtq->rtq_count--;
+	}
+	LIST_REMOVE(rtq, rtq_link);
+	mtx_leave(&rttimer_mtx);
+
+	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
+		TAILQ_REMOVE(&rttlist, r, rtt_next);
		RTTIMER_CALLOUT(r);
		pool_put(&rttimer_pool, r);
-		if (rtq->rtq_count > 0)
-			rtq->rtq_count--;
-		else
-			printf("rt_timer_queue_destroy: rtq_count reached 0\n");
	}
-
-	LIST_REMOVE(rtq, rtq_link);
	pool_put(&rttimer_queue_pool, rtq);
 }
 
@@ -1450,15 +1463,22 @@ rt_timer_queue_count(struct rttimer_queu
 void
 rt_timer_remove_all(struct rtentry *rt)
 {
-	struct rttimer	*r;
+	struct rttimer		*r;
+	TAILQ_HEAD(, rttimer)	 rttlist;

+	TAILQ_INIT(&rttlist);
+	mtx_enter(&rttimer_mtx);
	while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
-		if (r->rtt_queue->rtq_count > 0)
-			r->rtt_queue->rtq_count--;
-		else
-			printf("rt_timer_remove_all: rtq_count reached 0\n");
+		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
+		KASSERT(r->rtt_queue->rtq_count > 0);
+		r->rtt_queue->rtq_count--;
+	}
+	mtx_leave(&rttimer_mtx);
+
+	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
+		TAILQ_REMOVE(&rttlist, r, rtt_next);
		pool_put(&rttimer_pool, r);
}
 }
@@ -1467,12 +1487,23 @@ int
 rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *,
 struct rttimer *), struct rttimer_queue *queue, u_int rtableid)
 {
-   struct rttimer  *r;
+   struct rttimer  *r, *rnew;
time_t   current_time;
 
+	rnew = pool_get(&rttimer_pool, PR_NOWAIT | PR_ZERO);
+   if (rnew == NULL)
+   return (ENOBUFS);
+
current_time = getuptime();
-   rt->rt_expire = current_time + queue->rtq_timeout;
 
+   rnew->rtt_rt = rt;
+   rnew->rtt_time = current_time;
+   rnew->rtt_func = func;
+   rnew->rtt_queue = queue;
+   rnew->rtt_tableid = rtableid;
+
+   mtx_en

router timer kernel lock

2022-04-21 Thread Alexander Bluhm
Hi,

As claudio@ wants to refactor the router timers before making them MP
safe, I would like to protect them with the kernel lock.  It should fix
this panic.

https://marc.info/?l=openbsd-tech=164038527425440=2

I hope this is the final step before running IP forwarding in
parallel.

ok?

bluhm

Index: netinet/ip_icmp.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.188
diff -u -p -r1.188 ip_icmp.c
--- netinet/ip_icmp.c   20 Apr 2022 09:38:26 -  1.188
+++ netinet/ip_icmp.c   21 Apr 2022 12:45:40 -
@@ -634,8 +634,10 @@ reflect:
rtredirect(sintosa(), sintosa(),
sintosa(), , m->m_pkthdr.ph_rtableid);
if (newrt != NULL && icmp_redirtimeout > 0) {
+   KERNEL_LOCK();
rt_timer_add(newrt, icmp_redirect_timeout,
icmp_redirect_timeout_q, m->m_pkthdr.ph_rtableid);
+   KERNEL_UNLOCK();
}
rtfree(newrt);
pfctlinput(PRC_REDIRECT_HOST, sintosa());
@@ -884,8 +886,10 @@ icmp_sysctl(int *name, u_int namelen, vo
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &icmp_redirtimeout, 0, INT_MAX);
+   KERNEL_LOCK();
rt_timer_queue_change(icmp_redirect_timeout_q,
icmp_redirtimeout);
+   KERNEL_UNLOCK();
NET_UNLOCK();
break;
 
@@ -975,8 +979,10 @@ icmp_mtudisc_clone(struct in_addr dst, u
rt = nrt;
rtm_send(rt, RTM_ADD, 0, rtableid);
}
+   KERNEL_LOCK();
error = rt_timer_add(rt, icmp_mtudisc_timeout, ip_mtudisc_timeout_q,
rtableid);
+   KERNEL_UNLOCK();
if (error)
goto bad;
 
Index: netinet/ip_input.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.367
diff -u -p -r1.367 ip_input.c
--- netinet/ip_input.c  20 Apr 2022 09:38:26 -  1.367
+++ netinet/ip_input.c  21 Apr 2022 13:00:33 -
@@ -1616,9 +1616,11 @@ ip_sysctl(int *name, u_int namelen, void
NET_LOCK();
	error = sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtudisc);
if (ip_mtudisc == 0) {
+   KERNEL_LOCK();
rt_timer_queue_destroy(ip_mtudisc_timeout_q);
ip_mtudisc_timeout_q =
rt_timer_queue_create(ip_mtudisc_timeout);
+   KERNEL_UNLOCK();
}
NET_UNLOCK();
return error;
@@ -1626,8 +1628,10 @@ ip_sysctl(int *name, u_int namelen, void
NET_LOCK();
error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &ip_mtudisc_timeout, 0, INT_MAX);
+   KERNEL_LOCK();
rt_timer_queue_change(ip_mtudisc_timeout_q,
ip_mtudisc_timeout);
+   KERNEL_UNLOCK();
NET_UNLOCK();
return (error);
 #ifdef IPSEC
Index: netinet/ip_mroute.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_mroute.c,v
retrieving revision 1.131
diff -u -p -r1.131 ip_mroute.c
--- netinet/ip_mroute.c 15 Dec 2021 17:21:08 -  1.131
+++ netinet/ip_mroute.c 21 Apr 2022 13:02:43 -
@@ -520,7 +520,9 @@ ip_mrouter_init(struct socket *so, struc
return (EADDRINUSE);
 
ip_mrouter[rtableid] = so;
+   KERNEL_LOCK();
mrouterq[rtableid] = rt_timer_queue_create(MCAST_EXPIRE_FREQUENCY);
+   KERNEL_UNLOCK();
 
return (0);
 }
@@ -572,7 +574,9 @@ ip_mrouter_done(struct socket *so)
 
mrt_api_config = 0;
 
+   KERNEL_LOCK();
rt_timer_queue_destroy(mrouterq[rtableid]);
+   KERNEL_UNLOCK();
mrouterq[rtableid] = NULL;
ip_mrouter[rtableid] = NULL;
mrt_count[rtableid] = 0;
@@ -799,8 +803,10 @@ mfc_expire_route(struct rtentry *rt, str
/* Not expired, add it back to the queue. */
if (mfc->mfc_expire == 0) {
mfc->mfc_expire = 1;
+   KERNEL_LOCK();
rt_timer_add(rt, mfc_expire_route, mrouterq[rtableid],
rtableid);
+   KERNEL_UNLOCK();
return;
}
 
@@ -834,8 +840,10 @@ mfc_add_route(struct ifnet *ifp, struct 
 
rt->rt_llinfo = (caddr_t)mfc;
 
+   KERNEL_LOCK();
rt_timer_add(rt, mfc_expire_route, mrouterq[rtableid],
rtableid);
+   KERNEL_UNLOCK();
 
mfc->mfc_parent = mfccp->mfcc_parent;
mfc->mfc_pkt_cnt = 0;
@@ -1342,7 +1350,9 @@ mrt_mcast_del(struct rtentry *rt, unsign
int  error;
 
/* Remove all 

Re: [External] : Re: pfsync(4) snapshot lists must have dedicated link element

2022-04-21 Thread Alexander Bluhm
On Wed, Apr 20, 2022 at 11:22:27PM +0200, Alexandr Nedvedicky wrote:
> updated diff is below

OK bluhm@

You have to merge again, as I removed #ifdef PFSYNC_DEBUG and added
a #ifdef DIAGNOSTIC.  Sorry.

> 8<---8<---8<--8<
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index fc6843b541f..3061318cec9 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -181,6 +181,7 @@ void  pfsync_q_del(struct pf_state *);
>  
>  struct pfsync_upd_req_item {
>   TAILQ_ENTRY(pfsync_upd_req_item)ur_entry;
> + TAILQ_ENTRY(pfsync_upd_req_item)ur_snap;
>   struct pfsync_upd_req   ur_msg;
>  };
>  TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
> @@ -295,7 +296,7 @@ void  pfsync_bulk_update(void *);
>  void pfsync_bulk_fail(void *);
>  
>  void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> +void pfsync_drop_snapshot(struct pfsync_snapshot *);
>  
>  void pfsync_send_dispatch(void *);
>  void pfsync_send_pkt(struct mbuf *);
> @@ -422,8 +423,7 @@ pfsync_clone_destroy(struct ifnet *ifp)
>   sc->sc_deferred = 0;
>  	mtx_leave(&sc->sc_deferrals_mtx);
>  
> -	while (!TAILQ_EMPTY(&deferrals)) {
> -		pd = TAILQ_FIRST(&deferrals);
> +	while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
>  		TAILQ_REMOVE(&deferrals, pd, pd_entry);
>   pfsync_undefer(pd, 0);
>   }
> @@ -1574,6 +1574,9 @@ void
>  pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
>  {
>   int q;
> + struct pf_state *st;
> + struct pfsync_upd_req_item *ur;
> + struct tdb *tdb;
>  
>   sn->sn_sc = sc;
>  
> @@ -1583,14 +1586,31 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>  		TAILQ_INIT(&sn->sn_qs[q]);
> -		TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> +
> +		while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
> +			KASSERT(st->snapped == 0);
> +			TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> +			TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
> +			st->snapped = 1;
> +		}
>  	}
>  
>  	TAILQ_INIT(&sn->sn_upd_req_list);
> -	TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
> +	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
> +		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
> +		TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
> +	}
>  
>  	TAILQ_INIT(&sn->sn_tdb_q);
> -	TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> +	while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
> +		TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
> +		TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
> +
> +		mtx_enter(&tdb->tdb_mtx);
> +		KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +		SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
> +		mtx_leave(&tdb->tdb_mtx);
> +	}
>  
>   sn->sn_len = sc->sc_len;
>   sc->sc_len = PFSYNC_MINPKT;
> @@ -1606,41 +1626,40 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  }
>  
>  void
> -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc)
> +pfsync_drop_snapshot(struct pfsync_snapshot *sn)
>  {
>   struct pf_state *st;
>   struct pfsync_upd_req_item *ur;
>   struct tdb *t;
>   int q;
>  
> -
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>  		if (TAILQ_EMPTY(&sn->sn_qs[q]))
>  			continue;
>  
>  		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
> -			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
> -#ifdef PFSYNC_DEBUG
>  			KASSERT(st->sync_state == q);
> -#endif
> +			KASSERT(st->snapped == 1);
> +			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
>  			st->sync_state = PFSYNC_S_NONE;
> +			st->snapped = 0;
>  			pf_state_unref(st);
>  		}
>  	}
>  
>  	while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
> -		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
> +		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
>  		pool_put(&sn->sn_sc->sc_pool, ur);
>  	}
>  
> -	mtx_enter(&sc->sc_tdb_mtx);
>  	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
> -		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
> +		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
>  		mtx_enter(&t->tdb_mtx);
> +		KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +		CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
>  		CLR(t->tdb_flags, TDBF_PFSYNC);
>  		mtx_leave(&t->tdb_mtx);
>  	}
> - 

router timer mutex

2022-04-20 Thread Alexander Bluhm
Hi,

mvs@ reminded me of a crash I have seen in December.  Route timers
are not MP safe, but I think this can be fixed with a mutex.  The
idea is to protect the global lists with a mutex and move the rttimer
into a temporary list.  Then the callback and pool put can be called
later without mutex.
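
In miniature the pattern is (a condensed sketch of the diff below):

	TAILQ_HEAD(, rttimer) rttlist;

	TAILQ_INIT(&rttlist);
	mtx_enter(&rttimer_mtx);
	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		/* move to a list on the stack while holding the mutex */
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
	}
	mtx_leave(&rttimer_mtx);

	/* callback and pool_put run without the mutex held */
	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
		TAILQ_REMOVE(&rttlist, r, rtt_next);
		RTTIMER_CALLOUT(r);
		pool_put(&rttimer_pool, r);
	}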

It survived a full regress with witness.

Hrvoje: Can you put this on your test machine together with parallel
IP forwarding?

ok?

bluhm

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.406
diff -u -p -r1.406 route.c
--- net/route.c 20 Apr 2022 17:58:22 -  1.406
+++ net/route.c 20 Apr 2022 18:00:39 -
@@ -1361,7 +1361,8 @@ rt_ifa_purge_walker(struct rtentry *rt, 
  * for multiple queues for efficiency's sake...
  */
 
-LIST_HEAD(, rttimer_queue) rttimer_queue_head;
+struct mutex   rttimer_mtx;
+LIST_HEAD(, rttimer_queue) rttimer_queue_head; /* [t] */
 
 #define RTTIMER_CALLOUT(r) {   \
if (r->rtt_func != NULL) {  \
@@ -1393,6 +1394,7 @@ rt_timer_init(void)
	pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
	    IPL_MPFLOOR, 0, "rttmrq", NULL);

+	mtx_init(&rttimer_mtx, IPL_MPFLOOR);
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
	timeout_add_sec(&rt_timer_timeout, 1);
@@ -1408,7 +1410,10 @@ rt_timer_queue_create(int timeout)
rtq->rtq_timeout = timeout;
rtq->rtq_count = 0;
TAILQ_INIT(>rtq_head);
+
+	mtx_enter(&rttimer_mtx);
	LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
+	mtx_leave(&rttimer_mtx);
 
return (rtq);
 }
@@ -1416,28 +1421,36 @@ rt_timer_queue_create(int timeout)
 void
 rt_timer_queue_change(struct rttimer_queue *rtq, int timeout)
 {
+	mtx_enter(&rttimer_mtx);
	rtq->rtq_timeout = timeout;
+	mtx_leave(&rttimer_mtx);
 }
 
 void
 rt_timer_queue_destroy(struct rttimer_queue *rtq)
 {
-	struct rttimer	*r;
+	struct rttimer		*r;
+	TAILQ_HEAD(, rttimer)	 rttlist;

	NET_ASSERT_LOCKED();

+	TAILQ_INIT(&rttlist);
+	mtx_enter(&rttimer_mtx);
	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
+		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
+		KASSERT(rtq->rtq_count > 0);
+		rtq->rtq_count--;
+	}
+	LIST_REMOVE(rtq, rtq_link);
+	mtx_leave(&rttimer_mtx);
+
+	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
+		TAILQ_REMOVE(&rttlist, r, rtt_next);
		RTTIMER_CALLOUT(r);
		pool_put(&rttimer_pool, r);
-		if (rtq->rtq_count > 0)
-			rtq->rtq_count--;
-		else
-			printf("rt_timer_queue_destroy: rtq_count reached 0\n");
	}
-
-	LIST_REMOVE(rtq, rtq_link);
	pool_put(&rttimer_queue_pool, rtq);
 }
 
@@ -1450,15 +1463,22 @@ rt_timer_queue_count(struct rttimer_queu
 void
 rt_timer_remove_all(struct rtentry *rt)
 {
-	struct rttimer	*r;
+	struct rttimer		*r;
+	TAILQ_HEAD(, rttimer)	 rttlist;

+	TAILQ_INIT(&rttlist);
+	mtx_enter(&rttimer_mtx);
	while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
-		if (r->rtt_queue->rtq_count > 0)
-			r->rtt_queue->rtq_count--;
-		else
-			printf("rt_timer_remove_all: rtq_count reached 0\n");
+		TAILQ_INSERT_TAIL(&rttlist, r, rtt_next);
+		KASSERT(r->rtt_queue->rtq_count > 0);
+		r->rtt_queue->rtq_count--;
+	}
+	mtx_leave(&rttimer_mtx);
+
+	while ((r = TAILQ_FIRST(&rttlist)) != NULL) {
+		TAILQ_REMOVE(&rttlist, r, rtt_next);
		pool_put(&rttimer_pool, r);
}
 }
@@ -1471,8 +1491,9 @@ rt_timer_add(struct rtentry *rt, void (*
time_t   current_time;
 
current_time = getuptime();
-   rt->rt_expire = current_time + queue->rtq_timeout;
 
+	mtx_enter(&rttimer_mtx);
+   rt->rt_expire = current_time + queue->rtq_timeout;
/*
 * If there's already a timer with this action, destroy it before
 * we add a new one.
@@ -1481,27 +1502,31 @@ rt_timer_add(struct rtentry *rt, void (*
if (r->rtt_func == func) {
LIST_REMOVE(r, rtt_link);
			TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
-			if (r->rtt_queue->rtq_count > 0)
-				r->rtt_queue->rtq_count--;
-			else
-				printf("rt_timer_add: rtq_count reached 0\n");
-			pool_put(&rttimer_pool, r);
+   KASSERT(r->rtt_queue->rtq_count > 0);
+   r->rtt_queue->rtq_count--;
   

pfsync debug bye-bye

2022-04-20 Thread Alexander Bluhm
Hi,

In pfsync there are some KASSERT hidden behind #ifdef PFSYNC_DEBUG.
That does not make sense to me.  Either they are correct, then they
should actively check in production.  Or they got wrong over time,
then they should not make debugging harder.

Some basic testing did not show problems.

ok?

bluhm

Index: net/if_pfsync.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.303
diff -u -p -r1.303 if_pfsync.c
--- net/if_pfsync.c 14 Apr 2022 11:39:44 -  1.303
+++ net/if_pfsync.c 20 Apr 2022 14:00:58 -
@@ -1620,9 +1620,7 @@ pfsync_drop_snapshot(struct pfsync_snaps
 
while ((st = TAILQ_FIRST(>sn_qs[q])) != NULL) {
TAILQ_REMOVE(>sn_qs[q], st, sync_list);
-#ifdef PFSYNC_DEBUG
KASSERT(st->sync_state == q);
-#endif
st->sync_state = PFSYNC_S_NONE;
pf_state_unref(st);
}
@@ -1857,9 +1855,7 @@ pfsync_sendout(void)
count = 0;
while ((st = TAILQ_FIRST(_qs[q])) != NULL) {
TAILQ_REMOVE(_qs[q], st, sync_list);
-#ifdef PFSYNC_DEBUG
KASSERT(st->sync_state == q);
-#endif
st->sync_state = PFSYNC_S_NONE;
pfsync_qs[q].write(st, m->m_data + offset);
offset += pfsync_qs[q].len;
@@ -1916,9 +1912,7 @@ pfsync_insert_state(struct pf_state *st)
ISSET(st->state_flags, PFSTATE_NOSYNC))
return;
 
-#ifdef PFSYNC_DEBUG
KASSERT(st->sync_state == PFSYNC_S_NONE);
-#endif
 
if (sc->sc_len == PFSYNC_MINPKT)
timeout_add_sec(>sc_tmo, 1);
@@ -2403,7 +2397,7 @@ pfsync_q_ins(struct pf_state *st, int q)
struct pfsync_softc *sc = pfsyncif;
size_t nlen, sclen;
 
-#if defined(PFSYNC_DEBUG)
+#ifdef DIAGNOSTIC
if (sc->sc_len < PFSYNC_MINPKT)
panic("pfsync pkt len is too low %zd", sc->sc_len);
 #endif



Re: [External] : Re: pfsync(4) snapshot lists must have dedicated link element

2022-04-20 Thread Alexander Bluhm
On Sat, Apr 09, 2022 at 01:51:05AM +0200, Alexandr Nedvedicky wrote:
> updated diff is below.

I am not sure what Hrvoje actually tested and what not.  My
impression was that he got a panic with the previous version of
this diff, but that the machine was stable with the code in current.

But maybe I got it wrong and we need this code to run pfsync with
IPsec in parallel.

In general it looks good, a few comments inline.

> 8<---8<---8<--8<
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index cb0f3fbdf52..536c3f9cb70 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -181,6 +181,7 @@ void  pfsync_q_del(struct pf_state *);
>  
>  struct pfsync_upd_req_item {
>   TAILQ_ENTRY(pfsync_upd_req_item)ur_entry;
> + TAILQ_ENTRY(pfsync_upd_req_item)ur_snap;

Do we really need two list entries?  My understanding is that the
element is either in the sc_upd_req_list or in sn_upd_req_list.  We
could use the same entry for both.  But of course with two entries
it is easier to see what is going on.

>   struct pfsync_upd_req   ur_msg;
>  };
>  TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
> @@ -295,7 +296,7 @@ void  pfsync_bulk_update(void *);
>  void pfsync_bulk_fail(void *);
>  
>  void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> +void pfsync_drop_snapshot(struct pfsync_snapshot *);
>  
>  void pfsync_send_dispatch(void *);
>  void pfsync_send_pkt(struct mbuf *);
> @@ -422,8 +423,7 @@ pfsync_clone_destroy(struct ifnet *ifp)
>   sc->sc_deferred = 0;
>  	mtx_leave(&sc->sc_deferrals_mtx);
>  
> -	while (!TAILQ_EMPTY(&deferrals)) {
> -		pd = TAILQ_FIRST(&deferrals);
> +	while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
>  		TAILQ_REMOVE(&deferrals, pd, pd_entry);
>   pfsync_undefer(pd, 0);
>   }
> @@ -1574,6 +1574,9 @@ void
>  pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
>  {
>   int q;
> + struct pf_state *st;
> + struct pfsync_upd_req_item *ur;
> + struct tdb *tdb;
>  
>   sn->sn_sc = sc;
>  
> @@ -1583,14 +1586,36 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>  		TAILQ_INIT(&sn->sn_qs[q]);
> -		TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> +
> +		while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
> +#ifdef PFSYNC_DEBUG
> + KASSERT(st->snapped == 0);
> +#endif

I see that there are other #ifdef PFSYNC_DEBUG.  But why would you
hide a cheap KASSERT behind another ifdef?  If something is wrong
I want to see the crash and not only while debugging.

I will send a diff that removes existing PFSYNC_DEBUG.

> +			TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> +			TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
> +			st->snapped = 1;
> +		}
>  	}
>  
>  	TAILQ_INIT(&sn->sn_upd_req_list);
> -	TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
> +	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
> +		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
> +		TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
> +	}
>  
>  	TAILQ_INIT(&sn->sn_tdb_q);
> -	TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> +	while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
> +		TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
> +		TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
> +
> +		mtx_enter(&tdb->tdb_mtx);
> +#ifdef PFSYNC_DEBUG
> + KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
> +#endif
> + /* SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); */

This comment looks like a debugging leftover.

> +		SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
> +		mtx_leave(&tdb->tdb_mtx);
> + }
>  
>   sn->sn_len = sc->sc_len;
>   sc->sc_len = PFSYNC_MINPKT;
> @@ -1606,41 +1631,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  }
>  
>  void
> -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc)
> +pfsync_drop_snapshot(struct pfsync_snapshot *sn)
>  {
>   struct pf_state *st;
>   struct pfsync_upd_req_item *ur;
>   struct tdb *t;
>   int q;
>  
> -
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>  		if (TAILQ_EMPTY(&sn->sn_qs[q]))
>  			continue;
>  
>  		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
> -			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
>  #ifdef PFSYNC_DEBUG
>  			KASSERT(st->sync_state == q);
> +			KASSERT(st->snapped == 1);
>  #endif
> +			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
>

route timer queues

2022-04-19 Thread Alexander Bluhm
Hi,

I had a look in route timer queues in netinet and netinet6 and found
some inconsistencies.

- Timeout was a mixture of int, u_int and long.  Make timeout
  int with sysctl bounds checking and make absolute time time_t.

- Some code assumes that ..._timeout_q can be NULL and at some
  places this is checked.  Better make sure that all queues always
  exist.  The pool_get is only called from initialization and from
  syscall, so PR_WAITOK is possible.

- The only special hack I kept is when ip_mtudisc is set to 0.
  Then I destroy the queue and generate an empty one.

- If redirect timeout is 0, it does not time out.  Adopt IPv6 to
  behavior of IPv4.

- sysctl net.inet6.icmp6.redirtimeout had no effect as the queue
  timeout was not modified.  Make icmp6_sysctl() look like
  icmp_sysctl().
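
For the last point, the icmp6_sysctl() case ends up following the
icmp_sysctl() pattern, roughly like this (a sketch; the icmp6 names
are assumed to match the tree):

	case ICMPV6CTL_REDIRTIMEOUT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &icmp6_redirtimeout, 0, INT_MAX);
		rt_timer_queue_change(icmp6_redirect_timeout_q,
		    icmp6_redirtimeout);
		NET_UNLOCK();
		break;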

ok?

bluhm

Index: net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.404
diff -u -p -r1.404 route.c
--- net/route.c 19 Apr 2022 19:19:31 -  1.404
+++ net/route.c 19 Apr 2022 20:31:49 -
@@ -1399,13 +1399,11 @@ rt_timer_init(void)
 }
 
 struct rttimer_queue *
-rt_timer_queue_create(u_int timeout)
+rt_timer_queue_create(int timeout)
 {
	struct rttimer_queue	*rtq;

-	rtq = pool_get(&rttimer_queue_pool, PR_NOWAIT | PR_ZERO);
-	if (rtq == NULL)
-		return (NULL);
+	rtq = pool_get(&rttimer_queue_pool, PR_WAITOK | PR_ZERO);
 
rtq->rtq_timeout = timeout;
rtq->rtq_count = 0;
@@ -1416,7 +1414,7 @@ rt_timer_queue_create(u_int timeout)
 }
 
 void
-rt_timer_queue_change(struct rttimer_queue *rtq, long timeout)
+rt_timer_queue_change(struct rttimer_queue *rtq, int timeout)
 {
rtq->rtq_timeout = timeout;
 }
@@ -1470,10 +1468,10 @@ rt_timer_add(struct rtentry *rt, void (*
 struct rttimer *), struct rttimer_queue *queue, u_int rtableid)
 {
struct rttimer  *r;
-   long current_time;
+   time_t   current_time;
 
current_time = getuptime();
-   rt->rt_expire = getuptime() + queue->rtq_timeout;
+   rt->rt_expire = current_time + queue->rtq_timeout;
 
/*
 * If there's already a timer with this action, destroy it before
@@ -1514,7 +1512,7 @@ rt_timer_timer(void *arg)
struct timeout  *to = (struct timeout *)arg;
struct rttimer_queue*rtq;
struct rttimer  *r;
-   long current_time;
+   time_t   current_time;
 
current_time = getuptime();
 
Index: net/route.h
===
RCS file: /cvs/src/sys/net/route.h,v
retrieving revision 1.188
diff -u -p -r1.188 route.h
--- net/route.h 19 Apr 2022 15:44:56 -  1.188
+++ net/route.h 19 Apr 2022 20:31:49 -
@@ -411,10 +411,10 @@ struct rttimer {
 };
 
 struct rttimer_queue {
-	long			rtq_timeout;
-   unsigned long   rtq_count;
TAILQ_HEAD(, rttimer)   rtq_head;
LIST_ENTRY(rttimer_queue)   rtq_link;
+   unsigned long   rtq_count;
+   int rtq_timeout;
 };
 
 const char *rtlabel_id2name(u_int16_t);
@@ -456,8 +456,8 @@ int  rt_timer_add(struct rtentry *,
 void(*)(struct rtentry *, struct rttimer *),
 struct rttimer_queue *, u_int);
 voidrt_timer_remove_all(struct rtentry *);
-struct rttimer_queue   *rt_timer_queue_create(u_int);
-voidrt_timer_queue_change(struct rttimer_queue *, long);
+struct rttimer_queue   *rt_timer_queue_create(int);
+voidrt_timer_queue_change(struct rttimer_queue *, int);
 voidrt_timer_queue_destroy(struct rttimer_queue *);
 unsigned long   rt_timer_queue_count(struct rttimer_queue *);
 voidrt_timer_timer(void *);
Index: netinet/ip_icmp.c
===
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.187
diff -u -p -r1.187 ip_icmp.c
--- netinet/ip_icmp.c   26 Jul 2021 20:44:44 -  1.187
+++ netinet/ip_icmp.c   19 Apr 2022 20:31:50 -
@@ -120,7 +120,7 @@ int icmp_redirtimeout = 10 * 60;
 static int icmperrpps_count = 0;
 static struct timeval icmperrppslim_last;
 
-static struct rttimer_queue *icmp_redirect_timeout_q = NULL;
+struct rttimer_queue *icmp_redirect_timeout_q;
 struct cpumem *icmpcounters;
 
 const struct sysctl_bounded_args icmpctl_vars[] =  {
@@ -141,15 +141,8 @@ int icmp_sysctl_icmpstat(void *, size_t 
 void
 icmp_init(void)
 {
+   icmp_redirect_timeout_q = rt_timer_queue_create(icmp_redirtimeout);
icmpcounters = counters_alloc(icps_ncounters);
-   /*
-* This is only useful if the user initializes redirtimeout to
-* something other than zero.
-*/
-   if 

Re: route timer pool

2022-04-19 Thread Alexander Bluhm
On Tue, Apr 19, 2022 at 08:59:25AM +0200, Claudio Jeker wrote:
> On Tue, Apr 19, 2022 at 01:44:40AM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > Can we use a pool for rttimer_queue_pool?
> 
> Another option would be to use static rttimer_queues instead of allocating
> them. Not that many timers are used.

Multicast allocates queues on demand per routing table.  So static
allocation is not an option.

sys_setsockopt -> sosetopt -> rip_ctloutput -> ip_mrouter_set ->
ip_mrouter_init -> rt_timer_queue_create

Diff merged to current.

ok?

> Requires additional changes in the
> sysctl handlers (but that code is strange anyway).

I am trying to clean up that mess.

bluhm

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.403
diff -u -p -r1.403 route.c
--- net/route.c 19 Apr 2022 15:44:56 -  1.403
+++ net/route.c 19 Apr 2022 15:53:59 -
@@ -148,8 +148,9 @@ struct cpumem * rtcounters;
 int	rttrash;	/* routes not in table but not freed */
 int	ifatrash;	/* ifas not in ifp list but not free */
 
-struct pool	rtentry_pool;	/* pool for rtentry structures */
-struct pool	rttimer_pool;	/* pool for rttimer structures */
+struct pool	rtentry_pool;		/* pool for rtentry structures */
+struct pool	rttimer_pool;		/* pool for rttimer structures */
+struct pool	rttimer_queue_pool;	/* pool for rttimer_queue structures */
 
 int	rt_setgwroute(struct rtentry *, u_int);
 void	rt_putgwroute(struct rtentry *);
@@ -183,7 +184,7 @@ route_init(void)
 {
rtcounters = counters_alloc(rts_ncounters);
 
-	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, IPL_SOFTNET, 0,
+	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, IPL_MPFLOOR, 0,
"rtentry", NULL);
 
while (rt_hashjitter == 0)
@@ -1387,8 +1388,10 @@ rt_timer_init(void)
 {
static struct timeout   rt_timer_timeout;
 
-	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, IPL_SOFTNET, 0,
-	    "rttmr", NULL);
+	pool_init(&rttimer_pool, sizeof(struct rttimer), 0,
+	    IPL_MPFLOOR, 0, "rttmr", NULL);
+	pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
+	    IPL_MPFLOOR, 0, "rttmrq", NULL);
 
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
@@ -1400,7 +1403,8 @@ rt_timer_queue_create(u_int timeout)
 {
	struct rttimer_queue	*rtq;

-	if ((rtq = malloc(sizeof(*rtq), M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
+	rtq = pool_get(&rttimer_queue_pool, PR_NOWAIT | PR_ZERO);
+	if (rtq == NULL)
return (NULL);
 
rtq->rtq_timeout = timeout;
@@ -1436,7 +1440,7 @@ rt_timer_queue_destroy(struct rttimer_qu
}
 
LIST_REMOVE(rtq, rtq_link);
-   free(rtq, M_RTABLE, sizeof(*rtq));
+	pool_put(&rttimer_queue_pool, rtq);
 }
 
 unsigned long



Re: route timer init

2022-04-19 Thread Alexander Bluhm
On Tue, Apr 19, 2022 at 08:46:06AM +0200, Claudio Jeker wrote:
> On Tue, Apr 19, 2022 at 12:07:49AM +0200, Alexander Bluhm wrote:
> > Hi,
> > 
> > Instead of using a MP unsafe global variable, just call rt_timer_init()
> > from route_init().
> > 
> > ok?
> 
> Wouldn't it be better to move this into rtable_init?
> route_init() is called by domaininit() as the last init function
> (routedomain is the last domain in domains[].

I see the problem.  ip_init() and icmp6_init() create the queue
before route_init() has run.

Calling rt_timer_init() from rtable_init() fixes it.

ok?

bluhm

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.402
diff -u -p -r1.402 route.c
--- net/route.c 22 Feb 2022 01:15:02 -  1.402
+++ net/route.c 19 Apr 2022 14:48:26 -
@@ -151,7 +151,6 @@ int ifatrash;   /* ifas not in ifp list 
 struct poolrtentry_pool;   /* pool for rtentry structures */
 struct poolrttimer_pool;   /* pool for rttimer structures */
 
-void   rt_timer_init(void);
 int	rt_setgwroute(struct rtentry *, u_int);
 void	rt_putgwroute(struct rtentry *);
 int	rtflushclone1(struct rtentry *, void *, u_int);
@@ -1362,7 +1361,6 @@ rt_ifa_purge_walker(struct rtentry *rt, 
  */
 
 LIST_HEAD(, rttimer_queue) rttimer_queue_head;
-static int rt_init_done = 0;
 
 #define RTTIMER_CALLOUT(r) {   \
if (r->rtt_func != NULL) {  \
@@ -1389,25 +1387,18 @@ rt_timer_init(void)
 {
static struct timeout   rt_timer_timeout;
 
-   if (rt_init_done)
-   panic("rt_timer_init: already initialized");
-
	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, IPL_SOFTNET, 0,
"rttmr", NULL);
 
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
	timeout_add_sec(&rt_timer_timeout, 1);
-   rt_init_done = 1;
 }
 
 struct rttimer_queue *
 rt_timer_queue_create(u_int timeout)
 {
	struct rttimer_queue	*rtq;
-
-   if (rt_init_done == 0)
-   rt_timer_init();
 
if ((rtq = malloc(sizeof(*rtq), M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
return (NULL);
Index: net/route.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.h,v
retrieving revision 1.187
diff -u -p -r1.187 route.h
--- net/route.h 12 Nov 2021 15:49:41 -  1.187
+++ net/route.h 19 Apr 2022 14:47:26 -
@@ -451,6 +451,7 @@ void rtm_proposal(struct ifnet *, struc
 int rt_setgate(struct rtentry *, struct sockaddr *, u_int);
 struct rtentry *rt_getll(struct rtentry *);
 
+voidrt_timer_init(void);
 int rt_timer_add(struct rtentry *,
 void(*)(struct rtentry *, struct rttimer *),
 struct rttimer_queue *, u_int);
Index: net/rtable.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtable.c,v
retrieving revision 1.76
diff -u -p -r1.76 rtable.c
--- net/rtable.c2 Jan 2022 22:36:04 -   1.76
+++ net/rtable.c19 Apr 2022 14:48:46 -
@@ -184,6 +184,8 @@ rtable_init(void)
 
if (rtable_add(0) != 0)
panic("unable to create default routing table");
+
+   rt_timer_init();
 }
 
 int



Re: rate limit uvn_flush warning

2022-04-18 Thread Alexander Bluhm
On Thu, Apr 14, 2022 at 11:44:42AM -0600, Theo de Raadt wrote:
> > If I understand correctly, the problem is that writing to memory
> > of an mmap(2)ed file has no error handling.  If the file system is
> > full, userland cannot be informed.  So someone invented this message
> > in the kernel.
> 
> If you cannot return an error to the program, deciding to print a message
> to the console doesn't fix anything. The program doesn't see the problem.
> The program cannot cope.  The user sees an irrelevant message which doesn't
> give them any action they can take.  Not even root will know what to do.

ok to delete the message?

bluhm

Index: uvm/uvm_vnode.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.121
diff -u -p -r1.121 uvm_vnode.c
--- uvm/uvm_vnode.c 15 Dec 2021 12:53:53 -  1.121
+++ uvm/uvm_vnode.c 14 Apr 2022 17:34:01 -
@@ -744,7 +744,7 @@ ReTry:
 */
 #ifdef DIAGNOSTIC
if (flags & PGO_SYNCIO)
-   panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)");
+   panic("%s: PGO_SYNCIO return 'try again' error (impossible)", __func__);
 #endif
flags |= PGO_SYNCIO;
if (flags & PGO_FREE)
@@ -807,17 +807,8 @@ ReTry:
}
} else if (flags & PGO_FREE &&
result != VM_PAGER_PEND) {
-   if (result != VM_PAGER_OK) {
-   printf("uvn_flush: obj=%p, "
-  "offset=0x%llx.  error "
-  "during pageout.\n",
-   pp->uobject,
-   (long long)pp->offset);
-   printf("uvn_flush: WARNING: "
-   "changes to page may be "
-   "lost!\n");
+   if (result != VM_PAGER_OK)
retval = FALSE;
-   }
pmap_page_protect(ptmp, PROT_NONE);
uvm_pageclean(ptmp);
			TAILQ_INSERT_TAIL(&dead, ptmp, pageq);



route timer pool

2022-04-18 Thread Alexander Bluhm
Hi,

Can we use a pool for rttimer_queue_pool?

As we run without kernel lock, these pools should have IPL_MPFLOOR
protection.

ok?

bluhm

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.402
diff -u -p -r1.402 route.c
--- net/route.c 22 Feb 2022 01:15:02 -  1.402
+++ net/route.c 18 Apr 2022 23:42:09 -
@@ -148,8 +148,9 @@ struct cpumem * rtcounters;
 int	rttrash;	/* routes not in table but not freed */
 int	ifatrash;	/* ifas not in ifp list but not free */
 
-struct pool	rtentry_pool;	/* pool for rtentry structures */
-struct pool	rttimer_pool;	/* pool for rttimer structures */
+struct pool	rtentry_pool;		/* pool for rtentry structures */
+struct pool	rttimer_pool;		/* pool for rttimer structures */
+struct pool	rttimer_queue_pool;	/* pool for rttimer_queue structures */
 
 void	rt_timer_init(void);
 int	rt_setgwroute(struct rtentry *, u_int);
@@ -184,7 +185,7 @@ route_init(void)
 {
rtcounters = counters_alloc(rts_ncounters);
 
-	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, IPL_SOFTNET, 0,
+	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, IPL_MPFLOOR, 0,
"rtentry", NULL);
 
while (rt_hashjitter == 0)
@@ -1392,8 +1393,10 @@ rt_timer_init(void)
if (rt_init_done)
panic("rt_timer_init: already initialized");
 
-	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, IPL_SOFTNET, 0,
-	    "rttmr", NULL);
+	pool_init(&rttimer_pool, sizeof(struct rttimer), 0,
+	    IPL_MPFLOOR, 0, "rttmr", NULL);
+	pool_init(&rttimer_queue_pool, sizeof(struct rttimer_queue), 0,
+	    IPL_MPFLOOR, 0, "rttmrq", NULL);
 
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
@@ -1409,7 +1412,8 @@ rt_timer_queue_create(u_int timeout)
if (rt_init_done == 0)
rt_timer_init();
 
-   if ((rtq = malloc(sizeof(*rtq), M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
+	rtq = pool_get(&rttimer_queue_pool, PR_NOWAIT | PR_ZERO);
+   if (rtq == NULL)
return (NULL);
 
rtq->rtq_timeout = timeout;
@@ -1445,7 +1449,7 @@ rt_timer_queue_destroy(struct rttimer_qu
}
 
LIST_REMOVE(rtq, rtq_link);
-   free(rtq, M_RTABLE, sizeof(*rtq));
+   pool_put(_queue_pool, rtq);
 }
 
 unsigned long



route timer init

2022-04-18 Thread Alexander Bluhm
Hi,

Instead of using a MP unsafe global variable, just call rt_timer_init()
from route_init().

ok?

bluhm

Index: net/route.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
retrieving revision 1.402
diff -u -p -r1.402 route.c
--- net/route.c 22 Feb 2022 01:15:02 -  1.402
+++ net/route.c 18 Apr 2022 22:00:37 -
@@ -190,6 +190,8 @@ route_init(void)
while (rt_hashjitter == 0)
rt_hashjitter = arc4random();
 
+   rt_timer_init();
+
 #ifdef BFD
bfdinit();
 #endif
@@ -1362,7 +1364,6 @@ rt_ifa_purge_walker(struct rtentry *rt, 
  */
 
 LIST_HEAD(, rttimer_queue) rttimer_queue_head;
-static int rt_init_done = 0;
 
 #define RTTIMER_CALLOUT(r) {   \
if (r->rtt_func != NULL) {  \
@@ -1389,25 +1390,18 @@ rt_timer_init(void)
 {
static struct timeout   rt_timer_timeout;
 
-   if (rt_init_done)
-   panic("rt_timer_init: already initialized");
-
	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, IPL_SOFTNET, 0,
"rttmr", NULL);
 
	LIST_INIT(&rttimer_queue_head);
	timeout_set_proc(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
	timeout_add_sec(&rt_timer_timeout, 1);
-   rt_init_done = 1;
 }
 
 struct rttimer_queue *
 rt_timer_queue_create(u_int timeout)
 {
	struct rttimer_queue	*rtq;
-
-   if (rt_init_done == 0)
-   rt_timer_init();
 
if ((rtq = malloc(sizeof(*rtq), M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
return (NULL);



Re: parallel IP forwarding

2022-04-18 Thread Alexander Bluhm
On Mon, Apr 18, 2022 at 12:27:23PM +0200, Hrvoje Popovski wrote:
> On 8.4.2022. 12:56, Alexander Bluhm wrote:
> > Is now the right time to commit the parallel forwarding diff?
> > 
> > Known limitations are:
> > - Hrvoje has seen a crash with both pfsync and ipsec on his production
> >   machine.  But he cannot reproduce it in his lab.
> 
> This is resolved. At least this panic doesn't happen any more.

Good to hear.  I guess the crash seen before is related to an
uncommitted diff that was tested in Hrvoje's setup.

So we can run IP forwarding in parallel.  I would like to commit
this diff now.  Then we can learn whether there are other limitations
and fix them in tree.

ok?

bluhm

Index: net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.649
diff -u -p -r1.649 if.c
--- net/if.c25 Feb 2022 23:51:03 -  1.649
+++ net/if.c18 Apr 2022 17:12:59 -
@@ -237,7 +237,7 @@ int ifq_congestion;
 
 int netisr;
 
-#define	NET_TASKQ	1
+#define	NET_TASKQ	4
 struct taskq   *nettqmp[NET_TASKQ];
 
 struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
@@ -834,15 +834,10 @@ if_input_process(struct ifnet *ifp, stru
 * lists and the socket layer.
 */
 
-   /*
-* XXXSMP IPsec data structures are not ready to be accessed
-* by multiple network threads in parallel.  In this case
-* use an exclusive lock.
-*/
-   NET_LOCK();
+   NET_RLOCK_IN_SOFTNET();
while ((m = ml_dequeue(ml)) != NULL)
(*ifp->if_input)(ifp, m);
-   NET_UNLOCK();
+   NET_RUNLOCK_IN_SOFTNET();
 }
 
 void
@@ -899,6 +894,12 @@ if_netisr(void *unused)
arpintr();
KERNEL_UNLOCK();
}
+#endif
+   if (n & (1 << NETISR_IP))
+   ipintr();
+#ifdef INET6
+   if (n & (1 << NETISR_IPV6))
+   ip6intr();
 #endif
 #if NPPP > 0
if (n & (1 << NETISR_PPP)) {
Index: net/if_ethersubr.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.278
diff -u -p -r1.278 if_ethersubr.c
--- net/if_ethersubr.c  22 Feb 2022 01:15:02 -  1.278
+++ net/if_ethersubr.c  18 Apr 2022 17:12:59 -
@@ -221,7 +221,10 @@ ether_resolve(struct ifnet *ifp, struct 
 
switch (af) {
case AF_INET:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in arpresolve() */
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IP);
@@ -244,7 +247,10 @@ ether_resolve(struct ifnet *ifp, struct 
break;
 #ifdef INET6
case AF_INET6:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IPV6);
@@ -270,13 +276,19 @@ ether_resolve(struct ifnet *ifp, struct 
break;
 #ifdef INET6
case AF_INET6:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
break;
 #endif
case AF_INET:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in arpresolve() */
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
break;
@@ -528,12 +540,14 @@ ether_input(struct ifnet *ifp, struct mb
case ETHERTYPE_PPPOE:
if (m->m_flags & (M_MCAST | M_BCAST))
goto dropanyway;
+   KERNEL_LOCK();
 #ifdef PIPEX
if (pipex_enable) {
struct pipex_session *session;
 
if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
pipex_pppoe_input(m, session);
+   KERNEL_UNLOCK();
return;
}
}
@@ -542,6 +556,7 @@ ether_input(struct ifnet *ifp, struct mb
pppoe_disc_input

kbd set error message

2022-04-17 Thread Alexander Bluhm
Hi,

After fixing the kbd -l error handling, kbd set needs the same diff.
While there, shorten long lines and avoid v--; v++; logic.

$ ./kbd de
kbd: /dev/wskbd0: Permission denied

ok?

bluhm

Index: sbin/kbd/kbd_wscons.c
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/kbd/kbd_wscons.c,v
retrieving revision 1.35
diff -u -p -r1.35 kbd_wscons.c
--- sbin/kbd/kbd_wscons.c   17 Apr 2022 17:33:50 -  1.35
+++ sbin/kbd/kbd_wscons.c   17 Apr 2022 17:39:56 -
@@ -232,7 +232,7 @@ void
 kbd_set(char *name, int verbose)
 {
	char	buf[LINE_MAX], *c, *b, device[sizeof "/dev/wskbd00"];
-   int map = 0, v, i, fd;
+   int map = 0, v, i, fd, error = 0;
struct nameint *n;
 
c = name;
@@ -271,19 +271,29 @@ kbd_set(char *name, int verbose)
fd = open(device, O_WRONLY);
if (fd == -1)
fd = open(device, O_RDONLY);
-   if (fd >= 0) {
+   if (fd == -1) {
+   /* remember the first error number */
+   if (error == 0)
+   error = errno;
+   } else {
+   /* at least one success, do not print error */
+   error = -1;
+
			if (ioctl(fd, WSKBDIO_SETENCODING, &map) == -1) {
-   if (errno == EINVAL) {
-   fprintf(stderr,
-   "%s: unsupported encoding %s on 
%s\n",
-   __progname, name, device);
-   } else
-   err(1, "WSKBDIO_SETENCODING: %s", 
device);
-   v--;
-   }
-   v++;
+   if (errno != EINVAL)
+   err(1, "WSKBDIO_SETENCODING %s",
+   device);
+   fprintf(stderr,
+   "%s: unsupported encoding %s on %s\n",
+   __progname, name, device);
+   } else
+   v++;
close(fd);
}
+   }
+   if (error > 0) {
+   errno = error;
+   err(1, "/dev/wskbd0");
}
 
if (verbose && v > 0)



Re: rate limit uvn_flush warning

2022-04-14 Thread Alexander Bluhm
On Wed, Apr 13, 2022 at 02:22:00PM -0600, Theo de Raadt wrote:
> I think we should fix the bug and/or DELETE the message entirely

I don't see the bug.  The message was added in the initial NetBSD
uvm commit.
http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/uvm/uvm_vnode.c?annotate=1.1

With a major refactoring the whole function with the message
disappeared in 2001.
http://cvsweb.netbsd.org/bsdweb.cgi/src/sys/uvm/uvm_vnode.c#rev1.52

If I understand correctly, the problem is that writing to memory
of an mmap(2)ed file has no error handling.  If the file system is
full, userland cannot be informed.  So someone invented this message
in the kernel.

What is the correct behavior?  Should close(2) after munmap(2) fail?
According to ktrace close returns 0 and ld exits with 0.  It does
not see any error although newbsd was not written correctly.
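
For what it's worth, a process that cares can ask for the write-back
error explicitly with msync(2) before unmapping.  A userland sketch
(fd and len are assumed to be set up; whether ENOSPC actually surfaces
here depends on the file system):

	char *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 'x', len);			/* dirty the pages */
	if (msync(p, len, MS_SYNC) == -1)
		warn("msync");			/* write errors may show up here */
	if (munmap(p, len) == -1)
		warn("munmap");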

This is the output when ld fails:

../GENERIC# make newbsd  
LD="ld" sh makegap.sh 0x gapdummy.o
ld -T ld.script -X --warn-common -nopie -o newbsd ${SYSTEM_HEAD} vers.o ${OBJS}

/usr: write failed, file system is full
textdatabss dec hex
0   0   0   0   0
mv newbsd newbsd.gdb
ctfstrip -S -o newbsd newbsd.gdb
strip: there are no sections to be copied!
rm -f bsd.gdb
mv -f newbsd bsd
mv: newbsd: No such file or directory
*** Error 1 in /usr/share/relink/kernel/GENERIC (Makefile:1934 'newbsd')

The "/usr: write failed, file system is full" message is printed
by ffs to the controlling tty with rate limiting.  So error reporting
happens.

Deleting the uvm message is easy.  While there, use __func__ for easier
function grepping.

bluhm

Index: uvm/uvm_vnode.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.121
diff -u -p -r1.121 uvm_vnode.c
--- uvm/uvm_vnode.c 15 Dec 2021 12:53:53 -  1.121
+++ uvm/uvm_vnode.c 14 Apr 2022 17:34:01 -
@@ -744,7 +744,7 @@ ReTry:
 */
 #ifdef DIAGNOSTIC
if (flags & PGO_SYNCIO)
-   panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)");
+   panic("%s: PGO_SYNCIO return 'try again' error (impossible)", __func__);
 #endif
flags |= PGO_SYNCIO;
if (flags & PGO_FREE)
@@ -807,17 +807,8 @@ ReTry:
}
} else if (flags & PGO_FREE &&
result != VM_PAGER_PEND) {
-   if (result != VM_PAGER_OK) {
-   printf("uvn_flush: obj=%p, "
-  "offset=0x%llx.  error "
-  "during pageout.\n",
-   pp->uobject,
-   (long long)pp->offset);
-   printf("uvn_flush: WARNING: "
-   "changes to page may be "
-   "lost!\n");
+   if (result != VM_PAGER_OK)
retval = FALSE;
-   }
pmap_page_protect(ptmp, PROT_NONE);
uvm_pageclean(ptmp);
			TAILQ_INSERT_TAIL(&dead, ptmp, pageq);



Re: kbd -l error message

2022-04-14 Thread Alexander Bluhm
On Thu, Apr 14, 2022 at 04:44:10PM +0200, Marc Espie wrote:
> I'm not quite fond of the error reports, though... they could be more specific
> - we keep track of the first error, so it should probably talk 
> about /dev/wskbd0 directly ?

I wanted to show that more than one device is involved.
But actually the error is always from /dev/wskbd0.

> - by comparison, the message for the WSKBDIO_GTYPE doesn't mention the
> device name. I think err(1, "WKBDIO_GTYPE on %s", device) might be slightly
> more helpful.

# kbd -l
kbd: WSKBDIO_GTYPE /dev/wskbd0: Bad address

> I don't see the need for the word "open "in the message.

$ ./kbd -l
kbd: /dev/wskbd0: Permission denied

ok?

Index: sbin/kbd/kbd_wscons.c
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/kbd/kbd_wscons.c,v
retrieving revision 1.34
diff -u -p -r1.34 kbd_wscons.c
--- sbin/kbd/kbd_wscons.c   22 Jan 2020 06:24:07 -  1.34
+++ sbin/kbd/kbd_wscons.c   14 Apr 2022 15:59:26 -
@@ -150,7 +150,7 @@ kbd_list(void)
 {
int kbds[SA_MAX];
struct wskbd_encoding_data encs[SA_MAX];
-   int fd, i, kbtype, t;
+   int fd, i, kbtype, t, error = 0;
	char	device[PATH_MAX];
 
memset(kbds, 0, sizeof(kbds));
@@ -162,9 +162,16 @@ kbd_list(void)
fd = open(device, O_WRONLY);
if (fd == -1)
fd = open(device, O_RDONLY);
-   if (fd >= 0) {
+   if (fd == -1) {
+   /* remember the first error number */
+   if (error == 0)
+   error = errno;
+   } else {
+   /* at least one success, do not print error */
+   error = -1;
+
			if (ioctl(fd, WSKBDIO_GTYPE, &kbtype) == -1)
-   err(1, "WSKBDIO_GTYPE");
+   err(1, "WSKBDIO_GTYPE %s", device);
switch (kbtype) {
case WSKBD_TYPE_PC_XT:
case WSKBD_TYPE_PC_AT:
@@ -207,6 +214,10 @@ kbd_list(void)
}
close(fd);
}
+   }
+   if (error > 0) {
+   errno = error;
+   err(1, "/dev/wskbd0");
}
 
for (i = 0; i < SA_MAX; i++)



kbd -l error message

2022-04-14 Thread Alexander Bluhm
Hi,

When kbd -l is executed as a regular user, it fails silently.

$ kbd -l
$ echo $?
0

Error handling is a bit tricky.  We want the first error if no
device is available.

$ ./kbd -l 
kbd: open /dev/wskbd[0-9]: Permission denied
$ echo $?
1
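
Condensed, the logic in the diff below is (NUM_KBD stands in for the
real loop bound):

	int error = 0;	/* 0: nothing tried, >0: first errno, -1: a device worked */

	for (i = 0; i < NUM_KBD; i++) {
		fd = open(device, O_RDONLY);
		if (fd == -1) {
			if (error == 0)
				error = errno;	/* remember the first failure */
			continue;
		}
		error = -1;			/* one success: never print an error */
		/* ... query the device ... */
		close(fd);
	}
	if (error > 0) {
		errno = error;
		err(1, "open /dev/wskbd[0-9]");
	}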

ok?

bluhm

Index: sbin/kbd/kbd_wscons.c
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/kbd/kbd_wscons.c,v
retrieving revision 1.34
diff -u -p -r1.34 kbd_wscons.c
--- sbin/kbd/kbd_wscons.c   22 Jan 2020 06:24:07 -  1.34
+++ sbin/kbd/kbd_wscons.c   14 Apr 2022 14:21:17 -
@@ -150,7 +150,7 @@ kbd_list(void)
 {
int kbds[SA_MAX];
struct wskbd_encoding_data encs[SA_MAX];
-   int fd, i, kbtype, t;
+   int fd, i, kbtype, t, error = 0;
	char	device[PATH_MAX];
 
memset(kbds, 0, sizeof(kbds));
@@ -162,7 +162,14 @@ kbd_list(void)
fd = open(device, O_WRONLY);
if (fd == -1)
fd = open(device, O_RDONLY);
-   if (fd >= 0) {
+   if (fd == -1) {
+   /* remember the first error number */
+   if (error == 0)
+   error = errno;
+   } else {
+   /* at least one success, do not print error */
+   error = -1;
+
			if (ioctl(fd, WSKBDIO_GTYPE, &kbtype) == -1)
err(1, "WSKBDIO_GTYPE");
switch (kbtype) {
@@ -207,6 +214,10 @@ kbd_list(void)
}
close(fd);
}
+   }
+   if (error > 0) {
+   errno = error;
+   err(1, "open /dev/wskbd[0-9]");
}
 
for (i = 0; i < SA_MAX; i++)



pfsync mutex mpfloor

2022-04-13 Thread Alexander Bluhm
Hi,

Hrvoje has hit a witness issue in pfsync.

panic: acquiring blockable sleep lock with spinlock or critical
section held (kernel_lock) _lock

panic(81f45bb7) at panic+0xbf
witness_checkorder(8246e970,9,0) at witness_checkorder+0xb61
__mp_lock(8246e768) at __mp_lock+0x5f
kpageflttrap(800020b26dc0,17) at kpageflttrap+0x173
kerntrap(800020b26dc0) at kerntrap+0x91
alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
pfsync_q_del(fd875f6336c0) at pfsync_q_del+0x70
pfsync_delete_state(fd875f6336c0) at pfsync_delete_state+0x118

pf and pfsync are running without kernel lock, so the mutexes
must have at least mpfloor spl protection.

ok?

bluhm

Index: net/if_pfsync.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.302
diff -u -p -r1.302 if_pfsync.c
--- net/if_pfsync.c 7 Apr 2022 13:38:54 -   1.302
+++ net/if_pfsync.c 11 Apr 2022 15:51:16 -
@@ -315,7 +315,7 @@ pfsyncattach(int npfsync)
 {
if_clone_attach(_cloner);
pfsynccounters = counters_alloc(pfsyncs_ncounters);
-	mq_init(&pfsync_mq, 4096, IPL_SOFTNET);
+	mq_init(&pfsync_mq, 4096, IPL_MPFLOOR);
 }
 
 int
@@ -333,21 +333,21 @@ pfsync_clone_create(struct if_clone *ifc
sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
for (q = 0; q < PFSYNC_S_COUNT; q++)
		TAILQ_INIT(&sc->sc_qs[q]);
-	mtx_init_flags(&sc->sc_st_mtx, IPL_SOFTNET, "st_mtx", 0);
+	mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR);
 
-	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync",
+	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync",
	    NULL);
	TAILQ_INIT(&sc->sc_upd_req_list);
-	mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET);
+	mtx_init(&sc->sc_upd_req_mtx, IPL_MPFLOOR);
	TAILQ_INIT(&sc->sc_deferrals);
-	mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET);
+	mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR);
	timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc);
	task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
	task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
	sc->sc_deferred = 0;

	TAILQ_INIT(&sc->sc_tdb_q);
-	mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET);
+	mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR);
 
sc->sc_len = PFSYNC_MINPKT;
sc->sc_maxupdates = 128;



rate limit uvn_flush warning

2022-04-13 Thread Alexander Bluhm
Hi,

If /usr fills up while relinking the kernel, the console spits out
masses of these warnings.

uvn_flush: obj=0x0, offset=0x476.  error during pageout.
uvn_flush: WARNING: changes to page may be lost!
uvn_flush: obj=0x0, offset=0x476.  error during pageout.
uvn_flush: WARNING: changes to page may be lost!
uvn_flush: obj=0x0, offset=0x476.  error during pageout.
uvn_flush: WARNING: changes to page may be lost!

The machine becomes unusable for several minutes.

I think we should rate limit the printf.  As this is not a hot path,
kernel lock seems best to protect struct timeval lasttime.
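
ratecheck(9) is the existing tool for this; the shape of the fix,
condensed from the diff below:

	static struct timeval lasttime;
	static const struct timeval interval = { 5, 0 };

	KERNEL_LOCK();
	if (ratecheck(&lasttime, &interval))
		printf("...");	/* at most once every five seconds */
	KERNEL_UNLOCK();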

ok?

bluhm

Index: uvm/uvm_vnode.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.121
diff -u -p -r1.121 uvm_vnode.c
--- uvm/uvm_vnode.c 15 Dec 2021 12:53:53 -  1.121
+++ uvm/uvm_vnode.c 13 Apr 2022 15:10:40 -
@@ -744,7 +744,7 @@ ReTry:
 */
 #ifdef DIAGNOSTIC
if (flags & PGO_SYNCIO)
-   panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)");
+   panic("%s: PGO_SYNCIO return 'try again' error (impossible)", __func__);
 #endif
flags |= PGO_SYNCIO;
if (flags & PGO_FREE)
@@ -808,14 +808,22 @@ ReTry:
} else if (flags & PGO_FREE &&
result != VM_PAGER_PEND) {
if (result != VM_PAGER_OK) {
-   printf("uvn_flush: obj=%p, "
-  "offset=0x%llx.  error "
-  "during pageout.\n",
-   pp->uobject,
-   (long long)pp->offset);
-   printf("uvn_flush: WARNING: "
-   "changes to page may be "
-   "lost!\n");
+   static struct timeval lasttime;
+   static const struct timeval interval =
+   { 5, 0 };
+
+				KERNEL_LOCK();
+				if (ratecheck(&lasttime, &interval)) {
+   printf("%s: obj=%p, "
+  "offset=0x%llx.  error "
+  "during pageout.\n",
+   __func__, pp->uobject,
+   (long long)pp->offset);
+   printf("%s: WARNING: "
+   "changes to page may be "
+   "lost!\n", __func__);
+   }
+				KERNEL_UNLOCK();
retval = FALSE;
}
pmap_page_protect(ptmp, PROT_NONE);



Re: refcount btrace

2022-04-12 Thread Alexander Bluhm
On Mon, Apr 11, 2022 at 07:19:00PM +0200, Martin Pieuchot wrote:
> On 08/04/22(Fri) 12:16, Alexander Bluhm wrote:
> > On Fri, Apr 08, 2022 at 02:39:34AM +, Visa Hankala wrote:
> > > On Thu, Apr 07, 2022 at 07:55:11PM +0200, Alexander Bluhm wrote:
> > > > On Wed, Mar 23, 2022 at 06:13:27PM +0100, Alexander Bluhm wrote:
> > > > > In my opinion tracepoints give insight at minimal cost.  It is worth
> > > > > it to have it in GENERIC to make it easy to use.
> > > > 
> > > > After release I want to revive the btrace of refcounts discussion.
> > > > 
> > > > As mpi@ mentioned the idea of dt(4) is to have these trace points
> > > > in GENERIC kernel.  If you want to hunt a bug, just turn it on.
> > > > Refcounting is a common place for bugs; leaks can be detected easily.
> > > > 
> > > > The alternative are some defines that you can compile in and access
> > > > from ddb.  This is more work and you would have to implement it for
> > > > every recount.
> > > > https://marc.info/?l=openbsd-tech=163786435916039=2
> > > > 
> > > > There is no measurable performance difference.  dt(4) is written
> > > > in a way that it is only one additional branch.  At least my goal
> > > > is to add trace points to useful places when we identify them.
> > > 
> > > DT_INDEX_ENTER() still checks the index first, so it has two branches
> > > in practice.
> > > 
> > > I think dt_tracing should be checked first so that it serves as
> > > a gateway to the trace code. Under normal operation, the check's
> > > outcome is always the same, which is easy even for simple branch
> > > predictors.
> > 
> > Reordering the check is easy.  Now dt_tracing is first.
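> > 
> > In pseudocode the two orderings are (probe_enabled() and record()
> > are made-up names, not the dt(4) API):
> > 
> > 	/* before: the index tests run even when tracing is off */
> > 	if (idx > 0 && probe_enabled(idx) && dt_tracing)
> > 		record(idx, args);
> > 
> > 	/* after: dt_tracing gates everything; with tracing off the
> > 	 * branch is always not-taken and predicts well */
> > 	if (dt_tracing && idx > 0 && probe_enabled(idx))
> > 		record(idx, args);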
> > 
> > > I have a slight suspicion that dt(4) is now becoming a way to add code
> > > that would be otherwise unacceptable. Also, how "durable" are trace
> > > points perceived? Is an added trace point an achieved advantage that
> > > is difficult to remove even when its utility has diminished? There is
> > > a similarity to (ad hoc) debug printfs too.
> > 
> > As I understand dt(4) it is a replacement for debug printfs.  But
> > it has advantages.  It can be turned on selectively from userland.
> > It does not spam the console, but can be processed in userland.  It
> > is always there, you don't have to recompile.
> > 
> > Of course you always have the printf or tracepoint at the wrong
> > place.  I think people debugging the code should move them to
> > the useful places.  Then we may end up with a generally useful tool.
> > At least that is my hope.
> > 
> > There are obvious places to debug.  We have syscall entry and return.
> > And I think reference counting is also generally interesting.
> 
> I'm happy if this can help debugging real reference counting issues.  Do
> you have a script that could be committed to /usr/share/btrace to show
> how to track reference counting using these probes?

Script looks like this:

#!/usr/sbin/btrace
tracepoint:refcnt:inpcb{
printf("%s %x %u %+d\n", probe, arg0, arg1, arg2)
}

Note that the output should be -1 instead of +4294967295, but that is
a different problem.

tracepoint:refcnt:inpcb fd80793885c0 2 +1
tracepoint:refcnt:inpcb fd80793885c0 3 +4294967295
tracepoint:refcnt:inpcb fd80793885c0 2 +1
tracepoint:refcnt:inpcb fd80793885c0 3 +4294967295
tracepoint:refcnt:inpcb fd80793885c0 2 +1
tracepoint:refcnt:inpcb fd80793885c0 3 +4294967295
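
The +4294967295 is what a 32-bit -1 looks like after zero-extension
through an unsigned type; in plain C terms (a standalone illustration,
not btrace internals):

	#include <stdio.h>

	int
	main(void)
	{
		int diff = -1;
		unsigned long long arg = (unsigned int)diff; /* zero-extended */

		printf("%+lld\n", (long long)arg);	/* prints +4294967295 */
		printf("%+d\n", (int)arg);		/* prints -1 */
		return 0;
	}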

Or with kernel stack:

#!/usr/sbin/btrace
tracepoint:refcnt:inpcb{
printf("%s %x %u %+d%s\n", probe, arg0, arg1, arg2, kstack)
}

tracepoint:refcnt:inpcb fd80793885c0 3 +4294967295
refcnt_rele+0x88
in_pcbunref+0x24
pf_find_state+0x2a6
pf_test_state+0x172
pf_test+0xd17
ip6_output+0xd14
tcp_output+0x164f
tcp_usrreq+0x386
sosend+0x37c
dofilewritev+0x14d
sys_write+0x51
syscall+0x314
Xsyscall+0x128
kernel
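
For leak hunting the per-object deltas can also be summed up in a map
(a sketch; it assumes sum() and map keys behave in btrace(8) as they
do in bpftrace):

#!/usr/sbin/btrace
tracepoint:refcnt:inpcb {
	@refs[arg0] = sum(arg2)
}

An object whose sum keeps growing is a leak candidate.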

> > Index: dev/dt/dt_prov_static.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
> > retrieving revision 1.13
> > diff -u -p -r1.13 dt_prov_static.c
> > --- dev/dt/dt_prov_static.c 17 Mar 2022 14:53:59 -  1.13
> > +++ dev/dt/dt_prov_static.c 8 Apr 2022 09:40:29 -
> > @@ -87,6 +87,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
> >  DT_STATIC_PROBE0(smr, wakeup);
> >  DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
> >  
> > +/*
> > + * reference counting

OpenBSD Errata: April 11, 2022 (rpki)

2022-04-10 Thread Alexander Bluhm
Errata patches for rpki-client have been released for OpenBSD 6.9
and 7.0.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata69.html
  https://www.openbsd.org/errata70.html



parallel IP forwarding

2022-04-08 Thread Alexander Bluhm
Hi,

Is now the right time to commit the parallel forwarding diff?

Known limitations are:
- Hrvoje has seen a crash with both pfsync and ipsec on his production
  machine.  But he cannot reproduce it in his lab.
- TCP processing gets slower as we have an additional queue between
  IP and protocol layer.
- Protocol layer may starve as 1 exclusive lock is fighting with 4
  shared locks.  This happens only when forwarding a lot.

The advantage of committing is that we see how relevant these things
are in the real world.  But the most important thing is that we learn
how all the locks behave under MP pressure.  You can add a lot of
locking, but only when you run in parallel do you see if it is correct.

An alternative could be to commit it with NET_TASKQ 1.  With only
one softnet thread I would expect to see fewer bugs, but there is
also less to learn.  NET_TASKQ 1 could be a safe point where we
could easily switch back.
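
Switching back later would be the one-line revert of the if.c hunk
below:

-#define	NET_TASKQ	4
+#define	NET_TASKQ	1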

bluhm

Index: net/if.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
retrieving revision 1.649
diff -u -p -r1.649 if.c
--- net/if.c25 Feb 2022 23:51:03 -  1.649
+++ net/if.c29 Mar 2022 12:44:05 -
@@ -237,7 +237,7 @@ int ifq_congestion;
 
 int netisr;
 
-#defineNET_TASKQ   1
+#defineNET_TASKQ   4
 struct taskq   *nettqmp[NET_TASKQ];
 
 struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
@@ -834,15 +834,10 @@ if_input_process(struct ifnet *ifp, stru
 * lists and the socket layer.
 */
 
-   /*
-* XXXSMP IPsec data structures are not ready to be accessed
-* by multiple network threads in parallel.  In this case
-* use an exclusive lock.
-*/
-   NET_LOCK();
+   NET_RLOCK_IN_SOFTNET();
while ((m = ml_dequeue(ml)) != NULL)
(*ifp->if_input)(ifp, m);
-   NET_UNLOCK();
+   NET_RUNLOCK_IN_SOFTNET();
 }
 
 void
@@ -899,6 +894,12 @@ if_netisr(void *unused)
arpintr();
KERNEL_UNLOCK();
}
+#endif
+   if (n & (1 << NETISR_IP))
+   ipintr();
+#ifdef INET6
+   if (n & (1 << NETISR_IPV6))
+   ip6intr();
 #endif
 #if NPPP > 0
if (n & (1 << NETISR_PPP)) {
Index: net/if_ethersubr.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.278
diff -u -p -r1.278 if_ethersubr.c
--- net/if_ethersubr.c  22 Feb 2022 01:15:02 -  1.278
+++ net/if_ethersubr.c  29 Mar 2022 12:44:05 -
@@ -221,7 +221,10 @@ ether_resolve(struct ifnet *ifp, struct 
 
switch (af) {
case AF_INET:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in arpresolve() */
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IP);
@@ -244,7 +247,10 @@ ether_resolve(struct ifnet *ifp, struct 
break;
 #ifdef INET6
case AF_INET6:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
eh->ether_type = htons(ETHERTYPE_IPV6);
@@ -270,13 +276,19 @@ ether_resolve(struct ifnet *ifp, struct 
break;
 #ifdef INET6
case AF_INET6:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in nd6_resolve() */
error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
break;
 #endif
case AF_INET:
+   KERNEL_LOCK();
+   /* XXXSMP there is a MP race in arpresolve() */
error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
+   KERNEL_UNLOCK();
if (error)
return (error);
break;
@@ -528,12 +540,14 @@ ether_input(struct ifnet *ifp, struct mb
case ETHERTYPE_PPPOE:
if (m->m_flags & (M_MCAST | M_BCAST))
goto dropanyway;
+   KERNEL_LOCK();
 #ifdef PIPEX
if (pipex_enable) {
struct pipex_session *session;
 
if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
pipex_pppoe_input(m, session);
+   KERNEL_UNLOCK();
return;
   

Re: refcount btrace

2022-04-08 Thread Alexander Bluhm
On Fri, Apr 08, 2022 at 02:39:34AM +, Visa Hankala wrote:
> On Thu, Apr 07, 2022 at 07:55:11PM +0200, Alexander Bluhm wrote:
> > On Wed, Mar 23, 2022 at 06:13:27PM +0100, Alexander Bluhm wrote:
> > > In my opinion tracepoints give insight at minimal cost.  It is worth
> > > it to have it in GENERIC to make it easy to use.
> > 
> > After release I want to revive the btrace of refcounts discussion.
> > 
> > As mpi@ mentioned the idea of dt(4) is to have these trace points
> > in GENERIC kernel.  If you want to hunt a bug, just turn it on.
> > Refcounting is a common place for bugs; leaks can be detected easily.
> > 
> > The alternative is some defines that you can compile in and access
> > from ddb.  This is more work and you would have to implement it for
> > every refcnt.
> > https://marc.info/?l=openbsd-tech&m=163786435916039&w=2
> > 
> > There is no measurable performance difference.  dt(4) is written
> > in a way that it is only one additional branch.  At least my goal
> > is to add trace points to useful places when we identify them.
> 
> DT_INDEX_ENTER() still checks the index first, so it has two branches
> in practice.
> 
> I think dt_tracing should be checked first so that it serves as
> a gateway to the trace code. Under normal operation, the check's
> outcome is always the same, which is easy even for simple branch
> predictors.

Reordering the check is easy.  Now dt_tracing is first.

> I have a slight suspicion that dt(4) is now becoming a way to add code
> that would be otherwise unacceptable. Also, how "durable" are trace
> points perceived? Is an added trace point an achieved advantage that
> is difficult to remove even when its utility has diminished? There is
> a similarity to (ad hoc) debug printfs too.

As I understand dt(4) it is a replacement for debug printfs.  But
it has advantages.  It can be turned on selectively from userland.
It does not spam the console, but can be processed in userland.  It
is always there, you don't have to recompile.

Of course you always have the printf or tracepoint at the wrong
place.  I think people debugging the code should move them to
the useful places.  Then we may end up with a generally useful tool.
At least that is my hope.

There are obvious places to debug.  We have syscall entry and return.
And I think reference counting is also generally interesting.
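
On a GENERIC kernel all it takes is sysctl kern.allowdt=1 and a small
script.  A sketch that counts which code paths touch the tdb refcnt
(assuming kstack works as a map key, as in bpftrace):

#!/usr/sbin/btrace
tracepoint:refcnt:tdb {
	@paths[kstack] = count()
}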

bluhm

Index: dev/dt/dt_prov_static.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.13
diff -u -p -r1.13 dt_prov_static.c
--- dev/dt/dt_prov_static.c 17 Mar 2022 14:53:59 -  1.13
+++ dev/dt/dt_prov_static.c 8 Apr 2022 09:40:29 -
@@ -87,6 +87,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
 DT_STATIC_PROBE0(smr, wakeup);
 DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
 
+/*
+ * reference counting
+ */
+DT_STATIC_PROBE0(refcnt, none);
+DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
+DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
 
 /*
  * List of all static probes
@@ -127,15 +133,24 @@ struct dt_probe *const dtps_static[] = {
&_DT_STATIC_P(smr, barrier_exit),
&_DT_STATIC_P(smr, wakeup),
&_DT_STATIC_P(smr, thread),
+   /* refcnt */
+   &_DT_STATIC_P(refcnt, none),
+   &_DT_STATIC_P(refcnt, inpcb),
+   &_DT_STATIC_P(refcnt, tdb),
 };
 
+struct dt_probe *const *dtps_index_refcnt;
+
 int
 dt_prov_static_init(void)
 {
int i;
 
-   for (i = 0; i < nitems(dtps_static); i++)
+   for (i = 0; i < nitems(dtps_static); i++) {
+   if (dtps_static[i] == &_DT_STATIC_P(refcnt, none))
> > +   dtps_index_refcnt = &dtps_static[i];
dt_dev_register_probe(dtps_static[i]);
+   }
 
return i;
 }
Index: dev/dt/dtvar.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 dtvar.h
--- dev/dt/dtvar.h  27 Feb 2022 10:14:01 -  1.13
+++ dev/dt/dtvar.h  8 Apr 2022 09:42:19 -
@@ -313,11 +313,30 @@ extern volatile uint32_t  dt_tracing; /* 
 #defineDT_STATIC_ENTER(func, name, args...) do {   
\
extern struct dt_probe _DT_STATIC_P(func, name);\
struct dt_probe *dtp = &_DT_STATIC_P(func, name);   \
-   struct dt_provider *dtpv = dtp->dtp_prov;   \
\
if (__predict_false(dt_tracing) &&  

Re: refcount btrace

2022-04-07 Thread Alexander Bluhm
On Wed, Mar 23, 2022 at 06:13:27PM +0100, Alexander Bluhm wrote:
> In my opinion tracepoints give insight at minimal cost.  It is worth
> it to have it in GENERIC to make it easy to use.

After release I want to revive the btrace of refcounts discussion.

As mpi@ mentioned the idea of dt(4) is to have these trace points
in GENERIC kernel.  If you want to hunt a bug, just turn it on.
Refcounting is a common place for bugs; leaks can be detected easily.

The alternative is some defines that you can compile in and access
from ddb.  This is more work and you would have to implement it for
every refcnt.
https://marc.info/?l=openbsd-tech&m=163786435916039&w=2

There is no measurable performance difference.  dt(4) is written
in a way that it is only one additional branch.  At least my goal
is to add trace points to useful places when we identify them.

ok?

bluhm

Index: dev/dt/dt_prov_static.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.13
diff -u -p -r1.13 dt_prov_static.c
--- dev/dt/dt_prov_static.c 17 Mar 2022 14:53:59 -  1.13
+++ dev/dt/dt_prov_static.c 7 Apr 2022 17:32:23 -
@@ -87,6 +87,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
 DT_STATIC_PROBE0(smr, wakeup);
 DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
 
+/*
+ * reference counting
+ */
+DT_STATIC_PROBE0(refcnt, none);
+DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
+DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
 
 /*
  * List of all static probes
@@ -127,15 +133,24 @@ struct dt_probe *const dtps_static[] = {
&_DT_STATIC_P(smr, barrier_exit),
&_DT_STATIC_P(smr, wakeup),
&_DT_STATIC_P(smr, thread),
+   /* refcnt */
+   &_DT_STATIC_P(refcnt, none),
+   &_DT_STATIC_P(refcnt, inpcb),
+   &_DT_STATIC_P(refcnt, tdb),
 };
 
+struct dt_probe *const *dtps_index_refcnt;
+
 int
 dt_prov_static_init(void)
 {
int i;
 
-   for (i = 0; i < nitems(dtps_static); i++)
+   for (i = 0; i < nitems(dtps_static); i++) {
+   if (dtps_static[i] == &_DT_STATIC_P(refcnt, none))
+   dtps_index_refcnt = &dtps_static[i];
dt_dev_register_probe(dtps_static[i]);
+   }
 
return i;
 }
Index: dev/dt/dtvar.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 dtvar.h
--- dev/dt/dtvar.h  27 Feb 2022 10:14:01 -  1.13
+++ dev/dt/dtvar.h  7 Apr 2022 17:41:55 -
@@ -313,11 +313,30 @@ extern volatile uint32_t  dt_tracing; /* 
 #defineDT_STATIC_ENTER(func, name, args...) do {   
\
extern struct dt_probe _DT_STATIC_P(func, name);\
struct dt_probe *dtp = &_DT_STATIC_P(func, name);   \
-   struct dt_provider *dtpv = dtp->dtp_prov;   \
\
if (__predict_false(dt_tracing) &&  \
__predict_false(dtp->dtp_recording)) {  \
+   struct dt_provider *dtpv = dtp->dtp_prov;   \
+   \
dtpv->dtpv_enter(dtpv, dtp, args);  \
+   }   \
+} while (0)
+
+#define _DT_INDEX_P(func)  (dtps_index_##func)
+
+#define DT_INDEX_ENTER(func, index, args...) do {  \
+   extern struct dt_probe **_DT_INDEX_P(func); \
+   \
+   if (__predict_false(index > 0) &&   \
+   __predict_false(dt_tracing) &&  \
+   __predict_true(_DT_INDEX_P(func) != NULL)) {\
+   struct dt_probe *dtp = _DT_INDEX_P(func)[index];\
+   \
+   if(__predict_false(dtp->dtp_recording)) {   \
+   struct dt_provider *dtpv = dtp->dtp_prov;   \
+   \
+   dtpv->dtpv_enter(dtpv, dtp, args);  \
+   }   \
}   \
 } while (0)
 
Index: kern/kern_synch.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
retrieving
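
The consumer side in kern_synch.c then looks roughly like this (a
sketch, the actual hunk may differ; r_traceidx is the dt index kept in
struct refcnt):

void
refcnt_take(struct refcnt *r)
{
	u_int refs;

	refs = atomic_inc_int_nv(&r->r_refs);
	KASSERT(refs != 0);
	/* report object, new counter and change value to dt(4) */
	DT_INDEX_ENTER(refcnt, r->r_traceidx, r, refs, +1);
}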

Re: pfsync(4) snapshot lists must have dedicated link element

2022-04-07 Thread Alexander Bluhm
On Wed, Apr 06, 2022 at 05:01:55PM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> Hrvoje was testing pf(4) and pfsync(4) with parallel forwarding diff
> which bluhm@ has shared sometime ago.
> 
> Hrvoje found a bug in my very naive implementation of 'snapshots':
> 
> 1573 void
> 1574 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
> 1575 {
> 1576 int q;
> 1577 
> 1578 sn->sn_sc = sc;
> 1579 
> 1580 mtx_enter(&sc->sc_st_mtx);
> 1581 mtx_enter(&sc->sc_upd_req_mtx);
> 1582 mtx_enter(&sc->sc_tdb_mtx);
> 1583 
> 1584 for (q = 0; q < PFSYNC_S_COUNT; q++) {
> 1585 TAILQ_INIT(&sn->sn_qs[q]);
> 1586 TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> 1587 }
> 1588 
> 1589 TAILQ_INIT(&sn->sn_upd_req_list);
> 1590 TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, 
> ur_entry);
> 1591 
> 1592 TAILQ_INIT(&sn->sn_tdb_q);
> 1593 TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> 1594 
> 
> 
> the problem with the code above is that we only take care of the heads of various
> queues. However individual objects may get re-inserted into a queue on behalf of
> a state update. This creates havoc. The proposed change introduces a dedicated
> link member for the snapshot, so we can move elements from sync_list to
> snapshot_list.
> 
> The diff below does not hurt pfsync(4) in the current tree, because
> we still don't forward packets in parallel. It will just make
> things a bit easier for Hrvoje et al. so we can keep a smaller diff
> against the current tree.
> 
> 
> OK ?

I think there is a use after free in your diff.  After you return
from pfsync_delete_tdb() you must not access the TDB again.
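
The safe shape is to finish all accesses to the tdb before the final
release, roughly (a sketch; tdb_rele() stands in for whatever call may
free the tdb):

	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
		t->tdb_snapped = 0;	/* last access to t */
		tdb_rele(t);		/* may free t, nothing after this */
	}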

Comments inline.

> thanks and
> regards
> sashan
> 
> 8<---8<---8<--8<
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index cb0f3fbdf52..161f8c89317 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -181,6 +181,7 @@ void  pfsync_q_del(struct pf_state *);
>  
>  struct pfsync_upd_req_item {
>   TAILQ_ENTRY(pfsync_upd_req_item)ur_entry;
> + TAILQ_ENTRY(pfsync_upd_req_item)ur_snap;
>   struct pfsync_upd_req   ur_msg;
>  };
>  TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
> @@ -295,7 +296,7 @@ void  pfsync_bulk_update(void *);
>  void pfsync_bulk_fail(void *);
>  
>  void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> +void pfsync_drop_snapshot(struct pfsync_snapshot *);
>  
>  void pfsync_send_dispatch(void *);
>  void pfsync_send_pkt(struct mbuf *);
> @@ -1574,6 +1575,9 @@ void
>  pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
>  {
>   int q;
> + struct pf_state *st;
> + struct pfsync_upd_req_item *ur;
> + struct tdb *tdb;
>  
>   sn->sn_sc = sc;
>  
> @@ -1583,14 +1587,33 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>   TAILQ_INIT(&sn->sn_qs[q]);
> - TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> +
> + while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
> +#ifdef PFSYNC_DEBUG
> + KASSERT(st->snapped == 0);
> +#endif
> + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> + TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
> + st->snapped = 1;
> + }
>   }
>  
>   TAILQ_INIT(&sn->sn_upd_req_list);
> - TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
> + while (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
> + ur = TAILQ_FIRST(&sc->sc_upd_req_list);

Other loops have this idiom:
while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {

> + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
> + TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
> + }
>  
>   TAILQ_INIT(&sn->sn_tdb_q);
> - TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> + while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
> +#ifdef PFSYNC_DEBUG
> + KASSERT(tdb->snapped == 0);
> +#endif
> + TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
> + TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
> + tdb->tdb_snapped = 1;
> + }
>  
>   sn->sn_len = sc->sc_len;
>   sc->sc_len = PFSYNC_MINPKT;
> @@ -1606,41 +1629,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, 
> struct pfsync_softc *sc)
>  }
>  
>  void
> -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc)
> +pfsync_drop_snapshot(struct pfsync_snapshot *sn)
>  {
>   struct pf_state *st;
>   struct pfsync_upd_req_item *ur;
>   struct tdb *t;
>   int q;
>  
> -
>   for (q = 0; q < PFSYNC_S_COUNT; q++) {
>   if (TAILQ_EMPTY(&sn->sn_qs[q]))
>   continue;
>  
>   

OpenBSD Errata: April 5, 2022 (syszlib)

2022-04-04 Thread Alexander Bluhm
Errata patches for zlib in the kernel have been released for OpenBSD
6.9 and 7.0.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata69.html
  https://www.openbsd.org/errata70.html



OpenBSD Errata: April 1, 2022 (zlib)

2022-04-01 Thread Alexander Bluhm
Errata patches for zlib have been released for OpenBSD 6.9 and 7.0.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata69.html
  https://www.openbsd.org/errata70.html



Re: refcount btrace

2022-03-23 Thread Alexander Bluhm
On Mon, Mar 21, 2022 at 01:22:22PM +0100, Martin Pieuchot wrote:
> On 20/03/22(Sun) 05:39, Visa Hankala wrote:
> > On Sat, Mar 19, 2022 at 12:10:11AM +0100, Alexander Bluhm wrote:
> > > On Thu, Mar 17, 2022 at 07:25:27AM +, Visa Hankala wrote:
> > > > On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> > > > > I would like to use btrace to debug refernce counting.  The idea
> > > > > is to a a tracepoint for every type of refcnt we have.  When it
> > > > > changes, print the actual object, the current counter and the change
> > > > > value.
> > > > 
> > > > > Do we want that feature?
> > > > 
> > > > I am against this in its current form. The code would become more
> > > > complex, and the trace points can affect timing. There is a risk that
> > > > the kernel behaves slightly differently when dt has been compiled in.
> > > 
> > > On our main architectures dt(4) is in GENERIC.  I see your timing
> > > point for uvm structures.
> > 
> > In my opinion, having dt(4) enabled by default is another reason why
> > there should be no carte blanche for adding trace points. Each trace
> > point adds a tiny amount of bloat. Few users will use the tracing
> > facility.
> > 
> > Maybe high-rate trace points could be behind a build option...
> 
> The whole point of dt(4) is to be able to debug GENERIC kernel.  I doubt
> the cost of an additional if () block matters.

The idea of dt(4) is that a developer or end user with instructions
can debug a running kernel without recompiling.  So we have to put
trace points at places where we gain much information.

I did some measurements with and without dt.  Note that I configure
my test machines with sysctl kern.allowdt=1.  I had to disable it
in the kernel diff.

http://bluhm.genua.de/perform/results/2022-03-21T09%3A08%3A37Z/perform.html

I see differences from moving the kernel objects.  Even rebooting and
testing again has more variance than dt(4).

The story is different when btrace(8) is actually running.  Look
at the numbers in the right column.

http://bluhm.genua.de/perform/results/2022-03-21T09%3A08%3A37Z/2022-03-21T00%3A00%3A00Z/perform.html

For the network test it does not matter, as our IP stack uses only
one or maybe two cores.  On a 4 core machine btrace userland can
use 1 core.  When compiling the kernel in the "make-bsd-j4" test
row, the build time goes up as btrace takes CPU time from the
compiler.

In my opinion tracepoints give insight at minimal cost.  It is worth
it to have it in GENERIC to make it easy to use.

bluhm



Re: fix very small ntpd leak

2022-03-23 Thread Alexander Bluhm
On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote:
> We noticed that the ntpd engine process was getting a bit big on some boxes
> that we'd accidentally cut off from the ntp servers (routing is hard).
> Reading through the code, I noticed the 'query' member of struct ntp_peer
> is never freed, which seems to account for the leak.
> 
> If you have a server pool in ntpd.conf and it resolves, but ntpd is unable
> to talk to the servers, it will re-resolve periodically, freeing the old list
> of peers and creating new ones.
> 
> To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D
> after running for about two hours with four servers from two pools.
> 
> without diff:
>  
> Leak report
>  f sum  #avg
>0x09392128 73
>  0x889878b920b 512  1512
>  0x889878bc8e14096  4   1024
>  0x889878bd065 128  2 64
>  0x88bc91f0b4b   18280  1  18280
>  0x88bc926a9ed   65536  1  65536
>  
>  
> with diff:
>  
> Leak report
>  f sum  #avg
>0x06064 16379
>  0xbee1253320b 512  1512
>  0xbf0265f4b4b   18280  1  18280
>  0xbf02666e9ed   65536  1  65536
> 
> ok?

OK bluhm@

> Index: ntp.c
> ===
> RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 ntp.c
> --- ntp.c 24 Oct 2021 21:24:19 -  1.168
> +++ ntp.c 23 Mar 2022 10:43:59 -
> @@ -686,6 +686,7 @@ void
>  peer_remove(struct ntp_peer *p)
>  {
>   TAILQ_REMOVE(>ntp_peers, p, entry);
> + free(p->query);
>   free(p);
>   peer_cnt--;
>  }



pfioctl goto fail

2022-03-23 Thread Alexander Bluhm
Hi,

pfioctl() is inconsistent about when to use break or goto fail.  There
is a big switch, and when looking at a break you need more context
to see where it jumps to.

I would like to use goto fail consistently to leave the big switch.
break is used for inner switches and loops.
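
A toy model of the resulting control flow (not the pf code itself):

	#include <sys/errno.h>

	int
	ioctl_model(u_long cmd, int busy)
	{
		int error = 0;

		switch (cmd) {
		case 1:
			if (busy) {
				error = EBUSY;
				goto fail;	/* leave the big switch */
			}
			/* inner switches and loops keep using break */
			break;
		default:
			error = ENOTTY;
			break;
		}
	fail:
		return (error);
	}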

No binary diff.

ok?

bluhm

Index: net/pf_ioctl.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.374
diff -u -p -r1.374 pf_ioctl.c
--- net/pf_ioctl.c  23 Mar 2022 09:01:59 -  1.374
+++ net/pf_ioctl.c  23 Mar 2022 14:36:30 -
@@ -1217,7 +1217,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
error = EBUSY;
PF_UNLOCK();
NET_UNLOCK();
-   break;
+   goto fail;
}
 
/* save state to not run over them all each time? */
@@ -1228,7 +1228,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
error = EBUSY;
PF_UNLOCK();
NET_UNLOCK();
-   break;
+   goto fail;
}
memcpy(&pq->queue, qs, sizeof(pq->queue));
PF_UNLOCK();
@@ -1248,7 +1248,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
error = EBUSY;
PF_UNLOCK();
NET_UNLOCK();
-   break;
+   goto fail;
}
nbytes = pq->nbytes;
nr = 0;
@@ -1261,7 +1261,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
error = EBUSY;
PF_UNLOCK();
NET_UNLOCK();
-   break;
+   goto fail;
}
memcpy(&pq->queue, qs, sizeof(pq->queue));
/* It's a root flow queue but is not an HFSC root class */
@@ -1286,7 +1286,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
qs = pool_get(&pf_queue_pl, PR_WAITOK|PR_LIMITFAIL|PR_ZERO);
if (qs == NULL) {
error = ENOMEM;
-   break;
+   goto fail;
}
 
NET_LOCK();
@@ -1296,7 +1296,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
PF_UNLOCK();
NET_UNLOCK();
pool_put(&pf_queue_pl, qs);
-   break;
+   goto fail;
}
memcpy(qs, &pq->queue, sizeof(*qs));
qs->qid = pf_qname2qid(qs->qname, 1);
@@ -1305,7 +1305,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
PF_UNLOCK();
NET_UNLOCK();
pool_put(&pf_queue_pl, qs);
-   break;
+   goto fail;
}
if (qs->parent[0] && (qs->parent_qid =
pf_qname2qid(qs->parent, 0)) == 0) {
@@ -1313,7 +1313,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
PF_UNLOCK();
NET_UNLOCK();
pool_put(&pf_queue_pl, qs);
-   break;
+   goto fail;
}
qs->kif = pfi_kif_get(qs->ifname, NULL);
if (qs->kif == NULL) {
@@ -1321,7 +1321,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
PF_UNLOCK();
NET_UNLOCK();
pool_put(&pf_queue_pl, qs);
-   break;
+   goto fail;
}
/* XXX resolve bw percentage specs */
pfi_kif_ref(qs->kif, PFI_KIF_REF_RULE);
@@ -1341,20 +1341,20 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
rule = pool_get(&pf_rule_pl, PR_WAITOK|PR_LIMITFAIL|PR_ZERO);
if (rule == NULL) {
error = ENOMEM;
-   break;
+   goto fail;
}
 
if ((error = pf_rule_copyin(&pr->rule, rule))) {
pf_rule_free(rule);
rule = NULL;
-   break;
+   goto fail;
}
 
if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
pf_rule_free(rule);
rule = NULL;
-   break;
+   goto fail;
}
if ((error = pf_rule_checkaf(rule))) {
pf_rule_free(rule);
@@ -1366,14 +1366,14 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
error = EINVAL;
pf_rule_free(rule);
rule = NULL;
-   break;
+   goto fail;
 

Re: introduce pfioctl_rw

2022-03-22 Thread Alexander Bluhm
On Mon, Mar 21, 2022 at 11:48:48PM +0100, Alexandr Nedvedicky wrote:
> OK?

I did a regress run with witness.  OK bluhm@

> 8<---8<---8<--8<
> diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
> index dbbc79c0a0e..329284ce6a6 100644
> --- a/sys/net/pf_ioctl.c
> +++ b/sys/net/pf_ioctl.c
> @@ -150,6 +150,7 @@ TAILQ_HEAD(pf_tags, pf_tagname)   pf_tags = 
> TAILQ_HEAD_INITIALIZER(pf_tags),
>   */
>  struct rwlock pf_lock = RWLOCK_INITIALIZER("pf_lock");
>  struct rwlock pf_state_lock = 
> RWLOCK_INITIALIZER("pf_state_lock");
> +struct rwlock pfioctl_rw = RWLOCK_INITIALIZER("pfioctl_rw");
>  
>  #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
>  #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
> @@ -1142,6 +1143,11 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
> flags, struct proc *p)
>   return (EACCES);
>   }
>  
> + if (flags & FWRITE)
> + rw_enter_write(&pfioctl_rw);
> + else
> + rw_enter_read(&pfioctl_rw);
> +
>   switch (cmd) {
>  
>   case DIOCSTART:
> @@ -2945,8 +2951,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
> flags, struct proc *p)
>   case DIOCSETIFFLAG: {
>   struct pfioc_iface *io = (struct pfioc_iface *)addr;
>  
> - if (io == NULL)
> - return (EINVAL);
> + if (io == NULL) {
> + error = EINVAL;
> + break;
> + }
>  
>   NET_LOCK();
>   PF_LOCK();
> @@ -2959,8 +2967,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
> flags, struct proc *p)
>   case DIOCCLRIFFLAG: {
>   struct pfioc_iface *io = (struct pfioc_iface *)addr;
>  
> - if (io == NULL)
> - return (EINVAL);
> + if (io == NULL) {
> + error = EINVAL;
> + break;
> + }
>  
>   NET_LOCK();
>   PF_LOCK();
> @@ -3020,6 +3030,11 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int 
> flags, struct proc *p)
>   break;
>   }
>  fail:
> + if (flags & FWRITE)
> + rw_exit_write(&pfioctl_rw);
> + else
> + rw_exit_read(&pfioctl_rw);
> +
>   return (error);
>  }
>  



Re: rip sbappendaddr() with inpcb table mutex

2022-03-22 Thread Alexander Bluhm
On Tue, Mar 22, 2022 at 04:42:45PM +0100, Claudio Jeker wrote:
> No but you push this layer into a specifc direction and by that make it
> harder to fix the PCB tables in a different way. I just see people
> changing the NET_ASSERT_WLOCKED() without realizing the actual reason for
> the exclusive netlock use.

Of course MP for PCB could be implemented differently.  SRP, SMR,
whatever.  But mutex is easy and we should start with that.  As we
currently run only on one CPU, it does not matter.

There is much more to be done like ref counting and protecting the
PCB fields.  But I want to go in small steps.  This NET_ASSERT_WLOCKED()
beside SIMPLEQ_INIT() makes it quite obvious where the next unlocking
problem is.  Look in netinet/ip_ipsp.c tdb_walk(), there is another
one.  When they are only left in the slow path a lot is gained.
And if not, we have to fix them step by step.

> Looking at the pcb hash problem, I have to wonder if this reinserting of
> PCBs is actually resulting in a measurable performance difference. The
> hash table should be large enough to keep the number of PCB per bucket low. 

The reinsertion is done for PCB notify, UDP multicast, Raw IPv4 and
IPv6.  I don't have benchmarks for these cases and I doubt that
others will feel much difference there.

There is one thing that might make things slower.  The in_pcbref(inp)/
in_pcbunref(inp) pair is not strictly necessary, as we have the
exclusive net lock.  But I put it there so we will not forget it when
unlocking.
Maybe it costs a bit of performance, but who cares about multicast
and rip.

I have measured the PCB mutex diff.  But as usual benchmarks have
to be explained.

http://bluhm.genua.de/perform/results/2022-03-10T17%3A19%3A00Z/perform.html

Left column is baseline, middle column is a mistake, where I only
applied the IPv4 part.  Right column is the full PCB diff, but
without UDP Multicast and Raw IP queuing, which I missed before.

In TCP Perfomance graph (IPv4 only), the right column looks slow.
But the code difference between middle and right only affects IPv6.
How can that be?  The answer is in this row:

kernel name list+53 -52 +40054 -40054

Mapping the kernel object files to different pages affects throughput
a lot, more than most diffs.  So I sort and align them and compare
the nm /bsd output.  When you click on it you see how 4 symbol
addresses move around.

So the more or less correct numbers are in the middle column but
only for IPv4.

Look at the kstack output of one TCP benchmark.

http://bluhm.genua.de/perform/results/2022-03-10T17%3A19%3A00Z/patch-sys-pcbtable-mtx.0/btrace/iperf3_-c10.3.45.35_-w1m_-t10_-R-btrace-kstack.0.svg

Search for mtx in the right top field.  Then you see mutex contention.
It is not in PCB lookup as the pf state links to the socket.

When receiving short UDP packets you can see the affected code:

http://bluhm.genua.de/perform/results/2022-03-10T17%3A19%3A00Z/patch-sys-pcbtable-mtx.0/btrace/udpbench_-l36_-t10_-r_ot15_recv_10.3.45.34-btrace-kstack.0.svg

We are 4.3% in PCB lookup.  And in that part you find 10% in mutex.

Compare it to the original code, where this mutex is not there:

http://bluhm.genua.de/perform/results/2022-03-10T17%3A19%3A00Z/2022-03-10T00%3A00%3A00Z/btrace/udpbench_-l36_-t10_-r_ot15_recv_10.3.45.34-btrace-kstack.0.svg

But UDP throughput numbers do not change.


> One comment below.
> At least the one bit that can fail can be moved outside of this loop:

This is a very good idea and can be done upfront.

ok?

bluhm

Index: netinet6/raw_ip6.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
retrieving revision 1.145
diff -u -p -r1.145 raw_ip6.c
--- netinet6/raw_ip6.c  21 Mar 2022 09:12:34 -  1.145
+++ netinet6/raw_ip6.c  22 Mar 2022 16:22:43 -
@@ -125,10 +125,19 @@ rip6_input(struct mbuf **mp, int *offp, 
struct in6_addr *key;
struct sockaddr_in6 rip6src;
struct mbuf *opts = NULL;
+   uint8_t type;
 
KASSERT(af == AF_INET6);
 
-   if (proto != IPPROTO_ICMPV6)
+   if (proto == IPPROTO_ICMPV6) {
+   struct icmp6_hdr *icmp6;
+
+   IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
+   sizeof(*icmp6));
+   if (icmp6 == NULL)
+   return IPPROTO_DONE;
+   type = icmp6->icmp6_type;
+   } else
rip6stat_inc(rip6s_ipackets);
 
bzero(&rip6src, sizeof(rip6src));
@@ -177,16 +186,7 @@ rip6_input(struct mbuf **mp, int *offp, 
!IN6_ARE_ADDR_EQUAL(&in6p->inp_faddr6, &ip6->ip6_src))
continue;
if (proto == IPPROTO_ICMPV6 && in6p->inp_icmp6filt) {
-   struct icmp6_hdr *icmp6;
-
-   IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, *offp,
-   sizeof(*icmp6));
-   if (icmp6 == NULL) {
-   

Re: if_get NULL race arp, nd6, igmp

2022-03-22 Thread Alexander Bluhm
anyone?

On Fri, Mar 04, 2022 at 12:09:03PM +0100, Alexander Bluhm wrote:
> Hi,
> 
> syzkaller has found this race in arp.
> 
> https://syzkaller.appspot.com/bug?id=e3dc94533ddee95b6d69c2e7049360022f4190d3
> 
> The assumption of the code is that either the arp entry or the
> interface is removed.
> 
> But in if_detach() if_remove() is called without net lock and all
> arp entries are removed later in in_ifdetach() -> in_purgeaddr()
> -> rt_ifa_purge() -> rtdeletemsg().
> 
> When the arp timeout fires while if_detach() is between if_remove()
> and NET_LOCK() then arptfree() has do deal with partially destroyed
> interfaces.  We can skip rtdeletemsg() as if_detach() will take
> care of it.
> 
> 
> While syzkaller has not found it, nd6 has to deal with the same
> problem.  Make nd6_free() similar to arptfree().
> 
> 
> This crash may have the same source of problem.
> 
> https://syzkaller.appspot.com/bug?id=9649f7319437a49298a38572b83f38f0b7d37fbe
> 
> if_detach() does if_remove(ifp); NET_LOCK(); rti_delete().  So new
> igmp groups may appear during interface destruction.  igmp_joingroup()
> does not call rti_fill() as if_get() fails.  Then inm->inm_rti may
> be NULL.  This is the condition when syzkaller crashes in
> igmp_leavegroup().
> 
> When we pass the ifp this CPU is already holding, we avoid half
> constructed igmp groups.  Calling if_get() multiple times in caller
> and callee makes no sense anyway.
> 
> ok?  partial ok for one of the fixes also welcome.
> 
> bluhm
> 
> 
> Index: netinet/if_ether.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/if_ether.c,v
> retrieving revision 1.248
> diff -u -p -r1.248 if_ether.c
> --- netinet/if_ether.c28 Apr 2021 21:21:44 -  1.248
> +++ netinet/if_ether.c3 Mar 2022 23:31:55 -
> @@ -722,7 +722,9 @@ arptfree(struct rtentry *rt)
>   arpinvalidate(rt);
>  
>   ifp = if_get(rt->rt_ifidx);
> - KASSERT(ifp != NULL);
> + if (ifp == NULL)
> + return;
> +
>   if (!ISSET(rt->rt_flags, RTF_STATIC|RTF_CACHED))
>   rtdeletemsg(rt, ifp, ifp->if_rdomain);
>   if_put(ifp);
> Index: netinet/igmp.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/igmp.c,v
> retrieving revision 1.77
> diff -u -p -r1.77 igmp.c
> --- netinet/igmp.c15 Dec 2021 15:58:01 -  1.77
> +++ netinet/igmp.c3 Mar 2022 23:58:32 -
> @@ -483,17 +483,14 @@ igmp_input_if(struct ifnet *ifp, struct 
>  }
>  
>  void
> -igmp_joingroup(struct in_multi *inm)
> +igmp_joingroup(struct in_multi *inm, struct ifnet *ifp)
>  {
> - struct ifnet* ifp;
>   int i;
>  
> - ifp = if_get(inm->inm_ifidx);
> -
>   inm->inm_state = IGMP_IDLE_MEMBER;
>  
>   if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
> - ifp && (ifp->if_flags & IFF_LOOPBACK) == 0) {
> + (ifp->if_flags & IFF_LOOPBACK) == 0) {
>   i = rti_fill(inm);
>   igmp_sendpkt(ifp, inm, i, 0);
>   inm->inm_state = IGMP_DELAYING_MEMBER;
> @@ -502,22 +499,16 @@ igmp_joingroup(struct in_multi *inm)
>   igmp_timers_are_running = 1;
>   } else
>   inm->inm_timer = 0;
> -
> - if_put(ifp);
>  }
>  
>  void
> -igmp_leavegroup(struct in_multi *inm)
> +igmp_leavegroup(struct in_multi *inm, struct ifnet *ifp)
>  {
> - struct ifnet* ifp;
> -
> - ifp = if_get(inm->inm_ifidx);
> -
>   switch (inm->inm_state) {
>   case IGMP_DELAYING_MEMBER:
>   case IGMP_IDLE_MEMBER:
>   if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
> - ifp && (ifp->if_flags & IFF_LOOPBACK) == 0)
> + (ifp->if_flags & IFF_LOOPBACK) == 0)
>   if (inm->inm_rti->rti_type != IGMP_v1_ROUTER)
>   igmp_sendpkt(ifp, inm,
>   IGMP_HOST_LEAVE_MESSAGE,
> @@ -528,7 +519,6 @@ igmp_leavegroup(struct in_multi *inm)
>   case IGMP_SLEEPING_MEMBER:
>   break;
>   }
> - if_put(ifp);
>  }
>  
>  void
> Index: netinet/igmp_var.h
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/igmp_var.h,v
> retrieving revision 1.14
> diff -u -p -r1.14 igmp_var.h
> --- netinet/igmp_var.h17 Aug 2020 16:25:34 -  1.14
> +++ netinet/igmp_var.h4 Mar 2022 00:02:45 -
> 

Re: rip sbappendaddr() with inpcb table mutex

2022-03-22 Thread Alexander Bluhm
On Tue, Mar 22, 2022 at 02:25:08PM +0100, Claudio Jeker wrote:
> On Tue, Mar 22, 2022 at 02:09:51PM +0100, Alexander Bluhm wrote:
> > Hi,
> > 
> > syzkaller and witness found the same bug I introduced in UDP also
> > for Raw IP.  Fix it the same way for rip and rip6.
> > 
> > https://syzkaller.appspot.com/bug?extid=9bac6356a881dc644265
> > https://syzkaller.appspot.com/bug?extid=5b2679ee9be0895d26f9
> > 
> > ok?
> 
> Absolutly not a fan of this "fix". It just moves the landmine that is
> about to explode a bit further to the left for the next person to step on.
> The moment someone tries to run these input handlers in parallel all of
> this will blow up. It is a workaround for now but how will we get out of
> this in the future when the code runs in parallel up to the socket
> layer?

Moving the problem around is the only way to make any progress.

The bug with MP forwarding I try to solve is this one.
https://marc.info/?l=openbsd-tech=163857624429253=2

After 4 months of ideas that were denied by different people, I
came to this solution.  Put a mutex around PCB tables.  I think
this is necessary anyway if we want to reach parallel protocol
processing.  Unfortunately I missed 3 of 4 places where I hold the
mutex too long.  I am trying to fix the last 2 of them.

I do not want to delay parallel forwarding until the parallel protocol
layer is finished.  Then neither will happen.  If someone is working
on parallel protocols, this code will blow up due to NET_ASSERT_WLOCKED().
It has to be fixed then.  My change is delaying work to make progress
elsewhere.  We cannot solve everything in a big commit.

Do you have a better idea?

bluhm

> > Index: netinet/raw_ip.c
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
> > retrieving revision 1.125
> > diff -u -p -r1.125 raw_ip.c
> > --- netinet/raw_ip.c21 Mar 2022 09:12:34 -  1.125
> > +++ netinet/raw_ip.c22 Mar 2022 12:59:05 -
> > @@ -122,9 +122,9 @@ rip_input(struct mbuf **mp, int *offp, i
> >  {
> > struct mbuf *m = *mp;
> > struct ip *ip = mtod(m, struct ip *);
> > -   struct inpcb *inp, *last = NULL;
> > +   struct inpcb *inp;
> > +   SIMPLEQ_HEAD(, inpcb) inpcblist;
> > struct in_addr *key;
> > -   struct mbuf *opts = NULL;
> > struct counters_ref ref;
> > uint64_t *counters;
> >  
> > @@ -150,7 +150,8 @@ rip_input(struct mbuf **mp, int *offp, i
> > }
> > }
> >  #endif
> > -   NET_ASSERT_LOCKED();
> > +   NET_ASSERT_WLOCKED();
> > +   SIMPLEQ_INIT();
> > mtx_enter(_mtx);
> > TAILQ_FOREACH(inp, _queue, inp_queue) {
> > if (inp->inp_socket->so_state & SS_CANTRCVMORE)
> > @@ -171,41 +172,16 @@ rip_input(struct mbuf **mp, int *offp, i
> > if (inp->inp_faddr.s_addr &&
> > inp->inp_faddr.s_addr != ip->ip_src.s_addr)
> > continue;
> > -   if (last) {
> > -   struct mbuf *n;
> >  
> > -   if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
> > -   if (last->inp_flags & INP_CONTROLOPTS ||
> > -   last->inp_socket->so_options & SO_TIMESTAMP)
> > -   ip_savecontrol(last, , ip, n);
> > -   if (sbappendaddr(last->inp_socket,
> > -   >inp_socket->so_rcv,
> > -   sintosa(), n, opts) == 0) {
> > -   /* should notify about lost packet */
> > -   m_freem(n);
> > -   m_freem(opts);
> > -   } else
> > -   sorwakeup(last->inp_socket);
> > -   opts = NULL;
> > -   }
> > -   }
> > -   last = inp;
> > +   in_pcbref(inp);
> > +   SIMPLEQ_INSERT_TAIL(, inp, inp_notify);
> > }
> > mtx_leave(_mtx);
> >  
> > -   if (last) {
> > -   if (last->inp_flags & INP_CONTROLOPTS ||
> > -   last->inp_socket->so_options & SO_TIMESTAMP)
> > -   ip_savecontrol(last, , ip, m);
> > -   if (sbappendaddr(last->inp_socket, >inp_socket->so_rcv,
> > -   sintosa(), m, opts) == 0) {
> > -   m_freem(m);
> > -   m_freem(opts);

rip sbappendaddr() with inpcb table mutex

2022-03-22 Thread Alexander Bluhm
Hi,

syzkaller and witness found the same bug I introduced in UDP also
for Raw IP.  Fix it the same way for rip and rip6.

https://syzkaller.appspot.com/bug?extid=9bac6356a881dc644265
https://syzkaller.appspot.com/bug?extid=5b2679ee9be0895d26f9

ok?

bluhm

Index: netinet/raw_ip.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/raw_ip.c,v
retrieving revision 1.125
diff -u -p -r1.125 raw_ip.c
--- netinet/raw_ip.c21 Mar 2022 09:12:34 -  1.125
+++ netinet/raw_ip.c22 Mar 2022 12:59:05 -
@@ -122,9 +122,9 @@ rip_input(struct mbuf **mp, int *offp, i
 {
struct mbuf *m = *mp;
struct ip *ip = mtod(m, struct ip *);
-   struct inpcb *inp, *last = NULL;
+   struct inpcb *inp;
+   SIMPLEQ_HEAD(, inpcb) inpcblist;
struct in_addr *key;
-   struct mbuf *opts = NULL;
struct counters_ref ref;
uint64_t *counters;
 
@@ -150,7 +150,8 @@ rip_input(struct mbuf **mp, int *offp, i
}
}
 #endif
-   NET_ASSERT_LOCKED();
+   NET_ASSERT_WLOCKED();
+   SIMPLEQ_INIT(&inpcblist);
mtx_enter(&rawcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
@@ -171,41 +172,16 @@ rip_input(struct mbuf **mp, int *offp, i
if (inp->inp_faddr.s_addr &&
inp->inp_faddr.s_addr != ip->ip_src.s_addr)
continue;
-   if (last) {
-   struct mbuf *n;
 
-   if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
-   if (last->inp_flags & INP_CONTROLOPTS ||
-   last->inp_socket->so_options & SO_TIMESTAMP)
-   ip_savecontrol(last, &opts, ip, n);
-   if (sbappendaddr(last->inp_socket,
-   &last->inp_socket->so_rcv,
-   sintosa(&ripsrc), n, opts) == 0) {
-   /* should notify about lost packet */
-   m_freem(n);
-   m_freem(opts);
-   } else
-   sorwakeup(last->inp_socket);
-   opts = NULL;
-   }
-   }
-   last = inp;
+   in_pcbref(inp);
+   SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
}
mtx_leave(_mtx);
 
-   if (last) {
-   if (last->inp_flags & INP_CONTROLOPTS ||
-   last->inp_socket->so_options & SO_TIMESTAMP)
-   ip_savecontrol(last, &opts, ip, m);
-   if (sbappendaddr(last->inp_socket, &last->inp_socket->so_rcv,
-   sintosa(&ripsrc), m, opts) == 0) {
-   m_freem(m);
-   m_freem(opts);
-   } else
-   sorwakeup(last->inp_socket);
-   } else {
+   if (SIMPLEQ_EMPTY(&inpcblist)) {
if (ip->ip_p != IPPROTO_ICMP)
-   icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 
0);
+   icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
+   0, 0);
else
m_freem(m);
 
@@ -213,6 +189,30 @@ rip_input(struct mbuf **mp, int *offp, i
counters[ips_noproto]++;
counters[ips_delivered]--;
counters_leave(, ipcounters);
+   }
+
+   while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
+   struct mbuf *n, *opts = NULL;
+
+   SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+   if (SIMPLEQ_EMPTY(&inpcblist))
+   n = m;
+   else
+   n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+   if (n != NULL) {
+   if (inp->inp_flags & INP_CONTROLOPTS ||
+   inp->inp_socket->so_options & SO_TIMESTAMP)
+   ip_savecontrol(inp, &opts, ip, n);
+   if (sbappendaddr(inp->inp_socket,
+   &inp->inp_socket->so_rcv,
+   sintosa(&ripsrc), n, opts) == 0) {
+   /* should notify about lost packet */
+   m_freem(n);
+   m_freem(opts);
+   } else
+   sorwakeup(inp->inp_socket);
+   }
+   in_pcbunref(inp);
}
return IPPROTO_DONE;
 }
Index: netinet6/raw_ip6.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/raw_ip6.c,v
retrieving revision 1.145
diff -u -p -r1.145 raw_ip6.c
--- netinet6/raw_ip6.c  21 Mar 2022 09:12:34 -  1.145
+++ netinet6/raw_ip6.c  22 Mar 2022 12:59:05 -
@@ -121,10 +121,9 @@ 

OpenBSD Errata: March 22, 2022 (slaacd)

2022-03-21 Thread Alexander Bluhm
Errata patches for slaacd have been released for OpenBSD 6.9 and
7.0.

Binary updates for the amd64, i386 and arm64 platform are available
via the syspatch utility.  Source code patches can be found on the
respective errata page:

  https://www.openbsd.org/errata69.html
  https://www.openbsd.org/errata70.html



udp_sbappend() with inpcb table mutex

2022-03-21 Thread Alexander Bluhm
Hi,

syzkaller and witness found a bug in my pcb table mutex commit.

https://syzkaller.appspot.com/bug?id=90a4811c99d6a2df7b252971b754612ca632894d

For multicast and broadcast packets udp_input() traverses the loop
of all UDP PCBs.  There it calls udp_sbappend() while holding the
UDB table mutex.  This results in sorwakeup() and finally kernel
lock while holding a mutex.

I use the same solution as for PCB notify.  Collect the affected
PCBs in a temporary list.  This list is protected by exclusive net
lock.  When we unlock the protocol layer this has to be reconsidered.
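
The pattern, condensed from the diff below (match() and deliver() are
placeholders):

	SIMPLEQ_INIT(&inpcblist);
	mtx_enter(&table->inpt_mtx);
	TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
		if (!match(inp))
			continue;
		in_pcbref(inp);
		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
	}
	mtx_leave(&table->inpt_mtx);

	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
		deliver(inp, m);	/* may sorwakeup() and take kernel lock */
		in_pcbunref(inp);
	}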

The loop for raw sockets is on my todo list.

ok?

bluhm

Index: netinet/in_pcb.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.127
diff -u -p -r1.127 in_pcb.h
--- netinet/in_pcb.h21 Mar 2022 09:12:34 -  1.127
+++ netinet/in_pcb.h21 Mar 2022 14:52:03 -
@@ -102,7 +102,7 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* [t] local and foreign hash */
LIST_ENTRY(inpcb) inp_lhash;/* [t] local port hash */
TAILQ_ENTRY(inpcb) inp_queue;   /* [t] inet PCB queue */
-   SIMPLEQ_ENTRY(inpcb) inp_notify;/* [N] queue to notify PCB */
+   SIMPLEQ_ENTRY(inpcb) inp_notify;/* [N] notify or udp append */
structinpcbtable *inp_table;/* [I] inet queue/hash table */
union inpaddru inp_faddru;  /* Foreign address. */
union inpaddru inp_laddru;  /* Local address. */
Index: netinet/udp_usrreq.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
retrieving revision 1.275
diff -u -p -r1.275 udp_usrreq.c
--- netinet/udp_usrreq.c21 Mar 2022 09:12:34 -  1.275
+++ netinet/udp_usrreq.c21 Mar 2022 14:52:55 -
@@ -342,7 +342,8 @@ udp_input(struct mbuf **mp, int *offp, i
}
 
if (m->m_flags & (M_BCAST|M_MCAST)) {
-   struct inpcb *last;
+   SIMPLEQ_HEAD(, inpcb) inpcblist;
+
/*
 * Deliver a multicast or broadcast datagram to *all* sockets
 * for which the local and remote addresses and ports match
@@ -363,8 +364,8 @@ udp_input(struct mbuf **mp, int *offp, i
 * Locate pcb(s) for datagram.
 * (Algorithm copied from raw_intr().)
 */
-   last = NULL;
-   NET_ASSERT_LOCKED();
+   NET_ASSERT_WLOCKED();
+   SIMPLEQ_INIT(&inpcblist);
mtx_enter(&udbtable.inpt_mtx);
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_socket->so_state & SS_CANTRCVMORE)
@@ -419,16 +420,9 @@ udp_input(struct mbuf **mp, int *offp, i
continue;
}
 
-   if (last != NULL) {
-   struct mbuf *n;
+   in_pcbref(inp);
+   SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
 
-   n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-   if (n != NULL) {
-   udp_sbappend(last, n, ip, ip6, iphlen,
-   uh, &srcsa.sa, 0);
-   }
-   }
-   last = inp;
/*
 * Don't look for additional matches if this one does
 * not have either the SO_REUSEPORT or SO_REUSEADDR
@@ -437,13 +431,13 @@ udp_input(struct mbuf **mp, int *offp, i
 * port.  It assumes that an application will never
 * clear these options after setting them.
 */
-   if ((last->inp_socket->so_options & (SO_REUSEPORT |
+   if ((inp->inp_socket->so_options & (SO_REUSEPORT |
SO_REUSEADDR)) == 0)
break;
}
mtx_leave(&udbtable.inpt_mtx);
 
-   if (last == NULL) {
+   if (SIMPLEQ_EMPTY(&inpcblist)) {
/*
 * No matching pcb found; discard datagram.
 * (No need to send an ICMP Port Unreachable
@@ -453,7 +447,20 @@ udp_input(struct mbuf **mp, int *offp, i
goto bad;
}
 
-   udp_sbappend(last, m, ip, ip6, iphlen, uh, &srcsa.sa, 0);
+   while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
+   struct mbuf *n;
+
+   SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
+   if (SIMPLEQ_EMPTY(&inpcblist))
+   n = m;
+   else
+   n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+   if (n != 

Re: have in_pcbselsrc copy the selected ip to the caller instead of a reference to it

2022-03-21 Thread Alexander Bluhm
On Mon, Mar 21, 2022 at 11:13:24AM +0100, Claudio Jeker wrote:
> On Mon, Mar 21, 2022 at 02:17:21PM +1000, David Gwynne wrote:
> > in_pcbselsrc has this:
> > 
> > ifp = if_get(mopts->imo_ifidx);
> > if (ifp != NULL) {
> > if (ifp->if_rdomain == rtable_l2(rtableid))
> > IFP_TO_IA(ifp, ia);
> > if (ia == NULL) {
> > if_put(ifp);
> > return (EADDRNOTAVAIL);
> > }
> > 
> > *insrc = ia->ia_addr.sin_addr;
> > if_put(ifp);
> > return (0);
> > }
> > 
> > which looks very much like it releases a reference to the interface
> > holding the address it's passing back to the caller to use.
> 
> This seems indeed to be an issue.
>  
> > this diff has it copy the address to memory the caller provides instead.
> > 
> > ok?
> 
> I think it makes the code overall a bit simpler.
> OK claudio@
> 
> In in_pcbselsrc() you could even eliminate laddr and just replace it with
> inp->inp_laddr. Or assign to laddr instead of making it a pointer.

I would replace laddr with inp->inp_laddr.
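
Concretely, the first hunk would then read (a sketch):

	if (inp->inp_laddr.s_addr != INADDR_ANY &&
	    inp->inp_laddr.s_addr != INADDR_BROADCAST) {
		*insrc = inp->inp_laddr;
		return (0);
	}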

OK bluhm@

> > Index: in_pcb.c
> > ===
> > RCS file: /cvs/src/sys/netinet/in_pcb.c,v
> > retrieving revision 1.262
> > diff -u -p -r1.262 in_pcb.c
> > --- in_pcb.c21 Mar 2022 03:51:09 -  1.262
> > +++ in_pcb.c21 Mar 2022 04:10:24 -
> > @@ -476,7 +476,7 @@ in_pcbpickport(u_int16_t *lport, void *l
> >  int
> >  in_pcbconnect(struct inpcb *inp, struct mbuf *nam)
> >  {
> > -   struct in_addr *ina = NULL;
> > +   struct in_addr ina;
> > struct sockaddr_in *sin;
> > int error;
> >  
> > @@ -495,7 +495,7 @@ in_pcbconnect(struct inpcb *inp, struct 
> > return (error);
> >  
> > if (in_pcbhashlookup(inp->inp_table, sin->sin_addr, sin->sin_port,
> > -   *ina, inp->inp_lport, inp->inp_rtableid) != NULL)
> > +   ina, inp->inp_lport, inp->inp_rtableid) != NULL)
> > return (EADDRINUSE);
> >  
> > KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport);
> > @@ -506,13 +506,13 @@ in_pcbconnect(struct inpcb *inp, struct 
> > if (error)
> > return (error);
> > if (in_pcbhashlookup(inp->inp_table, sin->sin_addr,
> > -   sin->sin_port, *ina, inp->inp_lport,
> > +   sin->sin_port, ina, inp->inp_lport,
> > inp->inp_rtableid) != NULL) {
> > inp->inp_lport = 0;
> > return (EADDRINUSE);
> > }
> > }
> > -   inp->inp_laddr = *ina;
> > +   inp->inp_laddr = ina;
> > }
> > inp->inp_faddr = sin->sin_addr;
> > inp->inp_fport = sin->sin_port;
> > @@ -870,7 +870,7 @@ in_pcbrtentry(struct inpcb *inp)
> >   * an entry to the caller for later use.
> >   */
> >  int
> > -in_pcbselsrc(struct in_addr **insrc, struct sockaddr_in *sin,
> > +in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin,
> >  struct inpcb *inp)
> >  {
> > struct ip_moptions *mopts = inp->inp_moptions;
> > @@ -886,9 +886,9 @@ in_pcbselsrc(struct in_addr **insrc, str
> >  * If the socket(if any) is already bound, use that bound address
> >  * unless it is INADDR_ANY or INADDR_BROADCAST.
> >  */
> > -   if (laddr && laddr->s_addr != INADDR_ANY &&
> > +   if (laddr->s_addr != INADDR_ANY &&
> > laddr->s_addr != INADDR_BROADCAST) {
> > -   *insrc = laddr;
> > +   *insrc = *laddr;
> > return (0);
> > }
> >  
> > @@ -911,7 +911,7 @@ in_pcbselsrc(struct in_addr **insrc, str
> > return (EADDRNOTAVAIL);
> > }
> >  
> > -   *insrc = &ia->ia_addr.sin_addr;
> > +   *insrc = ia->ia_addr.sin_addr;
> > if_put(ifp);
> > return (0);
> > }
> > @@ -962,7 +962,7 @@ in_pcbselsrc(struct in_addr **insrc, str
> > struct ifaddr *ifa;
> > if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) !=
> > NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) {
> > -   *insrc = &satosin(ip4_source)->sin_addr;
> > +   *insrc = satosin(ip4_source)->sin_addr;
> > return (0);
> > }
> > }
> > @@ -971,7 +971,7 @@ in_pcbselsrc(struct in_addr **insrc, str
> > if (ia == NULL)
> > return (EADDRNOTAVAIL);
> >  
> > -   *insrc = &ia->ia_addr.sin_addr;
> > +   *insrc = ia->ia_addr.sin_addr;
> > return (0);
> >  }
> >  
> > Index: in_pcb.h
> > ===
> > RCS file: /cvs/src/sys/netinet/in_pcb.h,v
> > retrieving revision 1.125
> 

Re: pcb mutex userland

2022-03-18 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 12:47:15AM +0100, Alexander Bluhm wrote:
> My previous attempt to add a mutex to in_pcb.h was reverted as it
> broke the userland build.

This diff passes make release and regress test.

I would like to commit it to proceed with pcb locking.
Generally fixing sysctl includes is out of my scope.

ok?

bluhm

Index: sys/netinet/in_pcb.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.125
diff -u -p -r1.125 in_pcb.h
--- sys/netinet/in_pcb.h14 Mar 2022 22:38:43 -  1.125
+++ sys/netinet/in_pcb.h18 Mar 2022 16:17:54 -
@@ -65,6 +65,7 @@
 #define _NETINET_IN_PCB_H_
 
#include <sys/queue.h>
+#include <sys/mutex.h>
 #include 
 #include 
 #include 



Re: refcount btrace

2022-03-18 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 07:25:27AM +, Visa Hankala wrote:
> On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> > I would like to use btrace to debug reference counting.  The idea
> > is to add a tracepoint for every type of refcnt we have.  When it
> > changes, print the actual object, the current counter and the change
> > value.
> 
> > Do we want that feature?
> 
> I am against this in its current form. The code would become more
> complex, and the trace points can affect timing. There is a risk that
> the kernel behaves slightly differently when dt has been compiled in.

On our main architectures dt(4) is in GENERIC.  I see your timing
point for uvm structures.

What do you think about this?  The check starts with a
__predict_false(index > 0) in #define DT_INDEX_ENTER.  The r_traceidx
is very likely in the same cache line as r_refs.  So the additional
overhead of the branch should be small compared to the atomic
operation.  The __predict_false(dt_tracing) might take longer as
it is a global variable.
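
For reference, the layout this argument assumes (the exact types are
my guess):

	struct refcnt {
		u_int	r_refs;		/* atomic reference counter */
		int	r_traceidx;	/* dt(4) probe index, 0 means off */
	};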

Default is not to trace refcnt.  But I would like to have it for
network objects.  For sending network packets the additional branch
instruction depending on a global variable does not count.

bluhm

Index: dev/dt/dt_prov_static.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.13
diff -u -p -r1.13 dt_prov_static.c
--- dev/dt/dt_prov_static.c 17 Mar 2022 14:53:59 -  1.13
+++ dev/dt/dt_prov_static.c 18 Mar 2022 20:35:02 -
@@ -2,6 +2,7 @@
 
 /*
  * Copyright (c) 2019 Martin Pieuchot 
+ * Copyright (c) 2022 Alexander Bluhm 
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -87,6 +88,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
 DT_STATIC_PROBE0(smr, wakeup);
 DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
 
+/*
+ * reference counting
+ */
+DT_STATIC_PROBE0(refcnt, none);
+DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
+DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
 
 /*
  * List of all static probes
@@ -127,15 +134,24 @@ struct dt_probe *const dtps_static[] = {
&_DT_STATIC_P(smr, barrier_exit),
&_DT_STATIC_P(smr, wakeup),
&_DT_STATIC_P(smr, thread),
+   /* refcnt */
+   &_DT_STATIC_P(refcnt, none),
+   &_DT_STATIC_P(refcnt, inpcb),
+   &_DT_STATIC_P(refcnt, tdb),
 };
 
+struct dt_probe *const *dtps_index_refcnt;
+
 int
 dt_prov_static_init(void)
 {
int i;
 
-   for (i = 0; i < nitems(dtps_static); i++)
+   for (i = 0; i < nitems(dtps_static); i++) {
+   if (dtps_static[i] == &_DT_STATIC_P(refcnt, none))
+   dtps_index_refcnt = _static[i];
dt_dev_register_probe(dtps_static[i]);
+   }
 
return i;
 }
Index: dev/dt/dtvar.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 dtvar.h
--- dev/dt/dtvar.h  27 Feb 2022 10:14:01 -  1.13
+++ dev/dt/dtvar.h  18 Mar 2022 20:58:28 -
@@ -2,6 +2,7 @@
 
 /*
  * Copyright (c) 2019 Martin Pieuchot 
+ * Copyright (c) 2022 Alexander Bluhm 
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -313,11 +314,30 @@ extern volatile uint32_t  dt_tracing; /* 
 #defineDT_STATIC_ENTER(func, name, args...) do {   
\
extern struct dt_probe _DT_STATIC_P(func, name);\
struct dt_probe *dtp = &_DT_STATIC_P(func, name);   \
-   struct dt_provider *dtpv = dtp->dtp_prov;   \
\
if (__predict_false(dt_tracing) &&  \
__predict_false(dtp->dtp_recording)) {  \
+   struct dt_provider *dtpv = dtp->dtp_prov;   \
+   \
dtpv->dtpv_enter(dtpv, dtp, args);  \
+   }   \
+} while (0)
+
+#define _DT_INDEX_P(func)  (dtps_index_##func)
+
+#define DT_INDEX_ENTER(func, index, args...) do {  \
+   extern struct dt_probe **_DT_INDEX_P(func); \
+   \
+   if (__predict_false(index > 0) &&   \
+

Re: refcount btrace

2022-03-17 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 07:25:27AM +, Visa Hankala wrote:
> On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> > I would like to use btrace to debug reference counting.  The idea
> > is to add a tracepoint for every type of refcnt we have.  When it
> > changes, print the actual object, the current counter and the change
> > value.
> 
> > Do we want that feature?
> 
> I am against this in its current form. The code would become more
> complex, and the trace points can affect timing. There is a risk that
> the kernel behaves slightly differently when dt has been compiled in.

Can we get in this part then?

- Remove DIAGNOSTIC to keep the code similar in the non-DIAGNOSTIC case.
- Rename refcnt to refs.  refcnt is the struct, refs contains the
  r_refs value.
- Add KASSERT(refs != ~0) in refcnt_finalize().
- Always use u_int refs so I can insert my btrace diff easily.

Maybe I can optimize btrace diff later.
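
For reference, a consumer of the cleaned up API reads like this sketch
(modeled on the plimit conversion discussed elsewhere on the list):

	refcnt_init(&lim->pl_refcnt);		/* creation, counter starts at 1 */
	refcnt_take(&lim->pl_refcnt);		/* additional reference */
	if (refcnt_rele(&lim->pl_refcnt))	/* nonzero when it drops to 0 */
		pool_put(&plimit_pool, lim);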

bluhm

Index: kern/kern_synch.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.184
diff -u -p -r1.184 kern_synch.c
--- kern/kern_synch.c   16 Mar 2022 14:13:01 -  1.184
+++ kern/kern_synch.c   17 Mar 2022 16:12:50 -
@@ -810,25 +810,21 @@ refcnt_init(struct refcnt *r)
 void
 refcnt_take(struct refcnt *r)
 {
-#ifdef DIAGNOSTIC
-   u_int refcnt;
+   u_int refs;
 
-   refcnt = atomic_inc_int_nv(&r->r_refs);
-   KASSERT(refcnt != 0);
-#else
-   atomic_inc_int(&r->r_refs);
-#endif
+   refs = atomic_inc_int_nv(&r->r_refs);
+   KASSERT(refs != 0);
+   (void)refs;
 }
 
 int
 refcnt_rele(struct refcnt *r)
 {
-   u_int refcnt;
+   u_int refs;
 
-   refcnt = atomic_dec_int_nv(&r->r_refs);
-   KASSERT(refcnt != ~0);
-
-   return (refcnt == 0);
+   refs = atomic_dec_int_nv(&r->r_refs);
+   KASSERT(refs != ~0);
+   return (refs == 0);
 }
 
 void
@@ -842,26 +838,33 @@ void
 refcnt_finalize(struct refcnt *r, const char *wmesg)
 {
struct sleep_state sls;
-   u_int refcnt;
+   u_int refs;
 
-   refcnt = atomic_dec_int_nv(&r->r_refs);
-   while (refcnt) {
+   refs = atomic_dec_int_nv(&r->r_refs);
+   KASSERT(refs != ~0);
+   while (refs) {
sleep_setup(&sls, r, PWAIT, wmesg, 0);
-   refcnt = atomic_load_int(&r->r_refs);
-   sleep_finish(&sls, refcnt);
+   refs = atomic_load_int(&r->r_refs);
+   sleep_finish(&sls, refs);
}
 }
 
 int
 refcnt_shared(struct refcnt *r)
 {
-   return (atomic_load_int(&r->r_refs) > 1);
+   u_int refs;
+
+   refs = atomic_load_int(&r->r_refs);
+   return (refs > 1);
 }
 
 unsigned int
 refcnt_read(struct refcnt *r)
 {
-   return (atomic_load_int(&r->r_refs));
+   u_int refs;
+
+   refs = atomic_load_int(&r->r_refs);
+   return (refs);
 }
 
 void



Re: Use refcnt API with struct plimit

2022-03-17 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 04:07:24PM +, Visa Hankala wrote:
> Use the refcnt API with struct plimit.
> 
> OK?

OK bluhm@

> Index: kern/kern_resource.c
> ===
> RCS file: src/sys/kern/kern_resource.c,v
> retrieving revision 1.71
> diff -u -p -r1.71 kern_resource.c
> --- kern/kern_resource.c  8 Feb 2021 10:51:01 -   1.71
> +++ kern/kern_resource.c  17 Mar 2022 15:59:52 -
> @@ -582,7 +582,7 @@ lim_startup(struct plimit *limit0)
>   limit0->pl_rlimit[RLIMIT_RSS].rlim_max = lim;
>   limit0->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = lim;
>   limit0->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
> - limit0->pl_refcnt = 1;
> + refcnt_init(&limit0->pl_refcnt);
>  }
>  
>  /*
> @@ -598,14 +598,14 @@ lim_copy(struct plimit *lim)
> newlim = pool_get(&plimit_pool, PR_WAITOK);
>   memcpy(newlim->pl_rlimit, lim->pl_rlimit,
>   sizeof(struct rlimit) * RLIM_NLIMITS);
> - newlim->pl_refcnt = 1;
> + refcnt_init(&newlim->pl_refcnt);
>   return (newlim);
>  }
>  
>  void
>  lim_free(struct plimit *lim)
>  {
> - if (atomic_dec_int_nv(&lim->pl_refcnt) > 0)
> + if (refcnt_rele(&lim->pl_refcnt) == 0)
>   return;
>   pool_put(&plimit_pool, lim);
>  }
> @@ -617,7 +617,7 @@ lim_fork(struct process *parent, struct 
>  
>   mtx_enter(&parent->ps_mtx);
>   limit = parent->ps_limit;
> - atomic_inc_int(&limit->pl_refcnt);
> + refcnt_take(&limit->pl_refcnt);
>   mtx_leave(&parent->ps_mtx);
>  
>   child->ps_limit = limit;
> @@ -650,7 +650,7 @@ lim_write_begin(void)
>*/
>  
>   limit = p->p_p->ps_limit;
> - if (P_HASSIBLING(p) || limit->pl_refcnt > 1)
> + if (P_HASSIBLING(p) || refcnt_shared(&limit->pl_refcnt))
>   limit = lim_copy(limit);
>  
>   return (limit);
> @@ -703,7 +703,7 @@ lim_read_enter(void)
>   if (limit != pr->ps_limit) {
> mtx_enter(&pr->ps_mtx);
> limit = pr->ps_limit;
> - atomic_inc_int(&limit->pl_refcnt);
> + refcnt_take(&limit->pl_refcnt);
> mtx_leave(&pr->ps_mtx);
>   if (p->p_limit != NULL)
>   lim_free(p->p_limit);
> Index: sys/resourcevar.h
> ===
> RCS file: src/sys/sys/resourcevar.h,v
> retrieving revision 1.24
> diff -u -p -r1.24 resourcevar.h
> --- sys/resourcevar.h 21 Jun 2019 09:39:48 -  1.24
> +++ sys/resourcevar.h 17 Mar 2022 15:59:52 -
> @@ -35,6 +35,7 @@
>  #ifndef  _SYS_RESOURCEVAR_H_
>  #define  _SYS_RESOURCEVAR_H_
>  
> +#include <sys/refcnt.h>
>  #include 
>  
>  /*
> @@ -44,7 +45,7 @@
>   */
>  struct plimit {
>   struct  rlimit pl_rlimit[RLIM_NLIMITS];
> - u_int   pl_refcnt;  /* number of references */
> + struct  refcnt pl_refcnt;
>  };
>  
>  /* add user profiling from AST */



Re: pcb mutex userland

2022-03-17 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 02:09:39PM +0100, Mark Kettenis wrote:
> I fear the fundamental problem is that we should not expose data
> structures internal to the kernel to userland.  What I don't
> understand though is how that happens.  The sysctl code doesn't seem
> to export "struct inpcb" instances directly, but instead it exports
> selected members through "struct kinfo_file".  So why is "struct
> inpcb" exposed to userland at all?

A few tools use it.  One thing is post mortem analysis of kernel
core dumps.  Sometimes I get dumps sent from customers.  They don't
have WITNESS.

Some traditional network debugging tools use these structures.

lib/libkvm
sbin/sysctl
usr.bin/netstat
usr.bin/tcpbench
usr.sbin/trpt

As you need sysctl kern.allowkmem=1 for them, this is only useful
on debugging machines.  Of course they can be rewritten using sysctl.
A drawback is that you have to write a lot of copy code, and post
mortem analysis code gets out of sync.  I see tools with -M -N
breaking over time.

The question is how to proceed.  For MP in the network stack I need
mutexes in these structs.  And I don't want to rewrite all tools before
making progress.

bluhm

> > Index: sys/netinet/in_pcb.h
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> > retrieving revision 1.125
> > diff -u -p -r1.125 in_pcb.h
> > --- sys/netinet/in_pcb.h14 Mar 2022 22:38:43 -  1.125
> > +++ sys/netinet/in_pcb.h17 Mar 2022 00:44:54 -
> > @@ -65,6 +65,7 @@
> >  #define _NETINET_IN_PCB_H_
> >  
> >  #include 
> > +#include <sys/mutex.h>
> >  #include 
> >  #include 
> >  #include 
> > Index: sys/sys/mutex.h
> > ===
> > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mutex.h,v
> > retrieving revision 1.18
> > diff -u -p -r1.18 mutex.h
> > --- sys/sys/mutex.h 23 Apr 2019 13:35:12 -  1.18
> > +++ sys/sys/mutex.h 17 Mar 2022 00:44:23 -
> > @@ -48,6 +48,8 @@ struct mutex {
> >  #endif
> >  };
> >  
> > +#ifdef _KERNEL
> > +
> >  /*
> >   * To prevent lock ordering problems with the kernel lock, we need to
> >   * make sure we block all interrupts that can grab the kernel lock.
> > @@ -148,7 +150,7 @@ void _mtx_init_flags(struct mutex *, int
> >  
> >  #endif /* WITNESS */
> >  
> > -#if defined(_KERNEL) && defined(DDB)
> > +#ifdef DDB
> >  
> >  struct db_mutex {
> > struct cpu_info *mtx_owner;
> > @@ -160,6 +162,8 @@ struct db_mutex {
> >  void   db_mtx_enter(struct db_mutex *);
> >  void   db_mtx_leave(struct db_mutex *);
> >  
> > -#endif /* _KERNEL && DDB */
> > +#endif /* DDB */
> > +
> > +#endif /* _KERNEL */
> >  
> >  #endif
> > 
> > 



Re: pcb mutex userland

2022-03-17 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 08:24:10AM +0100, Claudio Jeker wrote:
> On Thu, Mar 17, 2022 at 12:47:15AM +0100, Alexander Bluhm wrote:
> > Hi,
> > 
> > My previous attempt to add a mutex to in_pcb.h was reverted as it
> > broke userland build.
> > 
> > Is the correct fix to include sys/mutex.h in every .c file that
> > includes netinet/in_pcb.h ?  I made a release with it.
> > Or should I include sys/mutex.h in netinet/in_pcb.h ?
> 
> I would add sys/mutex.h in netinet/in_pcb.h. We do the same in other
> headers like sys/proc.h etc.

This survived make release.  It is similar to what we do in sys/proc.h
as suggested by claudio@ and has more #ifdef _KERNEL to please
kettenis@.

ok?

bluhm

Index: sys/netinet/in_pcb.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.125
diff -u -p -r1.125 in_pcb.h
--- sys/netinet/in_pcb.h14 Mar 2022 22:38:43 -  1.125
+++ sys/netinet/in_pcb.h17 Mar 2022 00:44:54 -
@@ -65,6 +65,7 @@
 #define _NETINET_IN_PCB_H_
 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #include 
Index: sys/sys/mutex.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mutex.h,v
retrieving revision 1.18
diff -u -p -r1.18 mutex.h
--- sys/sys/mutex.h 23 Apr 2019 13:35:12 -  1.18
+++ sys/sys/mutex.h 17 Mar 2022 00:44:23 -
@@ -48,6 +48,8 @@ struct mutex {
 #endif
 };
 
+#ifdef _KERNEL
+
 /*
  * To prevent lock ordering problems with the kernel lock, we need to
  * make sure we block all interrupts that can grab the kernel lock.
@@ -148,7 +150,7 @@ void _mtx_init_flags(struct mutex *, int
 
 #endif /* WITNESS */
 
-#if defined(_KERNEL) && defined(DDB)
+#ifdef DDB
 
 struct db_mutex {
struct cpu_info *mtx_owner;
@@ -160,6 +162,8 @@ struct db_mutex {
 void   db_mtx_enter(struct db_mutex *);
 void   db_mtx_leave(struct db_mutex *);
 
-#endif /* _KERNEL && DDB */
+#endif /* DDB */
+
+#endif /* _KERNEL */
 
 #endif



Re: Remove data dependency barrier from atomic_load_*

2022-03-17 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 07:12:16AM +, Visa Hankala wrote:
> On Wed, Mar 16, 2022 at 11:09:12PM +0100, Alexander Bluhm wrote:
> > On Tue, Mar 15, 2022 at 09:15:34AM +, Visa Hankala wrote:
> > > However, some DEC Alpha CPUs have their data caches divided into cache
> > > banks to improve bandwidth. These cache banks are relatively
> > > independent. The system maintains coherency, but bus contention can
> > > delay propagation of cache updates. If the loads spanned different cache
> > > banks, the second load could deliver data which is older than the
> > > initial load's value. The data dependency barrier causes an interlock
> > > with cache updating, ensuring causal ordering.)
> > 
> > The code with the membar is copied from READ_ONCE() which is copied
> > from Linux.  The membar_datadep_consumer() has an #ifdef __alpha__
> > in it.  It is only used for that case.  I don't know whether we
> > want to support such a CPU.  But if that is the case, we need the
> > membar.
> 
> Whether the membar is necessary or not depends on the use case.
> READ_ONCE(), and SMR_PTR_GET(), have it built in so that loaded
> pointers would work in the expected way in lockless contexts. This
> is intentional, the membar has not been just copied there.

With that explanation OK bluhm@ to remove the membar.
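
The pattern under discussion, sketched with a hypothetical
global_foo pointer and foo_data member:

/* publish side */
f->foo_data = v;
membar_producer();		/* data store visible before pointer store */
global_foo = f;

/* consume side */
f = READ_ONCE(global_foo);	/* data dependency barrier built in */
v = f->foo_data;		/* without it, the affected Alphas could
				 * still see stale foo_data here */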



Re: pcb mutex userland

2022-03-16 Thread Alexander Bluhm
On Thu, Mar 17, 2022 at 01:07:12AM +0100, Mark Kettenis wrote:
> > Date: Thu, 17 Mar 2022 01:01:46 +0100 (CET)
> > From: Mark Kettenis 
> > 
> > > Date: Thu, 17 Mar 2022 00:47:15 +0100
> > > From: Alexander Bluhm 
> > > 
> > > Hi,
> > > 
> > > My previous attempt to add a mutex to in_pcb.h was reverted as it
> > > broke userland build.
> > > 
> > > Is the correct fix to include sys/mutex.h in every .c file that
> > > includes netinet/in_pcb.h ?  I made a release with it.
> > > Or should I include sys/mutex.h in netinet/in_pcb.h ?
> > 
> > Neither?
> > 
> > It makes no sense to export the kernel mutex stuff to userland.  Is
> > there a way to avoid doing that by adding a bit for #ifdef _KERNEL?
> ^
>   a bit more

My diff adds struct mutex to struct inpcbtable.  My later plan is
to add a mutex also to struct inpcb.

tcpbench uses libkvm to extract information from struct inpcbtable.
netstat does that for struct inpcb.  Also post mortem analysis from
a kernel core dump is possible.

I don't understand why userland must not know the size of struct
mutex when tools were written to analyze these structs.
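
The kvm consumers copy the struct out of the kernel more or less
like this.  A sketch with error handling trimmed; addr is the
address of the "_tcbtable" symbol as found via kvm_nlist():

struct inpcbtable table;

if (kvm_read(kd, addr, &table, sizeof(table)) != sizeof(table))
	errx(1, "kvm_read: %s", kvm_geterr(kd));

If kernel and userland disagree about sizeof(struct inpcbtable),
this copy is silently wrong.  So the tools have to see the real
struct layout, including the mutex.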

Is there something special about struct mutex that should not be shown
to userland?

Do you like this?  Different structs for kernel and userland.
I think this is confusing when used with libkvm.

struct inpcbtable {
TAILQ_HEAD(inpthead, inpcb) inpt_queue; /* [t] inet PCB queue */
struct  inpcbhead *inpt_hashtbl;/* [t] local and foreign hash */
struct  inpcbhead *inpt_lhashtbl;   /* [t] local port hash */
SIPHASH_KEY inpt_key, inpt_lkey;/* [t] secrets for hashes */
u_long  inpt_mask, inpt_lmask;  /* [t] hash masks */
int inpt_count, inpt_size;  /* [t] queue count, hash size */
#ifdef _KERNEL
struct mutex inpt_mtx;  /* protect queue and hash */
#endif
};

And we have code like this in lib/libkvm/kvm_file2.c

#define _KERNEL
#include 
#include 
#undef _KERNEL

Or can we include a minimal non-kernel version of sys/mutex.h
everywhere?  (not tested yet)

bluhm

Index: netinet/in_pcb.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.125
diff -u -p -r1.125 in_pcb.h
--- netinet/in_pcb.h14 Mar 2022 22:38:43 -  1.125
+++ netinet/in_pcb.h17 Mar 2022 00:44:54 -
@@ -65,6 +65,7 @@
 #define _NETINET_IN_PCB_H_
 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #include 
Index: sys/mutex.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mutex.h,v
retrieving revision 1.18
diff -u -p -r1.18 mutex.h
--- sys/mutex.h 23 Apr 2019 13:35:12 -  1.18
+++ sys/mutex.h 17 Mar 2022 00:44:23 -
@@ -48,6 +48,8 @@ struct mutex {
 #endif
 };
 
+#ifdef _KERNEL
+
 /*
  * To prevent lock ordering problems with the kernel lock, we need to
  * make sure we block all interrupts that can grab the kernel lock.
@@ -148,7 +150,7 @@ void _mtx_init_flags(struct mutex *, int
 
 #endif /* WITNESS */
 
-#if defined(_KERNEL) && defined(DDB)
+#ifdef DDB
 
 struct db_mutex {
struct cpu_info *mtx_owner;
@@ -160,6 +162,8 @@ struct db_mutex {
 void   db_mtx_enter(struct db_mutex *);
 void   db_mtx_leave(struct db_mutex *);
 
-#endif /* _KERNEL && DDB */
+#endif /* DDB */
+
+#endif /* _KERNEL */
 
 #endif



pcb mutex userland

2022-03-16 Thread Alexander Bluhm
Hi,

My previous attempt to add a mutex to in_pcb.h was reverted as it
broke userland build.

Is the correct fix to include sys/mutex.h in every .c file that
includes netinet/in_pcb.h ?  I made a release with it.
Or should I include sys/mutex.h in netinet/in_pcb.h ?

ok?

bluhm

Index: lib/libkvm/kvm_file2.c
===
RCS file: /data/mirror/openbsd/cvs/src/lib/libkvm/kvm_file2.c,v
retrieving revision 1.57
diff -u -p -r1.57 kvm_file2.c
--- lib/libkvm/kvm_file2.c  22 Feb 2022 17:35:01 -  1.57
+++ lib/libkvm/kvm_file2.c  16 Mar 2022 16:42:15 -
@@ -74,6 +74,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #include 
Index: sbin/sysctl/sysctl.c
===
RCS file: /data/mirror/openbsd/cvs/src/sbin/sysctl/sysctl.c,v
retrieving revision 1.258
diff -u -p -r1.258 sysctl.c
--- sbin/sysctl/sysctl.c12 Jul 2021 15:09:19 -  1.258
+++ sbin/sysctl/sysctl.c15 Mar 2022 09:18:31 -
@@ -42,9 +42,11 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #include 
+
 #include 
 #include 
 
Index: usr.bin/netstat/inet.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
retrieving revision 1.173
diff -u -p -r1.173 inet.c
--- usr.bin/netstat/inet.c  5 Dec 2021 22:36:19 -   1.173
+++ usr.bin/netstat/inet.c  16 Mar 2022 16:44:32 -
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #define _KERNEL
@@ -41,6 +42,7 @@
 #undef _KERNEL
 
 #include 
+
 #include 
 #include 
 #include 
Index: usr.bin/tcpbench/tcpbench.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/tcpbench/tcpbench.c,v
retrieving revision 1.65
diff -u -p -r1.65 tcpbench.c
--- usr.bin/tcpbench/tcpbench.c 12 Jul 2021 15:09:20 -  1.65
+++ usr.bin/tcpbench/tcpbench.c 16 Mar 2022 16:44:55 -
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #include 
 #include 
 #include 
Index: usr.sbin/trpt/trpt.c
===
RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/trpt/trpt.c,v
retrieving revision 1.39
diff -u -p -r1.39 trpt.c
--- usr.sbin/trpt/trpt.c2 Dec 2019 21:47:54 -   1.39
+++ usr.sbin/trpt/trpt.c16 Mar 2022 16:45:23 -
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <sys/mutex.h>
 #define PRUREQUESTS
 #include 
 #define _KERNEL



refcount btrace

2022-03-16 Thread Alexander Bluhm
Hi,

I would like to use btrace to debug reference counting.  The idea
is to add a tracepoint for every type of refcnt we have.  When it
changes, print the actual object, the current counter and the change
value.

#!/usr/sbin/btrace
tracepoint:refcnt:inpcb {
	printf("%s %x %u %+d\n", probe, arg0, arg1, arg2)
}

It should look like this:

tracepoint:refcnt:inpcb fd8078e31840 0 +1
tracepoint:refcnt:inpcb fd8078e31840 1 +1
tracepoint:refcnt:inpcb fd8078e31840 2 +1
tracepoint:refcnt:inpcb fd8078e31840 3 -1
tracepoint:refcnt:inpcb fd8078e31840 2 +1
tracepoint:refcnt:inpcb fd8078e31840 3 -1
tracepoint:refcnt:inpcb fd8078e31840 2 -1
tracepoint:refcnt:inpcb fd8078e31840 1 -1

Unfortunately btrace cannot deal with negative numbers right now.
So it looks like this, but that can be fixed independently.

tracepoint:refcnt:inpcb fd8078e31840 0 +1
tracepoint:refcnt:inpcb fd8078e31840 1 +1
tracepoint:refcnt:inpcb fd8078e31840 2 +1
tracepoint:refcnt:inpcb fd8078e31840 3 +4294967295
tracepoint:refcnt:inpcb fd8078e31840 2 +1
tracepoint:refcnt:inpcb fd8078e31840 3 +4294967295
tracepoint:refcnt:inpcb fd8078e31840 2 +4294967295
tracepoint:refcnt:inpcb fd8078e31840 1 +4294967295

To debug leaks, btrace can also print kernel stack traces.

tracepoint:refcnt:inpcb fd8078e31840 0 +1
in_pcballoc+0x92
tcp_attach+0xd1
sonewconn+0x23d
syn_cache_get+0x1bf
tcp_input+0x885
ip_deliver+0xd3
ip6_input_if+0x762
ipv6_input+0x39
ether_input+0x3a2
if_input_process+0x6f
ifiq_process+0x69
taskq_thread+0xdc
proc_trampoline+0x17
kernel
tracepoint:refcnt:inpcb fd8078e31840 1 +1
in_pcbref+0x29
pf_inp_link+0x4e
tcp_input+0x8d2
ip_deliver+0xd3
ip6_input_if+0x762
ipv6_input+0x39
ether_input+0x3a2
if_input_process+0x6f
ifiq_process+0x69
taskq_thread+0xdc
proc_trampoline+0x17
kernel
tracepoint:refcnt:inpcb fd8078e31840 2 +1
in_pcbref+0x29
pf_mbuf_link_inpcb+0x27
tcp_output+0x1455
tcp_usrreq+0x386
sosend+0x37c
dofilewritev+0x14d
sys_write+0x51
syscall+0x314
Xsyscall+0x128
kernel

I register the tracepoint when initializing the refcnt.  There
exists a global array of possible refcnt types.  I implemented it
only for inpcb and tdb as a proof of concept.
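
Once btrace can handle the sign of arg2, the events could also be
aggregated to make leaks stand out.  A sketch, assuming map keys
and sum() behave here as expected:

#!/usr/sbin/btrace
tracepoint:refcnt:inpcb {
	/* running balance per object; a leaked pcb stays above 0 */
	@balance[arg0] = sum(arg2)
}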

Do we want that feature?

bluhm

Index: dev/dt/dt_prov_static.c
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.12
diff -u -p -r1.12 dt_prov_static.c
--- dev/dt/dt_prov_static.c 26 Jan 2022 06:31:31 -  1.12
+++ dev/dt/dt_prov_static.c 16 Mar 2022 23:22:34 -
@@ -2,6 +2,7 @@
 
 /*
  * Copyright (c) 2019 Martin Pieuchot 
+ * Copyright (c) 2022 Alexander Bluhm 
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -87,6 +88,12 @@ DT_STATIC_PROBE1(smr, barrier_exit, "int
 DT_STATIC_PROBE0(smr, wakeup);
 DT_STATIC_PROBE2(smr, thread, "uint64_t", "uint64_t");
 
+/*
+ * reference counting
+ */
+DT_STATIC_PROBE0(refcnt, none);
+DT_STATIC_PROBE3(refcnt, inpcb, "void *", "int", "int");
+DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int");
 
 /*
  * List of all static probes
@@ -127,15 +134,24 @@ struct dt_probe *const dtps_static[] = {
&_DT_STATIC_P(smr, barrier_exit),
&_DT_STATIC_P(smr, wakeup),
&_DT_STATIC_P(smr, thread),
+   /* refcnt */
+   &_DT_STATIC_P(refcnt, none),
+   &_DT_STATIC_P(refcnt, inpcb),
+   &_DT_STATIC_P(refcnt, tdb),
 };
 
+struct dt_probe **dtps_index_refcnt;
+
 int
 dt_prov_static_init(void)
 {
int i;
 
-   for (i = 0; i < nitems(dtps_static); i++)
+   for (i = 0; i < nitems(dtps_static); i++) {
+   if (dtps_static[i] == &_DT_STATIC_P(refcnt, none))
+   dtps_index_refcnt = &dtps_static[i];
dt_dev_register_probe(dtps_static[i]);
+   }
 
return i;
 }
Index: dev/dt/dtvar.h
===
RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 dtvar.h
--- dev/dt/dtvar.h  27 Feb 2022 10:14:01 -  1.13
+++ dev/dt/dtvar.h  16 Mar 2022 23:22:34 -
@@ -2,6 +2,7 @@
 
 /*
  * Copyright (c) 2019 Martin Pieuchot 
+ * Copyright (c) 2022 Alexander Bluhm 
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -313,11 +314,29 @@ extern volatile uint32_t  dt_tracing; /* 
 #define DT_STATIC_ENTER(func, name, args...) do {   \
extern struct dt_probe _DT_STATIC_P(func, name);\
struct dt_probe *dtp = &_DT_STATIC_P(func, name);   \
-   struct dt_provider *dtpv = dtp->dtp_prov; 
