Re: Please test; midi(4): make midi{read,write}_filtops mp safe

2023-09-24 Thread Visa Hankala
On Sun, Sep 24, 2023 at 11:03:54PM +0300, Vitaliy Makkoveev wrote:
> Please test this diff, I have no midi(4) devices.
> 
> midi(4) already uses `audio_lock' mutex(9) for filterops, but they are
> still kernel locked. Wipe out old selwakeup API and make them MP safe.
> knote_locked(9) will not grab kernel lock, so call it directly from
> interrupt handlers instead of scheduling software interrupts.

https://marc.info/?l=openbsd-tech&m=167604232828221 has minor takeaways
if you pay attention.
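
For reference, the quoted diff below is truncated before the new
filt_midimodify() and filt_midiprocess() helpers appear.  Judging from
the pattern used by the other MP-safe filters (audio(4) and the log
filter follow the same approach), they presumably just wrap
knote_modify() and knote_process() in `audio_lock', roughly like this
sketch:

int
filt_midimodify(struct kevent *kev, struct knote *kn)
{
	int active;

	/* Sketch only: take the lock that protects the klist. */
	mtx_enter(&audio_lock);
	active = knote_modify(kev, kn);
	mtx_leave(&audio_lock);

	return (active);
}

int
filt_midiprocess(struct knote *kn, struct kevent *kev)
{
	int active;

	mtx_enter(&audio_lock);
	active = knote_process(kn, kev);
	mtx_leave(&audio_lock);

	return (active);
}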

> Index: sys/dev/midi.c
> ===
> RCS file: /cvs/src/sys/dev/midi.c,v
> retrieving revision 1.55
> diff -u -p -r1.55 midi.c
> --- sys/dev/midi.c2 Jul 2022 08:50:41 -   1.55
> +++ sys/dev/midi.c24 Sep 2023 19:57:56 -
> @@ -31,7 +31,6 @@
>  #include 
>  #include 
>  
> -#define IPL_SOFTMIDI IPL_SOFTNET
>  #define DEVNAME(sc)  ((sc)->dev.dv_xname)
>  
>  int  midiopen(dev_t, int, int, struct proc *);
> @@ -65,41 +64,38 @@ struct cfdriver midi_cd = {
>  
>  void filt_midiwdetach(struct knote *);
>  int filt_midiwrite(struct knote *, long);
> +int filt_midimodify(struct kevent *, struct knote *);
> +int filt_midiprocess(struct knote *, struct kevent *);
>  
>  const struct filterops midiwrite_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_midiwdetach,
>   .f_event= filt_midiwrite,
> + .f_modify   = filt_midimodify,
> + .f_process  = filt_midiprocess,
>  };
>  
>  void filt_midirdetach(struct knote *);
>  int filt_midiread(struct knote *, long);
>  
>  const struct filterops midiread_filtops = {
> - .f_flags= FILTEROP_ISFD,
> + .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
>   .f_attach   = NULL,
>   .f_detach   = filt_midirdetach,
>   .f_event= filt_midiread,
> + .f_modify   = filt_midimodify,
> + .f_process  = filt_midiprocess,
>  };
>  
>  void
> -midi_buf_wakeup(void *addr)
> +midi_buf_wakeup(struct midi_buffer *buf)
>  {
> - struct midi_buffer *buf = addr;
> -
>   if (buf->blocking) {
>   wakeup(>blocking);
>   buf->blocking = 0;
>   }
> - /*
> -  * As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
> -  * already held here to avoid lock ordering problems with `audio_lock'
> -  */
> - KERNEL_ASSERT_LOCKED();
> - mtx_enter(_lock);
> - selwakeup(>sel);
> - mtx_leave(_lock);
> + knote_locked(>klist, 0);
>  }
>  
>  void
> @@ -117,13 +113,7 @@ midi_iintr(void *addr, int data)
>  
>   MIDIBUF_WRITE(mb, data);
>  
> - /*
> -  * As long as selwakeup() needs to be protected by the
> -  * KERNEL_LOCK() we have to delay the wakeup to another
> -  * context to keep the interrupt context KERNEL_LOCK()
> -  * free.
> -  */
> - softintr_schedule(sc->inbuf.softintr);
> + midi_buf_wakeup(mb);
>  }
>  
>  int
> @@ -226,14 +216,7 @@ void
>  midi_out_stop(struct midi_softc *sc)
>  {
>   sc->isbusy = 0;
> -
> - /*
> -  * As long as selwakeup() needs to be protected by the
> -  * KERNEL_LOCK() we have to delay the wakeup to another
> -  * context to keep the interrupt context KERNEL_LOCK()
> -  * free.
> -  */
> - softintr_schedule(sc->outbuf.softintr);
> + midi_buf_wakeup(>outbuf);
>  }
>  
>  void
> @@ -342,11 +325,11 @@ midikqfilter(dev_t dev, struct knote *kn
>   error = 0;
>   switch (kn->kn_filter) {
>   case EVFILT_READ:
> - klist = >inbuf.sel.si_note;
> + klist = >inbuf.klist;
>   kn->kn_fop = _filtops;
>   break;
>   case EVFILT_WRITE:
> - klist = >outbuf.sel.si_note;
> + klist = >outbuf.klist;
>   kn->kn_fop = _filtops;
>   break;
>   default:
> @@ -355,9 +338,7 @@ midikqfilter(dev_t dev, struct knote *kn
>   }
>   kn->kn_hook = (void *)sc;
>  
> - mtx_enter(_lock);
> - klist_insert_locked(klist, kn);
> - mtx_leave(_lock);
> + klist_insert(klist, kn);
>  done:
>   device_unref(>dev);
>   return error;
> @@ -368,24 +349,15 @@ filt_midirdetach(struct knote *kn)
>  {
>   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
>  
> - mtx_enter(_lock);
> - klist_remove_locked(>inbuf.sel.si_note, kn);
> - mtx_leave(_lock);
> + klist_remove(>inbuf.klist, kn);
>  }
>  
>  int
>  filt_midiread(struct knote *kn, long hint)
>  {
>   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
> - int retval;
> -
> - if ((hint & NOTE_SUBMIT) == 0)
> - mtx_enter(_lock);
> - retval = !MIDIBUF_ISEMPTY(>inbuf);
> - if ((hint & NOTE_SUBMIT) == 0)
> - mtx_leave(_lock);
>  
> - return (retval);
> + return (!MIDIBUF_ISEMPTY(>inbuf));
>  }
>  
>  void
> @@ -393,24 

Re: prevent re-upgrade in powerpc64 boot loader

2023-09-24 Thread Visa Hankala
{
-   if (cmd.argc > 1 && cmd.argv[1][0] != '-') {
-   qualify((cmd.argv[1]? cmd.argv[1]: cmd.image));
-   if (bootparse(2))
-   return 0;
-   } else {
-   if (bootparse(1))
-   return 0;
-   snprintf(cmd.path, sizeof cmd.path, "%s:%s",
-   cmd.bootdev, cmd.image);
-   }
-
-   return 1;
-}
-
-/*
- * Qualifies the path adding necessary dev
- */
-
-static char *
-qualify(char *name)
-{
-   char *p;
-
-   for (p = name; *p; p++)
-   if (*p == ':')
-   break;
-   if (*p == ':')
-   strlcpy(cmd.path, name, sizeof(cmd.path));
-   else
-   snprintf(cmd.path, sizeof cmd.path, "%s:%s",
-   cmd.bootdev, name);
-   return cmd.path;
-}
-
-static int
-Xreboot(void)
-{
-   printf("Rebooting...\n");
-   reboot(0);
-   return 0; /* just in case */
-}
-
-int
-upgrade(void)
-{
-   struct stat sb;
-   const char *path;
-   int ret = 0;
-
-   path = disk_open(qualify("/bsd.upgrade"));
-   if (path == NULL)
-   return 0;
-   if (stat(path, ) == 0 && S_ISREG(sb.st_mode))
-   ret = 1;
-   disk_close();
-
-   return ret;
-}
diff --git a/sys/arch/octeon/stand/rdboot/cmd.h 
b/sys/arch/octeon/stand/rdboot/cmd.h
deleted file mode 100644
index 1ffafde59ef..000
--- a/sys/arch/octeon/stand/rdboot/cmd.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* $OpenBSD: cmd.h,v 1.1 2019/07/17 14:36:32 visa Exp $*/
-
-/*
- * Copyright (c) 1997 Michael Shalayeff
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
-#define CMD_BUFF_SIZE  133
-#define BOOTDEVLEN 1024
-
-struct cmd_table {
-   char *cmd_name;
-   char cmd_type;
-#define CMDT_CMD 0
-#define CMDT_VAR 1
-#define CMDT_SET 2
-#define CMDT_MDC 3
-   int (*cmd_exec)(void);
-};
-
-struct cmd_state {
-   char bootdev[BOOTDEVLEN]; /* device */
-   char image[MAXPATHLEN - 16]; /* image */
-   unsigned char bootduid[8]; /* duid of root disk */
-   int boothowto; /* howto */
-   int hasduid;
-   char *conf; /* /etc/boot.conf normally */
-   int timeout;
-
-   char path[MAXPATHLEN]; /* buffer for pathname compose */
-   const struct cmd_table *cmd;
-   int argc;
-   char *argv[8];  /* XXX i hope this is enough */
-};
-extern struct cmd_state cmd;
-
-int getcmd(void);
-int read_conf(void);
-int bootparse(int);
-void boot(dev_t);
-
-int upgrade(void);
-int docmd(void);   /* No longer static: needed by regress test */
diff --git a/sys/arch/octeon/stand/rdboot/disk.c 
b/sys/arch/octeon/stand/rdboot/disk.c
deleted file mode 100644
index eda089bc34f..000
--- a/sys/arch/octeon/stand/rdboot/disk.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/* $OpenBSD: disk.c,v 1.2 2020/05/26 13:30:47 visa Exp $   */
-
-/*
- * Copyright (c) 2019 Visa Hankala
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION W

Re: kqueue(2) and close-on-exec

2023-08-14 Thread Visa Hankala
t -lutil
 DPADD= ${LIBEVENT} ${LIBUTIL}
 
+kq-exec: ${PROG}
+   ./${PROG} -e
 kq-pipe: ${PROG}
./${PROG} -p
 kq-fork: ${PROG}
@@ -51,6 +53,7 @@ kq-regress-5: ${PROG}
 kq-regress-6: ${PROG}
./${PROG} -R6
 
+TESTS+=kq-exec
 TESTS+=kq-fdpass
 TESTS+=kq-flock
 TESTS+=kq-fork
Index: regress/sys/kern/kqueue/kqueue-exec.c
===
RCS file: regress/sys/kern/kqueue/kqueue-exec.c
diff -N regress/sys/kern/kqueue/kqueue-exec.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ regress/sys/kern/kqueue/kqueue-exec.c   14 Aug 2023 14:52:43 -
@@ -0,0 +1,113 @@
+/* $OpenBSD$   */
+
+/*
+ * Copyright (c) 2023 Visa Hankala
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "main.h"
+
+static voiddo_exec_child(void);
+static voiddo_exec_parent(const char *, int);
+
+int
+do_exec(const char *argv0)
+{
+   do_exec_parent(argv0, 0);
+   do_exec_parent(argv0, 1);
+   return 0;
+}
+
+static void
+do_exec_parent(const char *argv0, int cloexec)
+{
+   char *args[] = {
+   (char *)argv0,
+   "-e",
+   NULL
+   };
+   char fdbuf[12];
+   pid_t pid;
+   int kq, status;
+
+   if (getenv("REGRESS_KQUEUE_FD") != NULL) {
+   do_exec_child();
+   _exit(0);
+   }
+
+   pid = fork();
+   if (pid == -1)
+   err(1, "fork");
+   if (pid == 0) {
+   kq = kqueue1(cloexec ? O_CLOEXEC : 0);
+   if (kq == -1)
+   err(1, "kqueue1");
+   snprintf(fdbuf, sizeof(fdbuf), "%d", kq);
+   if (setenv("REGRESS_KQUEUE_FD", fdbuf, 1) == -1)
+   err(1, "setenv");
+   if (setenv("REGRESS_KQUEUE_CLOEXEC",
+   cloexec ? "1" : "0", 1) == -1)
+   err(1, "setenv 2");
+   execv(argv0, args);
+   err(1, "execve");
+   }
+   if (waitpid(pid, , 0) == -1)
+   err(1, "waitpid");
+   if (status != 0)
+   errx(1, "child failed");
+}
+
+static void
+do_exec_child(void)
+{
+   char *arg;
+   int cloexec, fd;
+
+   arg = getenv("REGRESS_KQUEUE_FD");
+   if (arg == NULL)
+   errx(1, "fd arg is missing");
+   fd = atoi(arg);
+
+   arg = getenv("REGRESS_KQUEUE_CLOEXEC");
+   if (arg != NULL && strcmp(arg, "1") == 0)
+   cloexec = 1;
+   else
+   cloexec = 0;
+
+   if (cloexec) {
+   if (kevent(fd, NULL, 0, NULL, 0, 0) == -1) {
+   if (errno != EBADF)
+   err(1, "child after exec: kevent cloexec");
+   } else {
+   errx(1, "child after exec: "
+   "kqueue cloexec fd is not closed");
+   }
+   } else {
+   if (kevent(fd, NULL, 0, NULL, 0, 0) == -1) {
+   err(1, "child after exec: kevent");
+   }
+   }
+}
Index: regress/sys/kern/kqueue/main.c
===
RCS file: src/regress/sys/kern/kqueue/main.c,v
retrieving revision 1.15
diff -u -p -r1.15 main.c
--- regress/sys/kern/kqueue/main.c  12 Jun 2021 13:30:14 -  1.15
+++ regress/sys/kern/kqueue/main.c  14 Aug 2023 14:52:43 -
@@ -17,8 +17,11 @@ main(int argc, char **argv)
int n, ret, c;
 
ret = 0;
-   while ((c = getopt(argc, argv, "fFiIjlpPrR:stT:")) != -1) {
+   while ((c = getopt(argc, argv, "efFiIjlpPrR:stT:")) != -1) {
switch (c) {
+   case 'e':
+   ret |= do_exec(argv[0]);
+   break;
case 'f':
ret |= check_inheritance();
break;
Index

kqueue(2) and close-on-exec

2023-08-13 Thread Visa Hankala
FreeBSD and NetBSD have variants of the kqueue(2) system call that
allow setting the close-on-exec flag on the returned file descriptor.

In general, I think it is good that the flag can be set atomically
for new descriptors. However, it seems to me that it is almost surely
a mistake if a kqueue descriptor is passed over an exec.

Instead of adding a new system call, maybe close-on-exec should be
enabled automatically by kqueue(2). Today it feels backwards that
close-on-exec is off by default.

Note that kqueue cannot be inherited by accident in fork-then-exec
situations because fork(2) closes kqueue descriptors for the child
process.
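
For illustration, the proposed behavior would be easy to observe from
userland with a check like this (hypothetical snippet, not part of the
diff below):

#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	int kq = kqueue();

	if (kq == -1)
		err(1, "kqueue");
	/* With the diff applied, FD_CLOEXEC is set on the new fd. */
	printf("close-on-exec: %s\n",
	    (fcntl(kq, F_GETFD) & FD_CLOEXEC) ? "yes" : "no");
	return 0;
}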

Index: sys/kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.197
diff -u -p -r1.197 kern_event.c
--- sys/kern/kern_event.c   13 Aug 2023 08:29:28 -  1.197
+++ sys/kern/kern_event.c   13 Aug 2023 10:42:45 -
@@ -932,7 +932,7 @@ sys_kqueue(struct proc *p, void *v, regi
*retval = fd;
LIST_INSERT_HEAD(>fd_kqlist, kq, kq_next);
kq = NULL;
-   fdinsert(fdp, fd, 0, fp);
+   fdinsert(fdp, fd, UF_EXCLOSE, fp);
FRELE(fp, p);
 out:
fdpunlock(fdp);
Index: lib/libc/sys/kqueue.2
===
RCS file: src/lib/libc/sys/kqueue.2,v
retrieving revision 1.48
diff -u -p -r1.48 kqueue.2
--- lib/libc/sys/kqueue.2   13 Aug 2023 08:29:28 -  1.48
+++ lib/libc/sys/kqueue.2   13 Aug 2023 10:42:45 -
@@ -74,6 +74,7 @@ on a file descriptor will remove any kev
 .Pp
 .Fn kqueue
 creates a new kernel event queue and returns a descriptor.
+The new descriptor has the close-on-exec flag set.
 The queue is not inherited by a child created with
 .Xr fork 2 .
 Similarly, kqueues cannot be passed across UNIX-domain sockets.



Re: smr_grace_wait(): Skip halted CPUs

2023-08-13 Thread Visa Hankala
On Sat, Aug 12, 2023 at 02:40:31PM +0200, Martin Pieuchot wrote:
> So do we want to keep the existing requirement of being able to execute
> a thread on a CPU that has been removed from the scheduler?  That's is
> what smr_flush() currently needs.  I find it surprising but I can add
> that as a requirement for the upcoming scheduler.  I don't know if other
> options are possible or even attractive.

I think it is useful that the kernel can force a thread to run on
a specific CPU even when general scheduling has been stopped. It is
maybe a bit crude, but nonetheless an effective and machine-independent
way of synchronization.

The kernel has a few current uses of sched_peg_curproc(). Perhaps
the most prominent ones are interrupt barriers and the SMR grace wait
mechanism.



Re: smr_grace_wait(): Skip halted CPUs

2023-08-12 Thread Visa Hankala
On Sat, Aug 12, 2023 at 01:29:10PM +0200, Martin Pieuchot wrote:
> On 12/08/23(Sat) 10:57, Visa Hankala wrote:
> > On Fri, Aug 11, 2023 at 09:52:15PM +0200, Martin Pieuchot wrote:
> > > When stopping a machine, with "halt -p" for example, secondary CPUs are
> > > removed from the scheduler before smr_flush() is called.  So there's no
> > > need for the SMR thread to peg itself to such CPUs.  This currently
> > > isn't a problem because we use per-CPU runqueues but it doesn't work
> > > with a global one.  So the diff below skip halted CPUs.  It should also
> > > speed up rebooting/halting on machine with a huge number of CPUs.
> > 
> > Because SPCF_HALTED does not (?) imply that the CPU has stopped
> > processing interrupts, this skipping is not safe as is. Interrupt
> > handlers might access SMR-protected data.
> 
> Interesting.  This is worse than I expected.  It seems we completely
> forgot about suspend/resume and rebooting when we started pinning
> interrupts on secondary CPUs, no?  Previously sched_stop_secondary_cpus()
> was enough to ensure no more code would be executed on secondary CPUs,
> no?  Wouldn't it be better to remap interrupts to the primary CPU in
> those cases?  Is it easily doable? 

I think device interrupt stopping already happens through
config_suspend_all().



Re: smr_grace_wait(): Skip halted CPUs

2023-08-12 Thread Visa Hankala
On Fri, Aug 11, 2023 at 09:52:15PM +0200, Martin Pieuchot wrote:
> When stopping a machine, with "halt -p" for example, secondary CPUs are
> removed from the scheduler before smr_flush() is called.  So there's no
> need for the SMR thread to peg itself to such CPUs.  This currently
> isn't a problem because we use per-CPU runqueues but it doesn't work
> with a global one.  So the diff below skip halted CPUs.  It should also
> speed up rebooting/halting on machine with a huge number of CPUs.

Because SPCF_HALTED does not (?) imply that the CPU has stopped
processing interrupts, this skipping is not safe as is. Interrupt
handlers might access SMR-protected data.

One possible solution is to spin. When smr_grace_wait() sees
SPCF_HALTED, it should probably call cpu_unidle(ci) and spin until the
condition READ_ONCE(ci->ci_schedstate.spc_smrgp) == smrgp becomes true.
However, for this to work, sched_idle() needs to invoke smr_idle().
Here is a potential problem since the cpu_idle_{enter,cycle,leave}()
logic is not consistent between architectures.
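
Concretely, instead of skipping, the loop in the diff below could do
something along these lines (rough sketch; it only works if the halted
CPU's idle loop ends up calling smr_idle(), which is the open question
discussed next):

CPU_INFO_FOREACH(cii, ci) {
	if (!CPU_IS_RUNNING(ci))
		continue;
	if (READ_ONCE(ci->ci_schedstate.spc_smrgp) == smrgp)
		continue;
	if (ci->ci_schedstate.spc_schedflags & SPCF_HALTED) {
		/* Kick the halted CPU so its idle loop notices. */
		cpu_unidle(ci);
		while (READ_ONCE(ci->ci_schedstate.spc_smrgp) != smrgp)
			CPU_BUSY_CYCLE();
		continue;
	}
	sched_peg_curproc(ci);
}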

I think the intent in sched_idle() was that cpu_idle_enter() should
block interrupts so that sched_idle() could check without races if
the CPU can sleep. Now, on some architectures cpu_idle_enter() is
a no-op. These architectures have to check the idle state in their
cpu_idle_cycle() function before pausing the CPU.

To avoid touching architecture-specific code, cpu_is_idle() could
be redefined to

((ci)->ci_schedstate.spc_whichqs == 0 &&
 (ci)->ci_schedstate.spc_smrgp == READ_ONCE(smr_grace_period))

Then the loop conditions

while (!cpu_is_idle(curcpu())) {

and

while (spc->spc_whichqs == 0) {

in sched_idle() would have to be changed to

while (spc->spc_whichqs != 0) {

and

while (cpu_is_idle(ci)) {


:(

> Index: kern/kern_smr.c
> ===
> RCS file: /cvs/src/sys/kern/kern_smr.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 kern_smr.c
> --- kern/kern_smr.c   14 Aug 2022 01:58:27 -  1.16
> +++ kern/kern_smr.c   11 Aug 2023 19:43:54 -
> @@ -158,6 +158,8 @@ smr_grace_wait(void)
>   CPU_INFO_FOREACH(cii, ci) {
>   if (!CPU_IS_RUNNING(ci))
>   continue;
> + if (ci->ci_schedstate.spc_schedflags & SPCF_HALTED)
> + continue;
>   if (READ_ONCE(ci->ci_schedstate.spc_smrgp) == smrgp)
>   continue;
>   sched_peg_curproc(ci);
> 



Re: glxclk(4/loongson): remove driver

2023-07-05 Thread Visa Hankala
On Wed, Jul 05, 2023 at 09:39:00PM -0500, Scott Cheloha wrote:
> glxclk(4) has been compiled-but-disabled for over six months.  It was
> disabled when loongson made the clockintr switch.  Nobody has asked me
> to make it an intrclock option for loongson so I assume the mips64 CP0
> interrupt clock is sufficient.
> 
> This patch deletes the driver, driver config glue, manpage, and
> manpage cross-references.  Not sure if I got it all.  I have no
> system to test this with.
> 
> One thing I noticed: glxclk(4) is compiled into loongson GENERIC but
> not loongson RAMDISK.  A bit odd for a clock interrupt driver, no?  I
> figure you would want to be sure certain such a basic component was
> working during installation, but maybe I'm missing something.
> 
> Anyway, did I get everything?  If so, ok?

I would like to keep the driver and make it functional again. glxclk(4)
provides an external interrupt to the CPU core. With it, it is possible
to stop the core clock when the system is idle and save a little power.
Sadly, the saving is not enough to quieten the fan noise on a Yeeloong.

Below is the glxclk(4) patch that I was tinkering with a while ago.

Index: arch/loongson/dev/glxclk.c
===
RCS file: src/sys/arch/loongson/dev/glxclk.c,v
retrieving revision 1.8
diff -u -p -r1.8 glxclk.c
--- arch/loongson/dev/glxclk.c  19 Nov 2022 16:23:48 -  1.8
+++ arch/loongson/dev/glxclk.c  1 May 2023 14:54:26 -
@@ -18,8 +18,10 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -47,30 +49,36 @@ struct cfdriver glxclk_cd = {
 intglxclk_match(struct device *, void *, void *);
 void   glxclk_attach(struct device *, struct device *, void *);
 intglxclk_intr(void *);
-intglxclk_stat_intr(void *arg);
 void   glxclk_startclock(struct cpu_info *);
+void   glxclk_rearm(void *, uint64_t);
+void   glxclk_trigger(void *);
 
 const struct cfattach glxclk_ca = {
sizeof(struct glxclk_softc), glxclk_match, glxclk_attach,
 };
 
+const struct intrclock glxclk_intrclock = {
+   .ic_rearm   = glxclk_rearm,
+   .ic_trigger = glxclk_trigger,
+};
+
 #defineMSR_LBAR_ENABLE 0x1ULL
 #defineMSR_LBAR_MFGPT  DIVIL_LBAR_MFGPT
 #defineMSR_MFGPT_SIZE  0x40
 #defineMSR_MFGPT_ADDR_MASK 0xffc0
 
+/*
+ * Experience shows that the clock source goes crazy on scale factors
+ * lower than 8, so keep it at 8.
+ */
+
 #defineAMD5536_MFGPT1_CMP2 0x000a  /* Compare value for 
CMP2 */
 #defineAMD5536_MFGPT1_CNT  0x000c  /* Up counter */
 #defineAMD5536_MFGPT1_SETUP0x000e  /* Setup register */
-#defineAMD5536_MFGPT1_SCALE0x7 /* Set to 128 */
+#defineAMD5536_MFGPT1_SCALE0x3 /* Set divider to 8 */
+#defineAMD5536_MFGPT1_CLOCK(1 << 15)   /* Clock frequency */
 #defineAMD5536_MFGPT1_C2_IRQM  0x0200
 
-#defineAMD5536_MFGPT2_CMP2 0x0012  /* Compare value for 
CMP2 */
-#defineAMD5536_MFGPT2_CNT  0x0014  /* Up counter */
-#defineAMD5536_MFGPT2_SETUP0x0016  /* Setup register */
-#defineAMD5536_MFGPT2_SCALE0x3 /* Divide by 8 */
-#defineAMD5536_MFGPT2_C2_IRQM  0x0400
-
 #defineAMD5536_MFGPT_CNT_EN(1 << 15)   /* Enable counting */
 #defineAMD5536_MFGPT_CMP2  (1 << 14)   /* Compare 2 output */
 #defineAMD5536_MFGPT_CMP1  (1 << 13)   /* Compare 1 output */
@@ -82,18 +90,6 @@ const struct cfattach glxclk_ca = {
 
 struct glxclk_softc *glxclk_sc;
 
-/*
- * Statistics clock interval and variance, in usec.  Variance must be a
- * power of two.  Since this gives us an even number, not an odd number,
- * we discard one case and compensate.  That is, a variance of 1024 would
- * give us offsets in [0..1023].  Instead, we take offsets in [1..1023].
- * This is symmetric about the point 512, or statvar/2, and thus averages
- * to that value (assuming uniform random numbers).
- */
-/* XXX fix comment to match value */
-int statvar = 8192;
-int statmin;   /* statclock interval - 1/2*variance */
-
 int
 glxclk_match(struct device *parent, void *match, void *aux)
 {
@@ -112,10 +108,6 @@ glxclk_attach(struct device *parent, str
glxclk_sc = (struct glxclk_softc *)self;
struct glxpcib_attach_args *gaa = aux;
u_int64_t wa;
-   int statint, minint;
-
-   printf(" not configured\n");
-   return;
 
glxclk_sc->sc_iot = gaa->gaa_iot;
glxclk_sc->sc_ioh = gaa->gaa_ioh;
@@ -137,11 +129,10 @@ glxclk_attach(struct device *parent, str
 * MFGPT runs on powers of two, adjust the hz value accordingly.
 */
stathz = hz = 128;
+   profhz = hz * 10;
tick = 100 / hz;
tick_nsec = 10 / hz;
 
-   

Remove unnecessary NOWITNESS kludge

2023-07-04 Thread Visa Hankala
Initialize stack-based mutexes using mtx_init(). This removes the need
for the NOWITNESS kludge and lets the lock checker do its job with these
mutexes.

At the moment, static initialization of locks inside functions does not
work correctly with WITNESS. A lock initializer sets up a struct that
gets permanently referenced by the lock checker. Inside a function,
the static initializers put these structs on the stack, which causes
trouble when the function returns. In principle, this might be solvable
by using a compile-time expression that chooses the correct way of
initialization based on the scope of usage.
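
To illustrate the issue with hypothetical names (these are not the
actual OpenBSD macros), a WITNESS-style initializer creates the lock
type object through a compound literal:

struct example_lock_type { const char *lt_name; };
struct example_lock { const struct example_lock_type *lo_type; /* ... */ };

#define EXAMPLE_INITIALIZER(name) \
	{ .lo_type = &(const struct example_lock_type){ .lt_name = (name) } }

/* At file scope the compound literal has static storage: fine. */
struct example_lock global_lk = EXAMPLE_INITIALIZER("global");

void
f(void)
{
	/*
	 * At block scope the literal has automatic storage, so the
	 * permanent reference kept by the lock checker ends up pointing
	 * into a dead stack frame once f() returns.
	 */
	struct example_lock lk = EXAMPLE_INITIALIZER("local");
	(void)lk;
}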

Index: dev/ic/mfi.c
===
RCS file: src/sys/dev/ic/mfi.c,v
retrieving revision 1.189
diff -u -p -r1.189 mfi.c
--- dev/ic/mfi.c25 May 2023 19:35:58 -  1.189
+++ dev/ic/mfi.c5 Jul 2023 02:56:57 -
@@ -925,8 +925,9 @@ mfi_poll(struct mfi_softc *sc, struct mf
 void
 mfi_exec(struct mfi_softc *sc, struct mfi_ccb *ccb)
 {
-   struct mutex m = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
-   MTX_NOWITNESS);
+   struct mutex m;
+
+   mtx_init(, IPL_BIO);
 
 #ifdef DIAGNOSTIC
if (ccb->ccb_cookie != NULL || ccb->ccb_done != NULL)
Index: dev/ic/mpi.c
===
RCS file: src/sys/dev/ic/mpi.c,v
retrieving revision 1.225
diff -u -p -r1.225 mpi.c
--- dev/ic/mpi.c25 May 2023 19:35:58 -  1.225
+++ dev/ic/mpi.c5 Jul 2023 02:56:57 -
@@ -1263,10 +1263,11 @@ mpi_poll_done(struct mpi_ccb *ccb)
 void
 mpi_wait(struct mpi_softc *sc, struct mpi_ccb *ccb)
 {
-   struct mutexcookie = MUTEX_INITIALIZER_FLAGS(
-   IPL_BIO, __MTX_NAME, MTX_NOWITNESS);
+   struct mutexcookie;
void(*done)(struct mpi_ccb *);
 
+   mtx_init(, IPL_BIO);
+
done = ccb->ccb_done;
ccb->ccb_done = mpi_wait_done;
ccb->ccb_cookie = 
Index: dev/pci/mfii.c
===
RCS file: src/sys/dev/pci/mfii.c,v
retrieving revision 1.88
diff -u -p -r1.88 mfii.c
--- dev/pci/mfii.c  25 May 2023 19:35:58 -  1.88
+++ dev/pci/mfii.c  5 Jul 2023 02:56:57 -
@@ -1764,8 +1764,9 @@ mfii_poll_done(struct mfii_softc *sc, st
 int
 mfii_exec(struct mfii_softc *sc, struct mfii_ccb *ccb)
 {
-   struct mutex m = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
-   MTX_NOWITNESS);
+   struct mutex m;
+
+   mtx_init(, IPL_BIO);
 
 #ifdef DIAGNOSTIC
if (ccb->ccb_cookie != NULL || ccb->ccb_done != NULL)
Index: dev/pci/mpii.c
===
RCS file: src/sys/dev/pci/mpii.c,v
retrieving revision 1.145
diff -u -p -r1.145 mpii.c
--- dev/pci/mpii.c  25 May 2023 19:35:58 -  1.145
+++ dev/pci/mpii.c  5 Jul 2023 02:56:57 -
@@ -2857,11 +2857,12 @@ mpii_init_queues(struct mpii_softc *sc)
 void
 mpii_wait(struct mpii_softc *sc, struct mpii_ccb *ccb)
 {
-   struct mutexmtx = MUTEX_INITIALIZER_FLAGS(IPL_BIO,
-   __MTX_NAME, MTX_NOWITNESS);
+   struct mutexmtx;
void(*done)(struct mpii_ccb *);
void*cookie;
 
+   mtx_init(, IPL_BIO);
+
done = ccb->ccb_done;
cookie = ccb->ccb_cookie;
 
Index: scsi/scsi_base.c
===
RCS file: src/sys/scsi/scsi_base.c,v
retrieving revision 1.281
diff -u -p -r1.281 scsi_base.c
--- scsi/scsi_base.c25 May 2023 19:35:58 -  1.281
+++ scsi/scsi_base.c5 Jul 2023 02:56:57 -
@@ -1497,10 +1497,11 @@ scsi_done(struct scsi_xfer *xs)
 int
 scsi_xs_sync(struct scsi_xfer *xs)
 {
-   struct mutexcookie = MUTEX_INITIALIZER_FLAGS(IPL_BIO, __MTX_NAME,
-   MTX_NOWITNESS);
+   struct mutexcookie;
int error;
 
+   mtx_init(, IPL_BIO);
+
 #ifdef DIAGNOSTIC
if (xs->cookie != NULL)
panic("xs->cookie != NULL in scsi_xs_sync");



Re: EPIPE returned by kevent(2)

2023-05-06 Thread Visa Hankala
On Thu, May 04, 2023 at 08:07:44PM -0700, Greg Steuck wrote:
> I'm debugging a non-trivial multithreaded unit test in the current
> version of lang/ghc. It runs into some kind of unexpected condition not
> handled well by GHC. I suspect we do something non-standard to cause
> this behavior. These two ktrace items illustrate the issue:
> 
>  12550/209588  T21651   CALL  kevent(217,0x211906e98,1,0,0,0x211906e78)
>  12550/209588  T21651   STRU  struct kevent { ident=13, filter=EVFILT_WRITE, 
> flags=0x11, fflags=0x2, data=0, udata=0x0 }
>  12550/209588  T21651   RET   kevent -1 errno 32 Broken pipe
>  
>  12550/209588  T21651   CALL  kevent(217,0x211906ee8,1,0,0,0x211906ec8)
>  12550/209588  T21651   STRU  struct kevent { ident=13, filter=EVFILT_WRITE, 
> flags=0x2, fflags=0x2, data=0, udata=0x0 }
>  12550/209588  T21651   RET   kevent -1 errno 2 No such file or directory
> 
> errno 2 is the reason GHC goes berserk, but it seems like the earlier
> return of errno 32 (EPIPE) is the first time things go wrong. I don't
> see EPIPE documented as a valid error in kevent(2). It's also nowhere
> to be found in sys/kern/kern_event.c. This errno value pops up from
> some other place that I can't quickly locate.
> 
> So, is EPIPE a valid errno which we should document or a kernel bug?

The EPIPE error relates to the situation where a kevent(2) EVFILT_WRITE
call on a pipe races with the closing of the pipe's other end.
If the close(2) happens before the kevent registration, kevent(2)
returns EPIPE. If the close(2) happens after the kevent(2) call,
the registered event will trigger.

The EPIPE error is a legacy feature of the kqueue implementation.
I think the system should work correctly without it. When the pipe's
write side has already been closed, the EVFILT_WRITE event can still
be registered. It just triggers immediately.
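
The race is easy to reproduce from userland with something like the
following (hypothetical test program, not part of the diff): close the
read side first, then register EVFILT_WRITE on the write side.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int fds[2], kq;

	if (pipe(fds) == -1)
		err(1, "pipe");
	close(fds[0]);		/* reader goes away first */

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&kev, fds[1], EVFILT_WRITE, EV_ADD, 0, 0, NULL);
	/*
	 * Currently this fails with EPIPE; with the patch below the
	 * registration succeeds and the event triggers immediately.
	 */
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");
	return 0;
}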

As for the ENOENT error from kevent(2), I think the unit test behaves
incorrectly by trying to delete a non-existent event. The registration
failed, after all.

Below is a patch that removes the EPIPE special case. Could you try it?

Index: kern/sys_generic.c
===
RCS file: src/sys/kern/sys_generic.c,v
retrieving revision 1.155
diff -u -p -r1.155 sys_generic.c
--- kern/sys_generic.c  25 Feb 2023 09:55:46 -  1.155
+++ kern/sys_generic.c  6 May 2023 17:10:27 -
@@ -769,12 +769,6 @@ pselregister(struct proc *p, fd_set *pib
 * __EV_SELECT */
error = 0;
break;
-   case EPIPE: /* Specific to pipes */
-   KASSERT(kev.filter == EVFILT_WRITE);
-   FD_SET(kev.ident, pobits[1]);
-   (*ncollected)++;
-   error = 0;
-   break;
case ENXIO: /* Device has been detached */
default:
goto bad;
@@ -1073,10 +1067,6 @@ again:
goto again;
}
break;
-   case EPIPE: /* Specific to pipes */
-   KASSERT(kevp->filter == EVFILT_WRITE);
-   pl->revents |= POLLHUP;
-   break;
default:
DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
" %lu filt %d ERROR=%d\n",
Index: kern/sys_pipe.c
===
RCS file: src/sys/kern/sys_pipe.c,v
retrieving revision 1.145
diff -u -p -r1.145 sys_pipe.c
--- kern/sys_pipe.c 12 Feb 2023 10:41:00 -  1.145
+++ kern/sys_pipe.c 6 May 2023 17:10:27 -
@@ -857,9 +857,13 @@ pipe_kqfilter(struct file *fp, struct kn
break;
case EVFILT_WRITE:
if (wpipe == NULL) {
-   /* other end of pipe has been closed */
-   error = EPIPE;
-   break;
+   /*
+* The other end of the pipe has been closed.
+* Since the filter now always indicates a pending
+* event, attach the knote to the read side to proceed
+* with the registration.
+*/
+   wpipe = rpipe;
}
kn->kn_fop = _wfiltops;
kn->kn_hook = wpipe;



Re: Make midi(4) event filter MP-safe

2023-02-10 Thread Visa Hankala
On Fri, Feb 10, 2023 at 02:56:21PM +, Visa Hankala wrote:
> This makes midi(4) event filter MP-safe.
> 
> The logic is similar to audio(4). As knote(9) is safe to use
> at IPL_AUDIO, the deferring through soft interrupts is not needed
> any longer.
> 
> In mididetach(), the separate selwakeup() calls are covered by
> klist_invalidate().
> 
> Could someone with actual midi(4) hardware test this?

The initial posting misses sys/dev/midivar.h.

Here is the full patch:

Index: dev/midi.c
===
RCS file: src/sys/dev/midi.c,v
retrieving revision 1.55
diff -u -p -r1.55 midi.c
--- dev/midi.c  2 Jul 2022 08:50:41 -   1.55
+++ dev/midi.c  10 Feb 2023 15:14:21 -
@@ -22,6 +22,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -65,41 +67,40 @@ struct cfdriver midi_cd = {
 
 void filt_midiwdetach(struct knote *);
 int filt_midiwrite(struct knote *, long);
+int filt_midimodify(struct kevent *, struct knote *);
+int filt_midiprocess(struct knote *, struct kevent *);
 
 const struct filterops midiwrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_midiwdetach,
.f_event= filt_midiwrite,
+   .f_modify   = filt_midimodify,
+   .f_process  = filt_midiprocess,
 };
 
 void filt_midirdetach(struct knote *);
 int filt_midiread(struct knote *, long);
 
 const struct filterops midiread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_midirdetach,
.f_event= filt_midiread,
+   .f_modify   = filt_midimodify,
+   .f_process  = filt_midiprocess,
 };
 
 void
-midi_buf_wakeup(void *addr)
+midi_buf_wakeup(struct midi_buffer *buf)
 {
-   struct midi_buffer *buf = addr;
+   MUTEX_ASSERT_LOCKED(_lock);
 
if (buf->blocking) {
wakeup(>blocking);
buf->blocking = 0;
}
-   /*
-* As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
-* already held here to avoid lock ordering problems with `audio_lock'
-*/
-   KERNEL_ASSERT_LOCKED();
-   mtx_enter(_lock);
-   selwakeup(>sel);
-   mtx_leave(_lock);
+   knote_locked(>klist, 0);
 }
 
 void
@@ -117,13 +118,7 @@ midi_iintr(void *addr, int data)
 
MIDIBUF_WRITE(mb, data);
 
-   /*
-* As long as selwakeup() needs to be protected by the
-* KERNEL_LOCK() we have to delay the wakeup to another
-* context to keep the interrupt context KERNEL_LOCK()
-* free.
-*/
-   softintr_schedule(sc->inbuf.softintr);
+   midi_buf_wakeup(>inbuf);
 }
 
 int
@@ -227,13 +222,7 @@ midi_out_stop(struct midi_softc *sc)
 {
sc->isbusy = 0;
 
-   /*
-* As long as selwakeup() needs to be protected by the
-* KERNEL_LOCK() we have to delay the wakeup to another
-* context to keep the interrupt context KERNEL_LOCK()
-* free.
-*/
-   softintr_schedule(sc->outbuf.softintr);
+   midi_buf_wakeup(>outbuf);
 }
 
 void
@@ -342,11 +331,11 @@ midikqfilter(dev_t dev, struct knote *kn
error = 0;
switch (kn->kn_filter) {
case EVFILT_READ:
-   klist = >inbuf.sel.si_note;
+   klist = >inbuf.klist;
kn->kn_fop = _filtops;
break;
case EVFILT_WRITE:
-   klist = >outbuf.sel.si_note;
+   klist = >outbuf.klist;
kn->kn_fop = _filtops;
break;
default:
@@ -355,9 +344,7 @@ midikqfilter(dev_t dev, struct knote *kn
}
kn->kn_hook = (void *)sc;
 
-   mtx_enter(_lock);
-   klist_insert_locked(klist, kn);
-   mtx_leave(_lock);
+   klist_insert(klist, kn);
 done:
device_unref(>dev);
return error;
@@ -366,51 +353,61 @@ done:
 void
 filt_midirdetach(struct knote *kn)
 {
-   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
+   struct midi_softc *sc = kn->kn_hook;
 
-   mtx_enter(_lock);
-   klist_remove_locked(>inbuf.sel.si_note, kn);
-   mtx_leave(_lock);
+   klist_remove(>inbuf.klist, kn);
 }
 
 int
 filt_midiread(struct knote *kn, long hint)
 {
-   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
-   int retval;
+   struct midi_softc *sc = kn->kn_hook;
 
-   if ((hint & NOTE_SUBMIT) == 0)
-   mtx_enter(_lock);
-   retval = !MIDIBUF_ISEMPTY(>inbuf);
-   if ((hint & NOTE_SUBMIT) == 0)
-   mtx_leave(_lock);
+   MUTEX_ASSERT_LOCKED(_lock);
 
-   return (retval);
+   return !MIDIBUF_ISEMPTY(>inb

Make midi(4) event filter MP-safe

2023-02-10 Thread Visa Hankala
This makes the midi(4) event filter MP-safe.

The logic is similar to audio(4). As knote(9) is safe to use
at IPL_AUDIO, the deferring through soft interrupts is not needed
any longer.

In mididetach(), the separate selwakeup() calls are covered by
klist_invalidate().

Could someone with actual midi(4) hardware test this?

Index: dev/midi.c
===
RCS file: src/sys/dev/midi.c,v
retrieving revision 1.55
diff -u -p -r1.55 midi.c
--- dev/midi.c  2 Jul 2022 08:50:41 -   1.55
+++ dev/midi.c  10 Feb 2023 14:44:20 -
@@ -22,6 +22,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -65,41 +67,40 @@ struct cfdriver midi_cd = {
 
 void filt_midiwdetach(struct knote *);
 int filt_midiwrite(struct knote *, long);
+int filt_midimodify(struct kevent *, struct knote *);
+int filt_midiprocess(struct knote *, struct kevent *);
 
 const struct filterops midiwrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_midiwdetach,
.f_event= filt_midiwrite,
+   .f_modify   = filt_midimodify,
+   .f_process  = filt_midiprocess,
 };
 
 void filt_midirdetach(struct knote *);
 int filt_midiread(struct knote *, long);
 
 const struct filterops midiread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_midirdetach,
.f_event= filt_midiread,
+   .f_modify   = filt_midimodify,
+   .f_process  = filt_midiprocess,
 };
 
 void
-midi_buf_wakeup(void *addr)
+midi_buf_wakeup(struct midi_buffer *buf)
 {
-   struct midi_buffer *buf = addr;
+   MUTEX_ASSERT_LOCKED(_lock);
 
if (buf->blocking) {
wakeup(>blocking);
buf->blocking = 0;
}
-   /*
-* As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
-* already held here to avoid lock ordering problems with `audio_lock'
-*/
-   KERNEL_ASSERT_LOCKED();
-   mtx_enter(_lock);
-   selwakeup(>sel);
-   mtx_leave(_lock);
+   knote_locked(>klist, 0);
 }
 
 void
@@ -117,13 +118,7 @@ midi_iintr(void *addr, int data)
 
MIDIBUF_WRITE(mb, data);
 
-   /*
-* As long as selwakeup() needs to be protected by the
-* KERNEL_LOCK() we have to delay the wakeup to another
-* context to keep the interrupt context KERNEL_LOCK()
-* free.
-*/
-   softintr_schedule(sc->inbuf.softintr);
+   midi_buf_wakeup(>inbuf);
 }
 
 int
@@ -227,13 +222,7 @@ midi_out_stop(struct midi_softc *sc)
 {
sc->isbusy = 0;
 
-   /*
-* As long as selwakeup() needs to be protected by the
-* KERNEL_LOCK() we have to delay the wakeup to another
-* context to keep the interrupt context KERNEL_LOCK()
-* free.
-*/
-   softintr_schedule(sc->outbuf.softintr);
+   midi_buf_wakeup(>outbuf);
 }
 
 void
@@ -342,11 +331,11 @@ midikqfilter(dev_t dev, struct knote *kn
error = 0;
switch (kn->kn_filter) {
case EVFILT_READ:
-   klist = >inbuf.sel.si_note;
+   klist = >inbuf.klist;
kn->kn_fop = _filtops;
break;
case EVFILT_WRITE:
-   klist = >outbuf.sel.si_note;
+   klist = >outbuf.klist;
kn->kn_fop = _filtops;
break;
default:
@@ -355,9 +344,7 @@ midikqfilter(dev_t dev, struct knote *kn
}
kn->kn_hook = (void *)sc;
 
-   mtx_enter(_lock);
-   klist_insert_locked(klist, kn);
-   mtx_leave(_lock);
+   klist_insert(klist, kn);
 done:
device_unref(>dev);
return error;
@@ -366,51 +353,61 @@ done:
 void
 filt_midirdetach(struct knote *kn)
 {
-   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
+   struct midi_softc *sc = kn->kn_hook;
 
-   mtx_enter(_lock);
-   klist_remove_locked(>inbuf.sel.si_note, kn);
-   mtx_leave(_lock);
+   klist_remove(>inbuf.klist, kn);
 }
 
 int
 filt_midiread(struct knote *kn, long hint)
 {
-   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
-   int retval;
+   struct midi_softc *sc = kn->kn_hook;
 
-   if ((hint & NOTE_SUBMIT) == 0)
-   mtx_enter(_lock);
-   retval = !MIDIBUF_ISEMPTY(>inbuf);
-   if ((hint & NOTE_SUBMIT) == 0)
-   mtx_leave(_lock);
+   MUTEX_ASSERT_LOCKED(_lock);
 
-   return (retval);
+   return !MIDIBUF_ISEMPTY(>inbuf);
 }
 
 void
 filt_midiwdetach(struct knote *kn)
 {
-   struct midi_softc *sc = (struct midi_softc *)kn->kn_hook;
+   struct midi_softc *sc = kn->kn_hook;
+
+   klist_remove(>outbuf.klist, kn);
+}
+
+int
+filt_midiwrite(struct knote *kn, long hint)
+{
+   

Make log event filter MP-safe

2023-02-10 Thread Visa Hankala
This makes the log event filter MP-safe.

OK?

Index: kern/subr_log.c
===
RCS file: src/sys/kern/subr_log.c,v
retrieving revision 1.75
diff -u -p -r1.75 subr_log.c
--- kern/subr_log.c 2 Jul 2022 08:50:42 -   1.75
+++ kern/subr_log.c 10 Feb 2023 14:44:20 -
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -75,7 +76,7 @@
  */
 struct logsoftc {
int sc_state;   /* [L] see above for possibilities */
-   struct  selinfo sc_selp;/* process waiting on select call */
+   struct  klist sc_klist; /* [L] list of knotes */
struct  sigio_ref sc_sigio; /* async I/O registration */
int sc_need_wakeup; /* if set, wake up waiters */
struct timeout sc_tick; /* wakeup poll timeout */
@@ -99,17 +100,22 @@ struct mutex log_mtx =
 
 void filt_logrdetach(struct knote *kn);
 int filt_logread(struct knote *kn, long hint);
+int filt_logmodify(struct kevent *kev, struct knote *kn);
+int filt_logprocess(struct knote *kn, struct kevent *kev);
 
 const struct filterops logread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_logrdetach,
.f_event= filt_logread,
+   .f_modify   = filt_logmodify,
+   .f_process  = filt_logprocess,
 };
 
 int dosendsyslog(struct proc *, const char *, size_t, int, enum uio_seg);
 void logtick(void *);
 size_t msgbuf_getlen(struct msgbuf *);
+size_t msgbuf_getlen_locked(struct msgbuf *);
 void msgbuf_putchar_locked(struct msgbuf *, const char);
 
 void
@@ -189,13 +195,22 @@ msgbuf_putchar_locked(struct msgbuf *mbp
 size_t
 msgbuf_getlen(struct msgbuf *mbp)
 {
-   long len;
+   size_t len;
 
mtx_enter(_mtx);
+   len = msgbuf_getlen_locked(mbp);
+   mtx_leave(_mtx);
+   return (len);
+}
+
+size_t
+msgbuf_getlen_locked(struct msgbuf *mbp)
+{
+   long len;
+
len = mbp->msg_bufx - mbp->msg_bufr;
if (len < 0)
len += mbp->msg_bufs;
-   mtx_leave(_mtx);
return (len);
 }
 
@@ -205,6 +220,7 @@ logopen(dev_t dev, int flags, int mode, 
if (log_open)
return (EBUSY);
log_open = 1;
+   klist_init_mutex(_klist, _mtx);
sigio_init(_sigio);
timeout_set(_tick, logtick, NULL);
timeout_add_msec(_tick, LOG_TICK);
@@ -227,6 +243,7 @@ logclose(dev_t dev, int flag, int mode, 
timeout_del(_tick);
logsoftc.sc_state = 0;
sigio_free(_sigio);
+   klist_free(_klist);
return (0);
 }
 
@@ -302,11 +319,10 @@ int
 logkqfilter(dev_t dev, struct knote *kn)
 {
struct klist *klist;
-   int s;
 
switch (kn->kn_filter) {
case EVFILT_READ:
-   klist = _selp.si_note;
+   klist = _klist;
kn->kn_fop = _filtops;
break;
default:
@@ -315,9 +331,7 @@ logkqfilter(dev_t dev, struct knote *kn)
 
kn->kn_hook = (void *)msgbufp;
 
-   s = splhigh();
-   klist_insert_locked(klist, kn);
-   splx(s);
+   klist_insert(klist, kn);
 
return (0);
 }
@@ -325,11 +339,7 @@ logkqfilter(dev_t dev, struct knote *kn)
 void
 filt_logrdetach(struct knote *kn)
 {
-   int s;
-
-   s = splhigh();
-   klist_remove_locked(_selp.si_note, kn);
-   splx(s);
+   klist_remove(_klist, kn);
 }
 
 int
@@ -337,10 +347,36 @@ filt_logread(struct knote *kn, long hint
 {
struct msgbuf *mbp = kn->kn_hook;
 
-   kn->kn_data = msgbuf_getlen(mbp);
+   MUTEX_ASSERT_LOCKED(_mtx);
+
+   kn->kn_data = msgbuf_getlen_locked(mbp);
return (kn->kn_data != 0);
 }
 
+int
+filt_logmodify(struct kevent *kev, struct knote *kn)
+{
+   int active;
+
+   mtx_enter(_mtx);
+   active = knote_modify(kev, kn);
+   mtx_leave(_mtx);
+
+   return (active);
+}
+
+int
+filt_logprocess(struct knote *kn, struct kevent *kev)
+{
+   int active;
+
+   mtx_enter(_mtx);
+   active = knote_process(kn, kev);
+   mtx_leave(_mtx);
+
+   return (active);
+}
+
 void
 logwakeup(void)
 {
@@ -381,9 +417,9 @@ logtick(void *arg)
state = logsoftc.sc_state;
if (logsoftc.sc_state & LOG_RDWAIT)
logsoftc.sc_state &= ~LOG_RDWAIT;
+   knote_locked(_klist, 0);
mtx_leave(_mtx);
 
-   selwakeup(_selp);
if (state & LOG_ASYNC)
pgsigio(_sigio, SIGIO, 0);
if (state & LOG_RDWAIT)



knote(9) and knote_locked(9)

2023-02-01 Thread Visa Hankala
Make knote(9) lock the knote list internally, and add knote_locked(9)
for the typical situation where the list is already locked.

Simplify the kqueue API a bit (and make room for the new function)
by dropping the KNOTE(9) macro. Its value is dubious, not least because
it is common to use proper non-inline functions even for very minor
tasks in the kernel.

Index: share/man/man9/knote.9
===
RCS file: src/share/man/man9/knote.9,v
retrieving revision 1.9
diff -u -p -r1.9 knote.9
--- share/man/man9/knote.9  21 Jan 2014 03:15:46 -  1.9
+++ share/man/man9/knote.9  2 Feb 2023 04:32:53 -
@@ -33,18 +33,21 @@
 .Os
 .Sh NAME
 .Nm knote ,
-.Nm KNOTE
+.Nm knote_locked
 .Nd raise kernel event
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/event.h
 .Ft void
 .Fn knote "struct klist *list" "long hint"
-.Fn KNOTE "struct klist *list" "long hint"
+.Ft void
+.Fn knote_locked "struct klist *list" "long hint"
 .Sh DESCRIPTION
 The
 .Fn knote
-function provides a hook into the kqueue kernel event notification
+and
+.Fn knote_locked
+functions provide a hook into the kqueue kernel event notification
 mechanism to allow sections of the kernel to raise a kernel event
 in the form of a
 .Sq knote ,
@@ -60,7 +63,7 @@ of knotes, along with a
 .Fa hint
 (which is passed to the appropriate filter routine).
 .Fn knote
-then walks the
+then locks and walks the
 .Fa list
 making calls to the filter routine for each knote.
 As each knote contains a reference to the data structure that it is
@@ -80,17 +83,19 @@ If the knote is already on the active li
 call to the filter occurs in order to provide an opportunity for the
 filter to record the activity.
 .Pp
+.Fn knote_locked
+is like
+.Fn knote
+but assumes that the
+.Fa list
+is already locked.
+.Pp
 .Fn knote
+and
+.Fn knote_locked
 must not be called from interrupt contexts running at an interrupt
 priority level higher than
 .Fn splsched .
-.Pp
-.Fn KNOTE
-is a macro that calls
-.Fn knote list hint
-if
-.Fa list
-is not empty.
 .\" .Sh ERRORS
 .Sh SEE ALSO
 .Xr kqueue 2
@@ -98,8 +103,6 @@ is not empty.
 .Sh HISTORY
 The
 .Fn knote
-and
-.Fn KNOTE
 functions first appeared in
 .Fx 4.1 ,
 and then in
Index: sys/arch/arm64/dev/apm.c
===
RCS file: src/sys/arch/arm64/dev/apm.c,v
retrieving revision 1.21
diff -u -p -r1.21 apm.c
--- sys/arch/arm64/dev/apm.c22 Jan 2023 13:14:21 -  1.21
+++ sys/arch/arm64/dev/apm.c2 Feb 2023 04:32:53 -
@@ -345,7 +345,7 @@ apm_record_event(u_int event)
return 1;
 
apm_evindex++;
-   KNOTE(>sc_note, APM_EVENT_COMPOSE(event, apm_evindex));
+   knote_locked(>sc_note, APM_EVENT_COMPOSE(event, apm_evindex));
return 0;
 }
 
Index: sys/arch/i386/i386/apm.c
===
RCS file: src/sys/arch/i386/i386/apm.c,v
retrieving revision 1.129
diff -u -p -r1.129 apm.c
--- sys/arch/i386/i386/apm.c30 Jan 2023 10:49:04 -  1.129
+++ sys/arch/i386/i386/apm.c2 Feb 2023 04:32:54 -
@@ -311,7 +311,7 @@ apm_record_event(struct apm_softc *sc, u
}
 
apm_evindex++;
-   KNOTE(>sc_note, APM_EVENT_COMPOSE(type, apm_evindex));
+   knote_locked(>sc_note, APM_EVENT_COMPOSE(type, apm_evindex));
return (0);
 }
 
Index: sys/arch/loongson/dev/apm.c
===
RCS file: src/sys/arch/loongson/dev/apm.c,v
retrieving revision 1.41
diff -u -p -r1.41 apm.c
--- sys/arch/loongson/dev/apm.c 19 Nov 2022 16:23:48 -  1.41
+++ sys/arch/loongson/dev/apm.c 2 Feb 2023 04:32:54 -
@@ -363,7 +363,7 @@ apm_record_event(u_int event, const char
return (1);
 
apm_evindex++;
-   KNOTE(>sc_note, APM_EVENT_COMPOSE(event, apm_evindex));
+   knote_locked(>sc_note, APM_EVENT_COMPOSE(event, apm_evindex));
 
return (0);
 }
Index: sys/dev/audio.c
===
RCS file: src/sys/dev/audio.c,v
retrieving revision 1.205
diff -u -p -r1.205 audio.c
--- sys/dev/audio.c 8 Nov 2022 17:53:01 -   1.205
+++ sys/dev/audio.c 2 Feb 2023 04:32:54 -
@@ -285,7 +285,7 @@ audio_mixer_wakeup(struct audio_softc *s
wakeup(>mix_blocking);
sc->mix_blocking = 0;
}
-   KNOTE(>mix_klist, 0);
+   knote_locked(>mix_klist, 0);
 }
 
 void
@@ -297,7 +297,7 @@ audio_buf_wakeup(struct audio_buf *buf)
wakeup(>blocking);
buf->blocking = 0;
}
-   KNOTE(>klist, 0);
+   knote_locked(>klist, 0);
 }
 
 int
Index: sys/dev/acpi/acpi.c
===
RCS file: src/sys/dev/acpi/acpi.c,v
retrieving revision 1.418
diff -u -p -r1.418 acpi.c
--- sys/dev/acpi/acpi.c 13 Sep 2022 17:14:54 -  1.418
+++ sys/dev/acpi/acpi.c 2 Feb 2023 04:32:54 -

Re: Move duplicating initialization to soalloc()

2023-02-01 Thread Visa Hankala
On Tue, Jan 31, 2023 at 09:50:26PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jan 31, 2023 at 06:00:45PM +0000, Visa Hankala wrote:
> > On Tue, Jan 31, 2023 at 12:44:47PM +0300, Vitaliy Makkoveev wrote:
> > > Since we have soalloc() to do common socket initialization, move the
> > > rest within. I mostly need to do this because standalone socket's buffer
> > > locking require to introduce another klistops data for buffers and there
> > > is no reason to add more copypaste to sonewconn().
> > > 
> > > Also this makes `socket_klistops' private to kern/uipc_socket.c
> > > 
> > > @@ -226,9 +225,6 @@ sonewconn(struct socket *head, int conns
> > >   so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
> > >   so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
> > >  
> > > - klist_init(>so_rcv.sb_klist, _klistops, so);
> > > - klist_init(>so_snd.sb_klist, _klistops, so);
> > > - sigio_init(>so_sigio);
> > >   sigio_copy(>so_sigio, >so_sigio);
> > 
> > With this change, something should call klist_free() and sigio_free()
> > for 'so' if soreserve() fails in sonewconn().
> > 
> 
> klist_init() and sigio_init() alloc nothing, but for consistency reason
> they shold.
> 
> I like to do this in the combined error path for soneconn() and
> pru_attach().
> 
> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.299
> diff -u -p -r1.299 uipc_socket.c
> --- sys/kern/uipc_socket.c27 Jan 2023 21:01:59 -  1.299
> +++ sys/kern/uipc_socket.c31 Jan 2023 18:44:57 -
> @@ -112,6 +112,16 @@ const struct filterops soexcept_filtops 
>   .f_process  = filt_soprocess,
>  };
>  
> +void klist_soassertlk(void *);
> +int  klist_solock(void *);
> +void klist_sounlock(void *, int);
> +
> +const struct klistops socket_klistops = {
> + .klo_assertlk   = klist_soassertlk,
> + .klo_lock   = klist_solock,
> + .klo_unlock = klist_sounlock,
> +};
> +
>  #ifndef SOMINCONN
>  #define SOMINCONN 80
>  #endif /* SOMINCONN */
> @@ -148,6 +158,11 @@ soalloc(int wait)
>   return (NULL);
>   rw_init_flags(>so_lock, "solock", RWL_DUPOK);
>   refcnt_init(>so_refcnt);
> + klist_init(>so_rcv.sb_klist, _klistops, so);
> + klist_init(>so_snd.sb_klist, _klistops, so);
> + sigio_init(>so_sigio);
> + TAILQ_INIT(>so_q0);
> + TAILQ_INIT(>so_q);
>  
>   return (so);
>  }
> @@ -176,11 +191,6 @@ socreate(int dom, struct socket **aso, i
>   if (prp->pr_type != type)
>   return (EPROTOTYPE);
>   so = soalloc(M_WAIT);
> - klist_init(>so_rcv.sb_klist, _klistops, so);
> - klist_init(>so_snd.sb_klist, _klistops, so);
> - sigio_init(>so_sigio);
> - TAILQ_INIT(>so_q0);
> - TAILQ_INIT(>so_q);
>   so->so_type = type;
>   if (suser(p) == 0)
>   so->so_state = SS_PRIV;
> @@ -2333,12 +2343,6 @@ klist_sounlock(void *arg, int ls)
>  
>   sounlock(so);
>  }
> -
> -const struct klistops socket_klistops = {
> - .klo_assertlk   = klist_soassertlk,
> - .klo_lock   = klist_solock,
> - .klo_unlock = klist_sounlock,
> -};
>  
>  #ifdef DDB
>  void
> Index: sys/kern/uipc_socket2.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.134
> diff -u -p -r1.134 uipc_socket2.c
> --- sys/kern/uipc_socket2.c   27 Jan 2023 18:46:34 -  1.134
> +++ sys/kern/uipc_socket2.c   31 Jan 2023 18:44:57 -
> @@ -41,7 +41,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  
>  /*
> @@ -213,12 +212,8 @@ sonewconn(struct socket *head, int conns
>   /*
>* Inherit watermarks but those may get clamped in low mem situations.
>*/
> - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
> - if (persocket)
> - sounlock(so);
> - pool_put(_pool, so);
> - return (NULL);
> - }
> + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
> + goto error;

sonewconn() has a variable called 'error'. Maybe use some other label.
'fail' and 'bad' are quite frequent in the kernel.

OK visa@

>   so->so_snd.sb_wat = head->so_snd.sb_wat;
>   so->so_snd.sb_lowat = head->so_snd.sb_lowat;
>   so->so_snd.sb_timeo_nsecs = head->s
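
For illustration, the combined error path discussed above might end up
looking roughly like this (hypothetical sketch, using a label other
than 'error' as suggested):

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;
	/* ... */
fail:
	if (persocket)
		sounlock(so);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	sigio_free(&so->so_sigio);
	pool_put(&socket_pool, so);
	return (NULL);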

Re: Replace selwakeup() with KNOTE() in tun(4) and tap(4)

2023-01-31 Thread Visa Hankala
On Mon, Jan 30, 2023 at 08:34:29PM +0300, Vitaliy Makkoveev wrote:
> > On 30 Jan 2023, at 06:39, Visa Hankala  wrote:
> > 
> > Replace selwakeup() with KNOTE() in tun(4) and tap(4).
> > 
> > This patch makes the tun(4) and tap(4) event filters MP-safe.
> > 
> > This is similar to the change that just got committed to pppac(4)
> > and pppx(4). However, tun(4) and tap(4) can be destroyed abruptly,
> > so klist_invalidate() has to be kept in tun_clone_destroy().
> > 
> > The selwakeup() call in tun_dev_close() can be removed. If the device
> > is closed peacefully, the klists get cleared automatically and waiters
> > notified before the close routine is invoked. On abrupt detach,
> > klist_invalidate() in tun_clone_destroy() should clear any lingering
> > knotes.
> > 
> > OK?
> > 
> 
> Does it make sense to introduce something like KNOTE_UNLOCKED()
> to push lock acquisition within?

I have not been keen to add a new variant of KNOTE() because the common
pattern is, or at least has been, that the klist lock is already held
when KNOTE() is called.

The idea is that the klist is protected by the lock that also covers the
related object, if possible. Examples of this are audio_lock, pipe_lock,
and solock.

klist_insert() and klist_remove() did not touch locks initially.
The locking was added when it turned out that there would be repetition
in very many places otherwise.

If a new flavor of KNOTE() is really wanted, I would rather cook up
a patch that renames KNOTE() to KNOTE_LOCKED() and adds KNOTE() that
acquires the klist lock internally. This way the naming would remain
consistent with the rest of the klist functions.



Re: Move duplicating initialization to soalloc()

2023-01-31 Thread Visa Hankala
On Tue, Jan 31, 2023 at 12:44:47PM +0300, Vitaliy Makkoveev wrote:
> Since we have soalloc() to do common socket initialization, move the
> rest within. I mostly need to do this because standalone socket's buffer
> locking require to introduce another klistops data for buffers and there
> is no reason to add more copypaste to sonewconn().
> 
> Also this makes `socket_klistops' private to kern/uipc_socket.c
> 
> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.299
> diff -u -p -r1.299 uipc_socket.c
> --- sys/kern/uipc_socket.c27 Jan 2023 21:01:59 -  1.299
> +++ sys/kern/uipc_socket.c31 Jan 2023 09:16:04 -
> @@ -112,6 +112,16 @@ const struct filterops soexcept_filtops 
>   .f_process  = filt_soprocess,
>  };
>  
> +void klist_soassertlk(void *);
> +int  klist_solock(void *);
> +void klist_sounlock(void *, int);
> +
> +const struct klistops socket_klistops = {
> + .klo_assertlk   = klist_soassertlk,
> + .klo_lock   = klist_solock,
> + .klo_unlock = klist_sounlock,
> +};
> +
>  #ifndef SOMINCONN
>  #define SOMINCONN 80
>  #endif /* SOMINCONN */
> @@ -148,6 +158,11 @@ soalloc(int wait)
>   return (NULL);
>   rw_init_flags(>so_lock, "solock", RWL_DUPOK);
>   refcnt_init(>so_refcnt);
> + klist_init(>so_rcv.sb_klist, _klistops, so);
> + klist_init(>so_snd.sb_klist, _klistops, so);
> + sigio_init(>so_sigio);
> + TAILQ_INIT(>so_q0);
> + TAILQ_INIT(>so_q);
>  
>   return (so);
>  }
> @@ -176,11 +191,6 @@ socreate(int dom, struct socket **aso, i
>   if (prp->pr_type != type)
>   return (EPROTOTYPE);
>   so = soalloc(M_WAIT);
> - klist_init(>so_rcv.sb_klist, _klistops, so);
> - klist_init(>so_snd.sb_klist, _klistops, so);
> - sigio_init(>so_sigio);
> - TAILQ_INIT(>so_q0);
> - TAILQ_INIT(>so_q);
>   so->so_type = type;
>   if (suser(p) == 0)
>   so->so_state = SS_PRIV;
> @@ -2333,12 +2343,6 @@ klist_sounlock(void *arg, int ls)
>  
>   sounlock(so);
>  }
> -
> -const struct klistops socket_klistops = {
> - .klo_assertlk   = klist_soassertlk,
> - .klo_lock   = klist_solock,
> - .klo_unlock = klist_sounlock,
> -};
>  
>  #ifdef DDB
>  void
> Index: sys/kern/uipc_socket2.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.134
> diff -u -p -r1.134 uipc_socket2.c
> --- sys/kern/uipc_socket2.c   27 Jan 2023 18:46:34 -  1.134
> +++ sys/kern/uipc_socket2.c   31 Jan 2023 09:16:04 -
> @@ -41,7 +41,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  #include 
>  
>  /*
> @@ -226,9 +225,6 @@ sonewconn(struct socket *head, int conns
>   so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
>   so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
>  
> - klist_init(>so_rcv.sb_klist, _klistops, so);
> - klist_init(>so_snd.sb_klist, _klistops, so);
> - sigio_init(>so_sigio);
>   sigio_copy(>so_sigio, >so_sigio);

With this change, something should call klist_free() and sigio_free()
for 'so' if soreserve() fails in sonewconn().

>  
>   soqinsque(head, so, 0);
> Index: sys/sys/event.h
> ===
> RCS file: /cvs/src/sys/sys/event.h,v
> retrieving revision 1.67
> diff -u -p -r1.67 event.h
> --- sys/sys/event.h   31 Mar 2022 01:41:22 -  1.67
> +++ sys/sys/event.h   31 Jan 2023 09:16:04 -
> @@ -286,7 +286,6 @@ struct timespec;
>  
>  extern const struct filterops sig_filtops;
>  extern const struct filterops dead_filtops;
> -extern const struct klistops socket_klistops;
>  
>  extern void  kqpoll_init(unsigned int);
>  extern void  kqpoll_done(unsigned int);
> 



Replace selwakeup() with KNOTE() in tun(4) and tap(4)

2023-01-29 Thread Visa Hankala
Replace selwakeup() with KNOTE() in tun(4) and tap(4).

This patch makes the tun(4) and tap(4) event filters MP-safe.

This is similar to the change that just got committed to pppac(4)
and pppx(4). However, tun(4) and tap(4) can be destroyed abruptly,
so klist_invalidate() has to be kept in tun_clone_destroy().

The selwakeup() call in tun_dev_close() can be removed. If the device
is closed peacefully, the klists get cleared automatically and waiters
notified before the close routine is invoked. On abrupt detach,
klist_invalidate() in tun_clone_destroy() should clear any lingering
knotes.

OK?

Index: net/if_tun.c
===
RCS file: src/sys/net/if_tun.c,v
retrieving revision 1.237
diff -u -p -r1.237 if_tun.c
--- net/if_tun.c2 Jul 2022 08:50:42 -   1.237
+++ net/if_tun.c30 Jan 2023 03:32:36 -
@@ -47,13 +47,14 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include 
@@ -78,8 +79,9 @@
 struct tun_softc {
struct arpcom   sc_ac;  /* ethernet common data */
 #define sc_if  sc_ac.ac_if
-   struct selinfo  sc_rsel;/* read select */
-   struct selinfo  sc_wsel;/* write select (not used) */
+   struct mutexsc_mtx;
+   struct klistsc_rklist;  /* knotes for read */
+   struct klistsc_wklist;  /* knotes for write (unused) */
SMR_LIST_ENTRY(tun_softc)
sc_entry;   /* all tunnel interfaces */
int sc_unit;
@@ -125,22 +127,28 @@ int   tun_init(struct tun_softc *);
 void   tun_start(struct ifnet *);
 intfilt_tunread(struct knote *, long);
 intfilt_tunwrite(struct knote *, long);
+intfilt_tunmodify(struct kevent *, struct knote *);
+intfilt_tunprocess(struct knote *, struct kevent *);
 void   filt_tunrdetach(struct knote *);
 void   filt_tunwdetach(struct knote *);
 void   tun_link_state(struct ifnet *, int);
 
 const struct filterops tunread_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_tunrdetach,
.f_event= filt_tunread,
+   .f_modify   = filt_tunmodify,
+   .f_process  = filt_tunprocess,
 };
 
 const struct filterops tunwrite_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_tunwdetach,
.f_event= filt_tunwrite,
+   .f_modify   = filt_tunmodify,
+   .f_process  = filt_tunprocess,
 };
 
 SMR_LIST_HEAD(tun_list, tun_softc);
@@ -220,6 +228,9 @@ tun_create(struct if_clone *ifc, int uni
ifp = >sc_if;
snprintf(ifp->if_xname, sizeof(ifp->if_xname),
"%s%d", ifc->ifc_name, unit);
+   mtx_init(>sc_mtx, IPL_NET);
+   klist_init_mutex(>sc_rklist, >sc_mtx);
+   klist_init_mutex(>sc_wklist, >sc_mtx);
ifp->if_softc = sc;
 
/* this is enough state for tun_dev_open to work with */
@@ -275,6 +286,8 @@ tun_create(struct if_clone *ifc, int uni
return (0);
 
 exists:
+   klist_free(>sc_rklist);
+   klist_free(>sc_wklist);
free(sc, M_DEVBUF, sizeof(*sc));
return (EEXIST);
 }
@@ -284,7 +297,6 @@ tun_clone_destroy(struct ifnet *ifp)
 {
struct tun_softc*sc = ifp->if_softc;
dev_tdev;
-   int  s;
 
KERNEL_ASSERT_LOCKED();
 
@@ -314,10 +326,11 @@ tun_clone_destroy(struct ifnet *ifp)
/* wait for device entrypoints to finish */
refcnt_finalize(>sc_refs, "tundtor");
 
-   s = splhigh();
-   klist_invalidate(>sc_rsel.si_note);
-   klist_invalidate(>sc_wsel.si_note);
-   splx(s);
+   klist_invalidate(>sc_rklist);
+   klist_invalidate(>sc_wklist);
+
+   klist_free(>sc_rklist);
+   klist_free(>sc_wklist);
 
if (ISSET(sc->sc_flags, TUN_LAYER2))
ether_ifdetach(ifp);
@@ -488,7 +501,6 @@ tun_dev_close(dev_t dev, struct proc *p)
ifq_purge(>if_snd);
 
CLR(sc->sc_flags, TUN_ASYNC);
-   selwakeup(>sc_rsel);
sigio_free(>sc_sigio);
 
if (!ISSET(sc->sc_flags, TUN_DEAD)) {
@@ -654,7 +666,10 @@ tun_wakeup(struct tun_softc *sc)
if (sc->sc_reading)
wakeup(>sc_if.if_snd);
 
-   selwakeup(>sc_rsel);
+   mtx_enter(>sc_mtx);
+   KNOTE(>sc_rklist, 0);
+   mtx_leave(>sc_mtx);
+
if (sc->sc_flags & TUN_ASYNC)
pgsigio(>sc_sigio, SIGIO, 0);
 }
@@ -960,7 +975,6 @@ tun_dev_kqfilter(dev_t dev, struct knote
struct ifnet*ifp;
struct klist*klist;
int  error = 0;
-   

Re: Replace selinfo structure by klist in sockbuf

2023-01-27 Thread Visa Hankala
On Thu, Jan 26, 2023 at 09:51:14PM +0300, Vitaliy Makkoveev wrote:
> No reason to keep it, selinfo is just a wrapper around klist. netstat(1) and
> libkvm use the socket structure, but don't touch
> so_{snd,rcv}.sb_sel.si_note.

OK visa@

> Index: sys/kern/uipc_socket.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.297
> diff -u -p -r1.297 uipc_socket.c
> --- sys/kern/uipc_socket.c23 Jan 2023 18:34:24 -  1.297
> +++ sys/kern/uipc_socket.c26 Jan 2023 18:39:31 -
> @@ -176,8 +176,8 @@ socreate(int dom, struct socket **aso, i
>   if (prp->pr_type != type)
>   return (EPROTOTYPE);
>   so = soalloc(M_WAIT);
> - klist_init(>so_rcv.sb_sel.si_note, _klistops, so);
> - klist_init(>so_snd.sb_sel.si_note, _klistops, so);
> + klist_init(>so_rcv.sb_klist, _klistops, so);
> + klist_init(>so_snd.sb_klist, _klistops, so);
>   sigio_init(>so_sigio);
>   TAILQ_INIT(>so_q0);
>   TAILQ_INIT(>so_q);
> @@ -303,8 +303,8 @@ sofree(struct socket *so, int keep_lock)
>   }
>  
>   sigio_free(>so_sigio);
> - klist_free(>so_rcv.sb_sel.si_note);
> - klist_free(>so_snd.sb_sel.si_note);
> + klist_free(>so_rcv.sb_klist);
> + klist_free(>so_snd.sb_klist);
>  #ifdef SOCKET_SPLICE
>   if (so->so_sp) {
>   if (issplicedback(so)) {
> @@ -2095,7 +2095,7 @@ void
>  sohasoutofband(struct socket *so)
>  {
>   pgsigio(>so_sigio, SIGURG, 0);
> - KNOTE(>so_rcv.sb_sel.si_note, 0);
> + KNOTE(>so_rcv.sb_klist, 0);
>  }
>  
>  int
> @@ -2126,7 +2126,7 @@ soo_kqfilter(struct file *fp, struct kno
>   return (EINVAL);
>   }
>  
> - klist_insert_locked(>sb_sel.si_note, kn);
> + klist_insert_locked(>sb_klist, kn);
>   sounlock(so);
>  
>   return (0);
> @@ -2137,7 +2137,7 @@ filt_sordetach(struct knote *kn)
>  {
>   struct socket *so = kn->kn_fp->f_data;
>  
> - klist_remove(>so_rcv.sb_sel.si_note, kn);
> + klist_remove(>so_rcv.sb_klist, kn);
>  }
>  
>  int
> @@ -2178,7 +2178,7 @@ filt_sowdetach(struct knote *kn)
>  {
>   struct socket *so = kn->kn_fp->f_data;
>  
> - klist_remove(>so_snd.sb_sel.si_note, kn);
> + klist_remove(>so_snd.sb_klist, kn);
>  }
>  
>  int
> Index: sys/kern/uipc_socket2.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.133
> diff -u -p -r1.133 uipc_socket2.c
> --- sys/kern/uipc_socket2.c   22 Jan 2023 12:05:44 -  1.133
> +++ sys/kern/uipc_socket2.c   26 Jan 2023 18:39:31 -
> @@ -226,8 +226,8 @@ sonewconn(struct socket *head, int conns
>   so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
>   so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
>  
> - klist_init(>so_rcv.sb_sel.si_note, _klistops, so);
> - klist_init(>so_snd.sb_sel.si_note, _klistops, so);
> + klist_init(>so_rcv.sb_klist, _klistops, so);
> + klist_init(>so_snd.sb_klist, _klistops, so);
>   sigio_init(>so_sigio);
>   sigio_copy(>so_sigio, >so_sigio);
>  
> @@ -262,8 +262,8 @@ sonewconn(struct socket *head, int conns
>   if (persocket)
>   sounlock(so);
>   sigio_free(>so_sigio);
> - klist_free(>so_rcv.sb_sel.si_note);
> - klist_free(>so_snd.sb_sel.si_note);
> + klist_free(>so_rcv.sb_klist);
> + klist_free(>so_snd.sb_klist);
>   pool_put(_pool, so);
>   return (NULL);
>   }
> @@ -549,7 +549,7 @@ sowakeup(struct socket *so, struct sockb
>   }
>   if (sb->sb_flags & SB_ASYNC)
>   pgsigio(>so_sigio, SIGIO, 0);
> - KNOTE(>sb_sel.si_note, 0);
> + KNOTE(>sb_klist, 0);
>  }
>  
>  /*
> Index: sys/kern/uipc_syscalls.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
> retrieving revision 1.209
> diff -u -p -r1.209 uipc_syscalls.c
> --- sys/kern/uipc_syscalls.c  22 Jan 2023 12:05:44 -  1.209
> +++ sys/kern/uipc_syscalls.c  26 Jan 2023 18:39:31 -
> @@ -326,7 +326,7 @@ doaccept(struct proc *p, int sock, struc
>   : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
>  
>   /* connection has been removed from the listen queue */
> - KNOTE(>so_rcv.sb_sel.si_note, 0);
> + KNOTE(>so_rcv.sb_klist, 0);
>  
>   if (persocket)
>   sounlock(head);
> Index: sys/miscfs/fifofs/fifo_vnops.c
> ===
> RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
> retrieving revision 1.100
> diff -u -p -r1.100 fifo_vnops.c
> --- sys/miscfs/fifofs/fifo_vnops.c22 Jan 2023 12:05:44 -  1.100
> +++ sys/miscfs/fifofs/fifo_vnops.c26 Jan 2023 18:39:31 -
> @@ -504,7 +504,7 @@ fifo_kqfilter(void *v)
>  
>   ap->a_kn->kn_hook = so;
>  
> - klist_insert(>sb_sel.si_note, ap->a_kn);
> + 

Re: Replace selwakeup() with KNOTE(9) in pppx(4) and pppac(4) layers

2023-01-26 Thread Visa Hankala
On Thu, Jan 26, 2023 at 01:57:56AM +0300, Vitaliy Makkoveev wrote:
> On Wed, Jan 25, 2023 at 10:43:50PM +0300, Vitaliy Makkoveev wrote:
> > visa@, mpi@, I'm asking you to review, because you are involved in the
> > kevent(9) development.
> > 
> > Hrvoje, if you want to test this diff, you need to disable pipex(4) with
> > "net.pipex.enable=0".
> > 
> 
> I missed we already have klist_init_mutex(), so the redundant *_klistops
> removed from previous diff.

I have a similar patch lying around, shown below. The main differences:

* Use the same mutex for both the read and write sides of the klists.
  Having dedicated mutexes feels like overkill.

* Use klist_insert() and klist_remove() for shorter code.

* Share f_modify and f_process functions for read and write filters.

As pppx and pppac do not seem to have forced detaching, the calls to
klist_invalidate() are not needed.

Mutex assertions could be added in filt_pppx_write() and
filt_pppac_write() to make the locking assumption more explicit
(the code that calls these functions needs the lock). However, I have
omitted these assertions as they might not add much value.
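
For reference, the shared f_modify/f_process pair is the usual mutex-protected
wrapper around knote_modify() and knote_process(); on the pppx side it would
look roughly like this (a sketch, assuming kn_hook points at the pppx_dev as
in the diff below):

int
filt_pppx_modify(struct kevent *kev, struct knote *kn)
{
	struct pppx_dev *pxd = kn->kn_hook;
	int active;

	mtx_enter(&pxd->pxd_mtx);
	active = knote_modify(kev, kn);
	mtx_leave(&pxd->pxd_mtx);

	return (active);
}

int
filt_pppx_process(struct knote *kn, struct kevent *kev)
{
	struct pppx_dev *pxd = kn->kn_hook;
	int active;

	mtx_enter(&pxd->pxd_mtx);
	active = knote_process(kn, kev);
	mtx_leave(&pxd->pxd_mtx);

	return (active);
}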

diff --git a/sys/net/if_pppx.c b/sys/net/if_pppx.c
index a59c932d276..e4675973a06 100644
--- a/sys/net/if_pppx.c
+++ b/sys/net/if_pppx.c
@@ -56,7 +56,8 @@
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
 #include 
 
 #include 
@@ -118,6 +119,7 @@ struct pppx_if;
  *   I   immutable after creation
  *   K   kernel lock
  *   N   net lock
+ *   m   pxd_mtx
  */
 
 struct pppx_dev {
@@ -125,10 +127,9 @@ struct pppx_dev {
int pxd_unit;   /* [I] */
 
/* kq shizz */
-   struct selinfo  pxd_rsel;
-   struct mutexpxd_rsel_mtx;
-   struct selinfo  pxd_wsel;
-   struct mutexpxd_wsel_mtx;
+   struct mutexpxd_mtx;
+   struct klistpxd_rklist; /* [m] */
+   struct klistpxd_wklist; /* [m] */
 
/* queue of packets for userland to service - protected by splnet */
struct mbuf_queue   pxd_svcq;
@@ -195,22 +196,28 @@ void  pppxattach(int);
 
 void   filt_pppx_rdetach(struct knote *);
 intfilt_pppx_read(struct knote *, long);
+intfilt_pppx_modify(struct kevent *, struct knote *);
+intfilt_pppx_process(struct knote *, struct kevent *);
 
 const struct filterops pppx_rd_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_pppx_rdetach,
.f_event= filt_pppx_read,
+   .f_modify   = filt_pppx_modify,
+   .f_process  = filt_pppx_process,
 };
 
 void   filt_pppx_wdetach(struct knote *);
 intfilt_pppx_write(struct knote *, long);
 
 const struct filterops pppx_wr_filtops = {
-   .f_flags= FILTEROP_ISFD,
+   .f_flags= FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach   = NULL,
.f_detach   = filt_pppx_wdetach,
.f_event= filt_pppx_write,
+   .f_modify   = filt_pppx_modify,
+   .f_process  = filt_pppx_process,
 };
 
 struct pppx_dev *
@@ -257,8 +264,9 @@ pppxopen(dev_t dev, int flags, int mode, struct proc *p)
}
 
pxd->pxd_unit = minor(dev);
-   mtx_init(>pxd_rsel_mtx, IPL_NET);
-   mtx_init(>pxd_wsel_mtx, IPL_NET);
+   mtx_init(>pxd_mtx, IPL_NET);
+   klist_init_mutex(>pxd_rklist, >pxd_mtx);
+   klist_init_mutex(>pxd_wklist, >pxd_mtx);
LIST_INIT(>pxd_pxis);
 
mq_init(>pxd_svcq, 128, IPL_NET);
@@ -453,29 +461,24 @@ int
 pppxkqfilter(dev_t dev, struct knote *kn)
 {
struct pppx_dev *pxd = pppx_dev2pxd(dev);
-   struct mutex *mtx;
struct klist *klist;
 
switch (kn->kn_filter) {
case EVFILT_READ:
-   mtx = >pxd_rsel_mtx;
-   klist = >pxd_rsel.si_note;
+   klist = >pxd_rklist;
kn->kn_fop = _rd_filtops;
break;
case EVFILT_WRITE:
-   mtx = >pxd_wsel_mtx;
-   klist = >pxd_wsel.si_note;
+   klist = >pxd_wklist;
kn->kn_fop = _wr_filtops;
break;
default:
return (EINVAL);
}
 
-   kn->kn_hook = (caddr_t)pxd;
+   kn->kn_hook = pxd;
 
-   mtx_enter(mtx);
-   klist_insert_locked(klist, kn);
-   mtx_leave(mtx);
+   klist_insert(klist, kn);
 
return (0);
 }
@@ -483,18 +486,17 @@ pppxkqfilter(dev_t dev, struct knote *kn)
 void
 filt_pppx_rdetach(struct knote *kn)
 {
-   struct pppx_dev *pxd = (struct pppx_dev *)kn->kn_hook;
-   struct klist *klist = >pxd_rsel.si_note;
+   struct pppx_dev *pxd = kn->kn_hook;
 
-   mtx_enter(>pxd_rsel_mtx);
-   klist_remove_locked(klist, kn);
-   mtx_leave(>pxd_rsel_mtx);
+   

Fix evcount_percpu() after evcount_init_percpu() (plus bits for mips64)

2022-12-04 Thread Visa Hankala
Do not re-insert the event counter to evcount_list in evcount_percpu().
Otherwise the list becomes corrupt when evcount_percpu() is called
after evcount_init_percpu().

OK?

As an extra, use percpu counters with mips64 clock and ipi interrupts.

Index: kern/subr_evcount.c
===
RCS file: src/sys/kern/subr_evcount.c,v
retrieving revision 1.14
diff -u -p -r1.14 subr_evcount.c
--- kern/subr_evcount.c 10 Nov 2022 07:05:41 -  1.14
+++ kern/subr_evcount.c 4 Dec 2022 14:17:59 -
@@ -56,7 +56,6 @@ evcount_percpu(struct evcount *ec)
TAILQ_INSERT_TAIL(_percpu_init_list, ec, next);
} else {
ec->ec_percpu = counters_alloc(1);
-   TAILQ_INSERT_TAIL(_list, ec, next);
}
 }
 
Index: arch/mips64/mips64/clock.c
===
RCS file: src/sys/arch/mips64/mips64/clock.c,v
retrieving revision 1.48
diff -u -p -r1.48 clock.c
--- arch/mips64/mips64/clock.c  19 Nov 2022 16:23:48 -  1.48
+++ arch/mips64/mips64/clock.c  4 Dec 2022 14:17:58 -
@@ -37,7 +37,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -100,6 +99,7 @@ clockattach(struct device *parent, struc
 */
set_intr(INTPRI_CLOCK, CR_INT_5, cp0_int5);
evcount_attach(_clock_count, "clock", _clock_irq);
+   evcount_percpu(_clock_count);
 
/* try to avoid getting clock interrupts early */
cp0_set_compare(cp0_get_count() - 1);
@@ -121,7 +121,7 @@ cp0_int5(uint32_t mask, struct trapframe
struct cpu_info *ci = curcpu();
int s;
 
-   atomic_inc_long((unsigned long *)_clock_count.ec_count);
+   evcount_inc(_clock_count);
 
cp0_set_compare(cp0_get_count() - 1);   /* clear INT5 */
 
Index: arch/mips64/mips64/ipifuncs.c
===
RCS file: src/sys/arch/mips64/mips64/ipifuncs.c,v
retrieving revision 1.25
diff -u -p -r1.25 ipifuncs.c
--- arch/mips64/mips64/ipifuncs.c   10 Apr 2022 13:23:14 -  1.25
+++ arch/mips64/mips64/ipifuncs.c   4 Dec 2022 14:17:58 -
@@ -84,6 +84,7 @@ mips64_ipi_init(void)
if (!cpuid) {
mtx_init(_rv_mtx, IPL_HIGH);
evcount_attach(_count, "ipi", _irq);
+   evcount_percpu(_count);
}
 
hw_ipi_intr_clear(cpuid);
@@ -113,8 +114,7 @@ mips64_ipi_intr(void *arg)
for (bit = 0; bit < MIPS64_NIPIS; bit++) {
if (pending_ipis & (1UL << bit)) {
(*ipifuncs[bit])();
-   atomic_inc_long(
-   (unsigned long *)_count.ec_count);
+   evcount_inc(_count);
}
}
}



Re: Microsoft Surface: replace umstc(4) with ucc(4)

2022-11-19 Thread Visa Hankala
On Fri, Nov 18, 2022 at 11:03:06AM -0500, Dave Voutila wrote:
> That fixes booting and the Surface Keyboard is usable, but I'm getting
> spurious faults coming from retpoline out of filt_wseventdetach if I
> detach and reattach the Surface Keyboard multiple times while running
> Xorg.
> 
> It's not consistent, but I've been able to trigger it 3 times now. 2 via
> physical detach/reattach of the Surface Keyboard and another by just
> running `sysctl machdep.forceukdb=1` which apparently detaches/attaches
> behind the scenes.

It seems that the detaching can leave behind dangling wsevent knotes.
They should be ferreted out before the device disappears.

Does the following patch help?

klist_invalidate() can block, so I put it before ev->q is freed and
set NULL. This tries to avoid causing new races with wsevent_init().

The detach logic is tangled and particularly so with ws devices,
so I am uncertain if tweaking just wsevent_fini() is enough.

Index: dev/wscons/wsevent.c
===
RCS file: src/sys/dev/wscons/wsevent.c,v
retrieving revision 1.26
diff -u -p -r1.26 wsevent.c
--- dev/wscons/wsevent.c2 Jul 2022 08:50:42 -   1.26
+++ dev/wscons/wsevent.c19 Nov 2022 15:05:26 -
@@ -131,6 +131,9 @@ wsevent_fini(struct wseventvar *ev)
 #endif
return;
}
+
+   klist_invalidate(>sel.si_note);
+
free(ev->q, M_DEVBUF, WSEVENT_QSIZE * sizeof(struct wscons_event));
ev->q = NULL;
 



Re: mips64, loongson, octeon: switch to clockintr(9)

2022-11-16 Thread Visa Hankala
On Wed, Nov 16, 2022 at 07:00:23AM -0600, Scott Cheloha wrote:
> On Mon, Nov 14, 2022 at 05:19:17PM +0000, Visa Hankala wrote:
> > I think this clockintr_init() should be in cp0_startclock(). This would
> > let other clock drivers do their own adjusting of the hz variables
> > before clockintr initialization. With this fixed,
> 
> Is the attached the change you envisioned?

Yes, but please remove the #ifdef MULTIPROCESSOR line and its #endif
from cp0_startclock(). The #ifdef'ing is redundant because
CPU_IS_PRIMARY() is short-circuited to value 1 in non-MULTIPROCESSOR
builds and the else branch does not use any MULTIPROCESSOR-only items.

	if (CPU_IS_PRIMARY(ci)) {
		stathz = hz;
		profhz = stathz * 10;
		clockintr_init(CL_RNDSTAT);
	} else {
		s = splhigh();
		nanouptime(&ci->ci_schedstate.spc_runtime);
		splx(s);

		/* try to avoid getting clock interrupts early */
		cp0_set_compare(cp0_get_count() - 1);

		cp0_calibrate(ci);
	}

With the above tweak,

OK visa@



Re: mips64, loongson, octeon: switch to clockintr(9)

2022-11-14 Thread Visa Hankala
On Sun, Nov 06, 2022 at 07:48:09PM +, Scott Cheloha wrote:
> This patch switches loongson and octeon to clockintr(9).
> 
> It has survived several release builds and upgrades from the resulting
> bsd.rd images on my ER-4.  The ER-4 doesn't have enough RAM to crunch a
> parallel release build.  It chokes on some of the larger LLVM modules.
> 
> visa@ reports it survived a partial build on a loongson machine (he
> skipped LLVM).  I believe he is also testing this on a package
> building machine, too.
> 
> Testing on beefier octeon machines would help demonstrate this is
> stable.  My ER-4 only has USB2.0, which slows things down.

So far, this patch has worked fine on the mips64 package build machines.

> Notes:
> 
> - octeon and loongson machines now have a randomized statclock().
> 
> - This patch merely disables the loongson glxclk.  If the device has
>   no other use we can fully remove the driver in a separate patch.

Let's keep the driver for now. The disabling is fine for the time being.

> @@ -324,6 +324,10 @@ cpu_initclocks(void)
>   tc_init(_timecounter);
>   }
>  
> + stathz = hz;
> + profhz = stathz * 10;
> + clockintr_init(CL_RNDSTAT);

I think this clockintr_init() should be in cp0_startclock(). This would
let other clock drivers do their own adjusting of the hz variables
before clockintr initialization. With this fixed,

OK visa@



Re: replace SRP with SMR in the if_idxmap commit

2022-11-10 Thread Visa Hankala
On Thu, Nov 10, 2022 at 11:59:02PM +1000, David Gwynne wrote:
> On Thu, Nov 10, 2022 at 09:04:22PM +1000, David Gwynne wrote:
> > On Thu, Nov 10, 2022 at 08:10:35AM +1000, David Gwynne wrote:
> > > I know what this is. The barrier at the end of if_idxmap_alloc is 
> > > sleeping waiting for cpus to run that aren't running cos we haven't 
> > > finished booting yet.
> > > 
> > > I'll back it out and fix it up when I'm actually awake.
> > 
> > i woke up, so here's a diff.
> > 
> > this uses the usedidx as an smr_entry so we can use smr_call instead of
> > smr_barrier during autoconf.
> > 
> > this works for me on a box with a lot of hardware interfaces, which
> > forces allocation of a new interface map and therefore destruction of
> > the initial one.
> > 
> > there is still an smr_barrier in if_idxmap_remove, but remove only
> > happens when an interface goes away. we could use part of struct ifnet
> > (eg, if_description) as an smr_entry if needed.
> > 
> 
> this one is even better.

Please add #include .

> + SMR_PTR_SET_LOCKED(_idxmap.map, if_map);
> +
> + smr_init(>smr);
> + dtor->map = oif_map;

I think smr_call could be moved here. The call is non-blocking.
Then the scope of the dtor variable could be reduced too.

	dtor->map = oif_map;
	smr_init(&dtor->smr);
	smr_call(&dtor->smr, if_idxmap_free, dtor);

OK visa@



Re: EVFILT_TIMER add support for different timer precisions NOTE_{,U,N,M}SECONDS

2022-09-10 Thread Visa Hankala
On Wed, Aug 31, 2022 at 04:48:37PM -0400, aisha wrote:
> I've added a patch which adds support for NOTE_{,U,M,N}SECONDS for
> EVFILT_TIMER in the kqueue interface.

It sort of makes sense to add an option to specify timeouts in
sub-millisecond precision. It feels like complete overengineering to add
multiple time units at the level of the kernel interface. However,
it looks that FreeBSD and NetBSD have already done this following
macOS' lead...

> I've also added the NOTE_ABSTIME but haven't done any actual implementation
> there as I am not sure how the `data` field should be interpreted (is it
> absolute time in seconds since epoch?).

I think FreeBSD and NetBSD take NOTE_ABSTIME as time since the epoch.

Below is a revised patch that takes into account some corner cases.
It tries to be API-compatible with FreeBSD and NetBSD. I have adjusted
the NOTE_{,M,U,N}SECONDS flags so that they are enum-like.

The manual page bits are from NetBSD.

It is quite late to introduce a feature like this within this release
cycle. Until now, the timer code has ignored the fflags field. There
might be pieces of software that are careless with struct kevent and
that would break as a result of this patch. Programs that are widely
used on different BSDs are probably fine already, though.
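
For what it is worth, registering a sub-millisecond timer from userland with
the proposed flags would look roughly like this (illustrative only):

	struct kevent ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* periodic timer that fires every 2500 microseconds */
	EV_SET(&ev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, NOTE_USECONDS,
	    2500, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");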

Index: lib/libc/sys/kqueue.2
===
RCS file: src/lib/libc/sys/kqueue.2,v
retrieving revision 1.46
diff -u -p -r1.46 kqueue.2
--- lib/libc/sys/kqueue.2   31 Mar 2022 17:27:16 -  1.46
+++ lib/libc/sys/kqueue.2   10 Sep 2022 13:01:36 -
@@ -457,17 +457,71 @@ Establishes an arbitrary timer identifie
 .Fa ident .
 When adding a timer,
 .Fa data
-specifies the timeout period in milliseconds.
-The timer will be periodic unless
+specifies the timeout period in units described below, or, if
+.Dv NOTE_ABSTIME
+is set in
+.Va fflags ,
+specifies the absolute time at which the timer should fire.
+The timer will repeat unless
 .Dv EV_ONESHOT
-is specified.
+is set in
+.Va flags
+or
+.Dv NOTE_ABSTIME
+is set in
+.Va fflags .
 On return,
 .Fa data
 contains the number of times the timeout has expired since the last call to
 .Fn kevent .
-This filter automatically sets the
+This filter automatically sets
 .Dv EV_CLEAR
-flag internally.
+in
+.Va flags
+for periodic timers.
+Timers created with
+.Dv NOTE_ABSTIME
+remain activated on the kqueue once the absolute time has passed unless
+.Dv EV_CLEAR
+or
+.Dv EV_ONESHOT
+are also specified.
+.Pp
+The filter accepts the following flags in the
+.Va fflags
+argument:
+.Bl -tag -width NOTE_MSECONDS
+.It Dv NOTE_SECONDS
+The timer value in
+.Va data
+is expressed in seconds.
+.It Dv NOTE_MSECONDS
+The timer value in
+.Va data
+is expressed in milliseconds.
+.It Dv NOTE_USECONDS
+The timer value in
+.Va data
+is expressed in microseconds.
+.It Dv NOTE_NSECONDS
+The timer value in
+.Va data
+is expressed in nanoseconds.
+.It Dv NOTE_ABSTIME
+The timer value is an absolute time with
+.Dv CLOCK_REALTIME
+as the reference clock.
+.El
+.Pp
+Note that
+.Dv NOTE_SECONDS ,
+.Dv NOTE_MSECONDS ,
+.Dv NOTE_USECONDS ,
+and
+.Dv NOTE_NSECONDS
+are mutually exclusive; behavior is undefined if more than one are specified.
+If a timer value unit is not specified, the default is
+.Dv NOTE_MSECONDS .
 .Pp
 If an existing timer is re-added, the existing timer and related pending events
 will be cancelled.
@@ -557,6 +611,7 @@ No memory was available to register the 
 The specified process to attach to does not exist.
 .El
 .Sh SEE ALSO
+.Xr clock_gettime 2 ,
 .Xr poll 2 ,
 .Xr read 2 ,
 .Xr select 2 ,
Index: regress/sys/kern/kqueue/kqueue-timer.c
===
RCS file: src/regress/sys/kern/kqueue/kqueue-timer.c,v
retrieving revision 1.4
diff -u -p -r1.4 kqueue-timer.c
--- regress/sys/kern/kqueue/kqueue-timer.c  12 Jun 2021 13:30:14 -  
1.4
+++ regress/sys/kern/kqueue/kqueue-timer.c  10 Sep 2022 13:01:37 -
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -31,9 +32,13 @@
 int
 do_timer(void)
 {
-   int kq, n;
+   static const int units[] = {
+   NOTE_SECONDS, NOTE_MSECONDS, NOTE_USECONDS, NOTE_NSECONDS
+   };
struct kevent ev;
-   struct timespec ts;
+   struct timespec ts, start, end, now;
+   int64_t usecs;
+   int i, kq, n;
 
ASS((kq = kqueue()) >= 0,
warn("kqueue"));
@@ -68,6 +73,125 @@ do_timer(void)
n = kevent(kq, NULL, 0, , 1, );
ASSX(n == 1);
 
+   /* Test with different time units */
+
+   for (i = 0; i < sizeof(units) / sizeof(units[0]); i++) {
+   memset(, 0, sizeof(ev));
+   ev.filter = EVFILT_TIMER;
+   ev.flags = EV_ADD | EV_ENABLE;
+   ev.fflags = units[i];
+   ev.data = 1;
+
+   n = kevent(kq, , 1, NULL, 0, NULL);
+   ASSX(n != -1);
+
+   

Re: mips64: trigger deferred timer interrupt from splx(9)

2022-08-18 Thread Visa Hankala
On Thu, Aug 18, 2022 at 02:33:55PM +, Miod Vallat wrote:
> > After about 92 hours, one machine showed cp0_raise_calls=622486 and
> > another 695892. cp0_raise_miss was zero on both of them. On two other
> > machines I had forgotten to allow ddb access from console and could
> > not check the values.
> 
> Put kern.allowkmem=1 in /etc/sysctl.conf, and then you can do fetch the
> values with pstat -d.

Not without rebooting first. And I don't want to run the machines
with kern.allowkmem=1.



Re: mips64: trigger deferred timer interrupt from splx(9)

2022-08-18 Thread Visa Hankala
On Wed, Aug 17, 2022 at 11:42:50AM -0500, Scott Cheloha wrote:
> On Wed, Aug 17, 2022 at 01:30:29PM +0000, Visa Hankala wrote:
> > On Tue, Aug 09, 2022 at 09:54:02AM -0500, Scott Cheloha wrote:
> > > On Tue, Aug 09, 2022 at 02:03:31PM +, Visa Hankala wrote:
> > > > On Mon, Aug 08, 2022 at 02:52:37AM -0500, Scott Cheloha wrote:
> > > > > One thing I'm still uncertain about is how glxclk fits into the
> > > > > loongson picture.  It's an interrupt clock that runs hardclock() and
> > > > > statclock(), but the code doesn't do any logical masking, so I don't
> > > > > know whether or not I need to adjust anything in that code or account
> > > > > for it at all.  If there's no logical masking there's no deferral, so
> > > > > it would never call need to call md_triggerclock() from splx(9).
> > > > 
> > > > I think the masking of glxclk interrupts are handled by the ISA
> > > > interrupt code.
> > > 
> > > Do those machines not have Coprocessor 0?  If they do, why would you
> > > prefer glxclk over CP0?
> > > 
> > > > The patch misses md_triggerclock definition in mips64_machdep.c.
> > > 
> > > Whoops, forgot that file.  Fuller patch below.
> > > 
> > > > I have put this to the test on the mips64 ports builder machines.
> > 
> > The machines completed a build with this patch without problems.
> > I tested with the debug counters removed from cp0_trigger_int5().
> > 
> > OK visa@
> 
> Thank you for testing!
> 
> There was a loongson portion to this patch.  Is this OK on loongson or
> just octeon?

OK for both.

> Also, what did the debug counters look like when you yanked them?  If
> cp0_raise_miss was non-zero I will double the initial offset to 32
> cycles.

After about 92 hours, one machine showed cp0_raise_calls=622486 and
another 695892. cp0_raise_miss was zero on both of them. On two other
machines I had forgotten to allow ddb access from console and could
not check the values.

The current offset is good enough.



Re: mips64: trigger deferred timer interrupt from splx(9)

2022-08-17 Thread Visa Hankala
On Tue, Aug 09, 2022 at 09:54:02AM -0500, Scott Cheloha wrote:
> On Tue, Aug 09, 2022 at 02:03:31PM +0000, Visa Hankala wrote:
> > On Mon, Aug 08, 2022 at 02:52:37AM -0500, Scott Cheloha wrote:
> > > One thing I'm still uncertain about is how glxclk fits into the
> > > loongson picture.  It's an interrupt clock that runs hardclock() and
> > > statclock(), but the code doesn't do any logical masking, so I don't
> > > know whether or not I need to adjust anything in that code or account
> > > for it at all.  If there's no logical masking there's no deferral, so
> > > it would never call need to call md_triggerclock() from splx(9).
> > 
> > I think the masking of glxclk interrupts are handled by the ISA
> > interrupt code.
> 
> Do those machines not have Coprocessor 0?  If they do, why would you
> prefer glxclk over CP0?
> 
> > The patch misses md_triggerclock definition in mips64_machdep.c.
> 
> Whoops, forgot that file.  Fuller patch below.
> 
> > I have put this to the test on the mips64 ports builder machines.

The machines completed a build with this patch without problems.
I tested with the debug counters removed from cp0_trigger_int5().

OK visa@

> Index: mips64/mips64/clock.c
> ===
> RCS file: /cvs/src/sys/arch/mips64/mips64/clock.c,v
> retrieving revision 1.45
> diff -u -p -r1.45 clock.c
> --- mips64/mips64/clock.c 6 Apr 2022 18:59:26 -   1.45
> +++ mips64/mips64/clock.c 9 Aug 2022 14:48:47 -
> @@ -60,6 +60,7 @@ const struct cfattach clock_ca = {
>  };
>  
>  void cp0_startclock(struct cpu_info *);
> +void cp0_trigger_int5(void);
>  uint32_t cp0_int5(uint32_t, struct trapframe *);
>  
>  int
> @@ -86,19 +87,20 @@ clockattach(struct device *parent, struc
>   cp0_set_compare(cp0_get_count() - 1);
>  
>   md_startclock = cp0_startclock;
> + md_triggerclock = cp0_trigger_int5;
>  }
>  
>  /*
>   *  Interrupt handler for targets using the internal count register
>   *  as interval clock. Normally the system is run with the clock
>   *  interrupt always enabled. Masking is done here and if the clock
> - *  can not be run the tick is just counted and handled later when
> - *  the clock is logically unmasked again.
> + *  cannot be run the tick is handled later when the clock is logically
> + *  unmasked again.
>   */
>  uint32_t
>  cp0_int5(uint32_t mask, struct trapframe *tf)
>  {
> - u_int32_t clkdiff;
> + u_int32_t clkdiff, pendingticks = 0;
>   struct cpu_info *ci = curcpu();
>  
>   /*
> @@ -113,15 +115,26 @@ cp0_int5(uint32_t mask, struct trapframe
>   }
>  
>   /*
> +  * If the clock interrupt is masked, defer any work until it
> +  * is unmasked from splx(9).
> +  */
> + if (tf->ipl >= IPL_CLOCK) {
> + ci->ci_clock_deferred = 1;
> + cp0_set_compare(cp0_get_count() - 1);
> + return CR_INT_5;
> + }
> + ci->ci_clock_deferred = 0;
> +
> + /*
>* Count how many ticks have passed since the last clock interrupt...
>*/
>   clkdiff = cp0_get_count() - ci->ci_cpu_counter_last;
>   while (clkdiff >= ci->ci_cpu_counter_interval) {
>   ci->ci_cpu_counter_last += ci->ci_cpu_counter_interval;
>   clkdiff = cp0_get_count() - ci->ci_cpu_counter_last;
> - ci->ci_pendingticks++;
> + pendingticks++;
>   }
> - ci->ci_pendingticks++;
> + pendingticks++;
>   ci->ci_cpu_counter_last += ci->ci_cpu_counter_interval;
>  
>   /*
> @@ -132,32 +145,64 @@ cp0_int5(uint32_t mask, struct trapframe
>   clkdiff = cp0_get_count() - ci->ci_cpu_counter_last;
>   if ((int)clkdiff >= 0) {
>   ci->ci_cpu_counter_last += ci->ci_cpu_counter_interval;
> - ci->ci_pendingticks++;
> + pendingticks++;
>   cp0_set_compare(ci->ci_cpu_counter_last);
>   }
>  
>   /*
> -  * Process clock interrupt unless it is currently masked.
> +  * Process clock interrupt.
>*/
> - if (tf->ipl < IPL_CLOCK) {
>  #ifdef MULTIPROCESSOR
> - register_t sr;
> + register_t sr;
>  
> - sr = getsr();
> - ENABLEIPI();
> + sr = getsr();
> + ENABLEIPI();
>  #endif
> - while (ci->ci_pendingticks) {
> - atomic_inc_long(
> - (unsigned long *)_clock_count.ec_count);
> - hardclock(tf);
> - ci->ci_pendingticks--;
> - 

Remove kqueue-related trace points from poll(2) and select(2)

2022-08-14 Thread Visa Hankala
kqueue-based poll(2) and select(2) seem to work fairly well. Because of
this, the ktrace points that display the internal translated events no
longer appear very valuable. They clog up traces and make them
difficult to read. I think it is time to remove the trace points.

OK?

Index: sys/kern/sys_generic.c
===
RCS file: src/sys/kern/sys_generic.c,v
retrieving revision 1.148
diff -u -p -r1.148 sys_generic.c
--- sys/kern/sys_generic.c  5 Jul 2022 15:06:16 -   1.148
+++ sys/kern/sys_generic.c  14 Aug 2022 10:46:50 -
@@ -691,10 +691,7 @@ dopselect(struct proc *p, int nd, fd_set
/* Maximum number of events per iteration */
count = MIN(nitems(kev), nevents);
ready = kqueue_scan(, count, kev, timeout, p, );
-#ifdef KTRACE
-   if (KTRPOINT(p, KTR_STRUCT))
-   ktrevent(p, kev, ready);
-#endif
+
/* Convert back events that are ready. */
for (i = 0; i < ready && error == 0; i++)
error = pselcollect(p, [i], pobits, );
@@ -762,10 +759,6 @@ pselregister(struct proc *p, fd_set *pib
EV_SET(, fd, evf[msk],
EV_ADD|EV_ENABLE|__EV_SELECT,
evff[msk], 0, (void *)(p->p_kq_serial));
-#ifdef KTRACE
-   if (KTRPOINT(p, KTR_STRUCT))
-   ktrevent(p, , 1);
-#endif
error = kqueue_register(p->p_kq, , 0, p);
switch (error) {
case 0:
@@ -1001,10 +994,7 @@ doppoll(struct proc *p, struct pollfd *f
/* Maximum number of events per iteration */
count = MIN(nitems(kev), nevents);
ready = kqueue_scan(, count, kev, timeout, p, );
-#ifdef KTRACE
-   if (KTRPOINT(p, KTR_STRUCT))
-   ktrevent(p, kev, ready);
-#endif
+
/* Convert back events that are ready. */
for (i = 0; i < ready; i++)
ncollected += ppollcollect(p, [i], pl, nfds);
@@ -1057,10 +1047,6 @@ ppollregister_evts(struct proc *p, struc
 
KASSERT(pl->revents == 0);
 
-#ifdef KTRACE
-   if (KTRPOINT(p, KTR_STRUCT))
-   ktrevent(p, kevp, nkev);
-#endif
for (i = 0; i < nkev; i++, kevp++) {
 again:
error = kqueue_register(p->p_kq, kevp, pollid, p);



Remove unneeded kern.nselcoll sysctl

2022-08-14 Thread Visa Hankala
Remove unneeded kern.nselcoll sysctl.

The last use of this sysctl in base was removed over two weeks ago.
Debian Code Search does not show any uses of the KERN_NSELCOLL macro.

OK?

Index: lib/libc/sys/sysctl.2
===
RCS file: src/lib/libc/sys/sysctl.2,v
retrieving revision 1.48
diff -u -p -r1.48 sysctl.2
--- lib/libc/sys/sysctl.2   31 Mar 2022 17:27:16 -  1.48
+++ lib/libc/sys/sysctl.2   14 Aug 2022 10:46:48 -
@@ -460,7 +460,6 @@ information.
 .It Dv KERN_NGROUPS Ta "integer" Ta "no"
 .It Dv KERN_NOSUIDCOREDUMP Ta "integer" Ta "yes"
 .It Dv KERN_NPROCS Ta "integer" Ta "no"
-.It Dv KERN_NSELCOLL Ta "integer" Ta "no"
 .It Dv KERN_NTHREADS Ta "integer" Ta "no"
 .It Dv KERN_NUMVNODES Ta "integer" Ta "no"
 .It Dv KERN_OSRELEASE Ta "string" Ta "no"
@@ -716,10 +715,6 @@ Whether a process may dump core after ch
 .El
 .It Dv KERN_NPROCS Pq Va kern.nprocs
 The number of entries in the kernel process table.
-.It Dv KERN_NSELCOLL Pq Va kern.nselcoll
-Number of
-.Xr select 2
-collisions.
 .It Dv KERN_NTHREADS Pq Va kern.nthreads
 The number of entries in the kernel thread table.
 .It Dv KERN_NUMVNODES Pq Va kern.numvnodes
Index: sys/kern/kern_sysctl.c
===
RCS file: src/sys/kern/kern_sysctl.c,v
retrieving revision 1.404
diff -u -p -r1.404 kern_sysctl.c
--- sys/kern/kern_sysctl.c  26 Jul 2022 14:53:45 -  1.404
+++ sys/kern/kern_sysctl.c  14 Aug 2022 10:46:50 -
@@ -298,7 +298,6 @@ const struct sysctl_bounded_args kern_va
{KERN_NFILES, , SYSCTL_INT_READONLY},
{KERN_TTYCOUNT, _count, SYSCTL_INT_READONLY},
{KERN_ARGMAX, _max, SYSCTL_INT_READONLY},
-   {KERN_NSELCOLL, _zero, SYSCTL_INT_READONLY},
{KERN_POSIX1, _version, SYSCTL_INT_READONLY},
{KERN_NGROUPS, _max, SYSCTL_INT_READONLY},
{KERN_JOB_CONTROL, _one, SYSCTL_INT_READONLY},
Index: sys/sys/sysctl.h
===
RCS file: src/sys/sys/sysctl.h,v
retrieving revision 1.228
diff -u -p -r1.228 sysctl.h
--- sys/sys/sysctl.h13 May 2022 15:32:00 -  1.228
+++ sys/sys/sysctl.h14 Aug 2022 10:46:50 -
@@ -144,7 +144,7 @@ struct ctlname {
 #define KERN_CPTIME40  /* array: cp_time */
 #define KERN_NCHSTATS  41  /* struct: vfs cache statistics */
 #define KERN_FORKSTAT  42  /* struct: fork statistics */
-#define KERN_NSELCOLL  43  /* int: select(2) collisions */
+/* was KERN_NSELCOLL   43  */
 #define KERN_TTY   44  /* node: tty information */
 #defineKERN_CCPU   45  /* int: ccpu */
 #defineKERN_FSCALE 46  /* int: fscale */
@@ -237,7 +237,7 @@ struct ctlname {
{ "cp_time", CTLTYPE_STRUCT }, \
{ "nchstats", CTLTYPE_STRUCT }, \
{ "forkstat", CTLTYPE_STRUCT }, \
-   { "nselcoll", CTLTYPE_INT }, \
+   { "gap", 0 }, \
{ "tty", CTLTYPE_NODE }, \
{ "ccpu", CTLTYPE_INT }, \
{ "fscale", CTLTYPE_INT }, \



Re: Use SMR instead of SRP list in rtsock.c

2022-08-10 Thread Visa Hankala
On Wed, Aug 10, 2022 at 11:08:06AM +0200, Claudio Jeker wrote:
> On Fri, Jul 01, 2022 at 04:03:21PM +0000, Visa Hankala wrote:
> > On Fri, Jul 01, 2022 at 09:59:11AM +0200, Claudio Jeker wrote:
> > > On Thu, Jun 30, 2022 at 03:46:35PM +, Visa Hankala wrote:
> > > > On Thu, Jun 30, 2022 at 11:51:52AM +0200, Claudio Jeker wrote:
> > > > > After discussing this with mpi@ and jmatthew@ we came to the 
> > > > > conclusion
> > > > > that we need to smr_barrier() before refcnt_finalize() to ensure that 
> > > > > no
> > > > > other CPU is between the SMR_TAILQ_FOREACH, refcnt_take() and
> > > > > smr_read_leave().
> > > > 
> > > > [...]
> > > > 
> > > > > @@ -509,7 +487,8 @@ route_input(struct mbuf *m0, struct sock
> > > > >   return;
> > > > >   }
> > > > >  
> > > > > - SRPL_FOREACH(rop, , _list, rop_list) {
> > > > > + smr_read_enter();
> > > > > + SMR_TAILQ_FOREACH(rop, _list, rop_list) {
> > > > >   /*
> > > > >* If route socket is bound to an address family only 
> > > > > send
> > > > >* messages that match the address family. Address 
> > > > > family
> > > > > @@ -519,7 +498,8 @@ route_input(struct mbuf *m0, struct sock
> > > > >   rop->rop_proto != sa_family)
> > > > >   continue;
> > > > >  
> > > > > -
> > > > > + refcnt_take(>rop_refcnt);
> > > > > + smr_read_leave();
> > > > >   so = rop->rop_socket;
> > > > >   solock(so);
> > > > >  
> > > > > @@ -579,8 +559,10 @@ route_input(struct mbuf *m0, struct sock
> > > > >   rtm_sendup(so, m);
> > > > >  next:
> > > > >   sounlock(so);
> > > > > + smr_read_enter();
> > > > > + refcnt_rele_wake(>rop_refcnt);
> > > > 
> > > > This does not look correct.
> > > > 
> > > > smr_barrier() can proceed after smr_read_leave(), so refcnt_rele_wake()
> > > > might drop the final reference and this thread can no longer access
> > > > rop safely (SMR_TAILQ_NEXT() inside SMR_TAILQ_FOREACH()).
> > > > 
> > > > Also, SMR_TAILQ_NEXT() of rop becomes potentially dangling after
> > > > smr_read_leave(). After this thread leaves the read-side critical
> > > > section, another thread might free rop's successor.
> > > 
> > > So we need to either smr_barrier() before and after the refcnt_finalize()
> > > to make sure that the rop pointer remains stable in both cases or we alter
> > > the SMR_TAILQ_FOREACH() loop so that SMR_TAILQ_NEXT can be grabbed before
> > > refcnt_rele_wake().
> > > 
> > > While the double smr_barrier() is trivial it is not ideal and I think it
> > > is better to adjust the loop since SMR loops with sleep points is a
> > > somewhat common issue and so we should have a good clear way on how to
> > > solve it.
> > 
> > Adjusting SMR_TAILQ_FOREACH() will not help.
> > 
> > In general, a reader cannot resume a lockless iteration after it has
> > left the read-side critical section and crossed a sleep point. The
> > guarantee of consistent(-looking) forward linkage is gone. The reader
> > no longer knows if the value of SMR_TAILQ_NEXT() is valid. If the
> > reader wants to continue with the list, it has to re-enter the read-side
> > critical section and restart the iteration.
> 
> This is not a real SMR_TAILQ_FOREACH() use case so trying to use
> SMR_TAILQ_FOREACH() here is not right. The code wants to walk the list of
> route pcbs linked via rop_list. The code just needs to walk all active
> connections and does not care about races with sockets that are concurrently
> closed or opened. In the first case SS_CANTRCVMORE will be set and the
> socket is skipped and in the second case the socket is simply ignored
> because new sockets are inserted at the head of the list.
>  
> It is not a lockless iteration over the full list. It is not required to
> be either. The only thing that matters is that the forward linkage is
> consistent (not pointing to invalid objects).
> 
> There is no need to restart the iteration because element on the list can
> not be reinserted. They can only be removed and a removed element does not
> get any message anyway (either by not visiting the object or by skipping
> it in the loop).
> 
> The refcnt ensures that the currently used pcb is not freed before the
> next element is picked. As long as the refcnt is hold the object can't be
> removed.

Let's assume that another thread begins to detach rop while
route_input() sleeps. The reference prevents early freeing of rop.

Let's assume further that yet another thread detaches and frees the
successor of rop while the first thread is still sleeping. What will
SMR_LIST_NEXT(rop, rop_list) point to?
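
To spell out the problem, the loop in route_input() has roughly this shape
(a sketch; the list head name is illustrative):

	smr_read_enter();
	SMR_TAILQ_FOREACH(rop, &rop_list, rop_list) {
		refcnt_take(&rop->rop_refcnt);
		smr_read_leave();

		solock(so);			/* may sleep */
		/* ... deliver the routing message ... */
		sounlock(so);

		smr_read_enter();
		refcnt_rele_wake(&rop->rop_refcnt);
		/*
		 * The FOREACH macro now evaluates
		 * SMR_TAILQ_NEXT(rop, rop_list).  If the successor was
		 * detached and freed while this thread slept outside the
		 * read-side critical section, that pointer is dangling.
		 */
	}
	smr_read_leave();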



Re: mips64: trigger deferred timer interrupt from splx(9)

2022-08-09 Thread Visa Hankala
On Mon, Aug 08, 2022 at 02:52:37AM -0500, Scott Cheloha wrote:
> One thing I'm still uncertain about is how glxclk fits into the
> loongson picture.  It's an interrupt clock that runs hardclock() and
> statclock(), but the code doesn't do any logical masking, so I don't
> know whether or not I need to adjust anything in that code or account
> for it at all.  If there's no logical masking there's no deferral, so
> it would never call need to call md_triggerclock() from splx(9).

I think the masking of glxclk interrupts are handled by the ISA
interrupt code.

The patch misses md_triggerclock definition in mips64_machdep.c.

I have put this to the test on the mips64 ports builder machines.



Re: mips64: trigger deferred timer interrupt from splx(9)

2022-08-07 Thread Visa Hankala
On Sun, Jul 31, 2022 at 01:28:18PM -0500, Scott Cheloha wrote:
> Apparently mips64, i.e. octeon and loongson, has the same problem as
> powerpc/macppc and powerpc64.  The timer interrupt is normally only
> logically masked, not physically masked in the hardware, when we're
> running at or above IPL_CLOCK.  If we arrive at cp0_int5() when the
> clock interrupt is logically masked we postpone all work until the
> next tick.  This is a problem for my WIP clock interrupt work.

I think the use of logical masking has been a design choice, not
something dictated by the hardware. Physical masking should be possible,
but some extra care would be needed to implement it, as the mips64
interrupt code is a bit clunky.

> So, this patch is basically the same as what I did for macppc and what
> I have proposed for powerpc64.
> 
> - Add a new member, ci_timer_deferred, to mips64's cpu_info struct.
> 
>   While here, remove ci_pendingticks.  We don't need it anymore.
> 
> - If we get to cp0_int5() and our IPL is too high, set
>   cpu_info.ci_timer_deferred and return.
> 
> - If we get to cp0_int5() and our IPL is low enough, clear
>   cpu_info.ci_timer_deferred and do clock interrupt work.
> 
> - In splx(9), if the new IPL is low enough and cpu_info.ci_timer_deferred
>   is set, trigger the clock interrupt.
> 
> The only big difference is that mips64 uses an equality comparison
> when deciding whether to arm the timer interrupt, so it's really easy
> to "miss" CP0.count when you're setting CP0.compare.
> 
> To address this I've written a function, cp0_raise_int5(), that spins
> until it is sure the timer interrupt will go off.  The function needed
> a bit of new code for reading CP0.cause, which I've added to
> cp0access.S.  I am using an initial offset of 16 cycles based on
> experimentation with the machine I have access to, a 500Mhz CN50xx.
> Values lower than 16 require more than one loop to arm the timer.  If
> that value is insufficient for other machines we can try passing the
> initial offset as an argument to the function.

It should not be necessary to make the initial offset variable. The
offset is primarily a function of the length and content of the
instruction sequence. Some unpredictability comes from cache misses
and maybe branch prediction failures.

> I wasn't sure where to put the prototype for cp0_raise_int5() so I
> stuck it in mips64/cpu.h.  If there's a better place for it, just say
> so.

Currently, mips64 clock.c is formulated as a proper driver. I think
callers should not invoke its functions directly but use a hook instead.
The MI mips64 code starts the clock through the md_startclock function
pointer. Maybe there could be md_triggerclock.

To reduce risk of confusion, I would rename cp0_raise_int5 to
cp0_trigger_int5, as `raise' overlaps with the spl API. Also,
ci_clock_deferred instead of ci_timer_deferred would look more
consistent with the surrounding code.
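
A sketch of how the hook could look, mirroring md_startclock (assumed
declarations and call sites, not a finished diff):

/* mips64/include/cpu.h (or the clock header): */
extern void	(*md_triggerclock)(void);

/* mips64/mips64/mips64_machdep.c: */
void	(*md_triggerclock)(void);

/* mips64/mips64/clock.c, clockattach(): */
	md_startclock = cp0_startclock;
	md_triggerclock = cp0_trigger_int5;

/* splx(9), once the new IPL drops below IPL_CLOCK: */
	if (ci->ci_clock_deferred)
		md_triggerclock();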



Re: randomise arc4random() rekey interval

2022-07-30 Thread Visa Hankala
On Sat, Jul 30, 2022 at 06:40:21PM +1000, Damien Miller wrote:
> On Fri, 29 Jul 2022, Theo de Raadt wrote:
> 
> > The question is what _rs_random_u32() will do when it calls
> > _rs_stir_if_needed().
> >
> > There is one potential problem. lib/libcrypto/arc4random/*.h contains
> > portable wrappers for _rs_forkdetect(), which actually do things.
> > memset(rs, 0, sizeof(*rs)) will trash the rs state. Let's imagine a
> > "fork" has happened same time that bytes run out.
> >
> > _rs_stir()
> > ...
> > rs->rs_count = REKEY_BASE;
> > _rs_random_u32 -> _rs_stir_if_needed -> _rs_forkdetect
> >   - all rs fields are zero'd with memset
> >   - _rs_forkdetect returns
> > 
> > back in _rs_stir_if_needed,
> >  - if (!rs || rs->rs_count <= len)
> >_rs_stir();
> > 
> >
> > So it will recurse once (only once, because a 2nd fork cannot happen).
> > But this is fragile.
> >
> > Alternatives are to get the value direct from getentropy -- with
> > KEYSZ + IVSZ + 4 maybe? Or fetch a value for this random bias early
> > and track it in rs, but don't damage it in the memset? Or split
> > _rs_random_u32() so that a sub-function of it may collect these 4
> > keystream bytes without the _rs_stir_if_needed/_rs_rekey checks?
> 
> I don't see how a fork could trash these - do you mean one that
> happened in a thread or a signal handler? AFAIK arc4random() isn't
> safe in these contexts right now, even without fork().
> 
> Anyway, this version invokes the chacha context directly so there's
> not possibility of _rs_stir() reentrance. It is still not safe against
> something clobbering rsx concurrently (but neither is the existing
> code).
> 
> Index: crypt/arc4random.c
> ===
> RCS file: /cvs/src/lib/libc/crypt/arc4random.c,v
> retrieving revision 1.56
> diff -u -p -r1.56 arc4random.c
> --- crypt/arc4random.c28 Feb 2022 21:56:29 -  1.56
> +++ crypt/arc4random.c30 Jul 2022 08:38:44 -
> @@ -49,6 +49,8 @@
>  #define BLOCKSZ  64
>  #define RSBUFSZ  (16*BLOCKSZ)
>  
> +#define REKEY_BASE   (1024*1024) /* NB. should be a power of 2 */
> +
>  /* Marked MAP_INHERIT_ZERO, so zero'd out in fork children. */
>  static struct _rs {
>   size_t  rs_have;/* valid bytes at end of rs_buf */
> @@ -86,6 +88,7 @@ static void
>  _rs_stir(void)
>  {
>   u_char rnd[KEYSZ + IVSZ];
> + uint32_t rekey_fuzz = 0;
>  
>   if (getentropy(rnd, sizeof rnd) == -1)
>   _getentropy_fail();
> @@ -100,7 +103,10 @@ _rs_stir(void)
>   rs->rs_have = 0;
>   memset(rsx->rs_buf, 0, sizeof(rsx->rs_buf));
>  
> - rs->rs_count = 160;
> + /* rekey interval should not be predictable */
> + chacha_encrypt_bytes(>rs_chacha, (uint8_t *)_fuzz,
> +  (uint8_t *)_fuzz, sizeof(rekey_fuzz));
> + rs->rs_count += REKEY_BASE + (rekey_fuzz % REKEY_BASE);

Replace += with =. With that fixed, OK visa@



Re: randomise arc4random() rekey interval

2022-07-29 Thread Visa Hankala
On Fri, Jul 29, 2022 at 06:56:08AM -0600, Theo de Raadt wrote:
> Visa Hankala  wrote:
> 
> > On Thu, Jul 28, 2022 at 11:00:12AM +1000, Damien Miller wrote:
> > > + rs->rs_count = REKEY_BASE;
> > > + /* rekey interval should not be predictable */
> > > + _rs_random_u32(_fuzz);
> > > + rs->rs_count += rekey_fuzz % REKEY_BASE;
> > 
> > The randomization looks good.
> > 
> > However, might it cause a problem (in the future) that _rs_random_u32()
> > calls _rs_stir_if_needed()? rs_count has a largish value so a recursive
> > invocation of _rs_stir() should not happen, but anyway.
> 
> The question is what _rs_random_u32() will do when it calls 
> _rs_stir_if_needed().
> 
> There is one potential problem.  lib/libcrypto/arc4random/*.h contains 
> portable
> wrappers for _rs_forkdetect(), which actually do things.  memset(rs, 0, 
> sizeof(*rs))
> will trash the rs state. Let's imagine a "fork" has happened same time that 
> bytes
> run out.
> 
> _rs_stir()
> ...
> rs->rs_count = REKEY_BASE;
> _rs_random_u32 -> _rs_stir_if_needed -> _rs_forkdetect
>   - all rs fields are zero'd with memset
>   - _rs_forkdetect returns
> 
> back in _rs_stir_if_needed,
>- if (!rs || rs->rs_count <= len)
>_rs_stir();
> 
> 
> So it will recurse once (only once, because a 2nd fork cannot happen).
> But this is fragile.
> 
> Alternatives are to get the value direct from getentropy -- with KEYSZ + IVSZ 
> + 4
> maybe?  Or fetch a value for this random bias early and track it in rs, but 
> don't
> damage it in the memset?  Or split _rs_random_u32() so that a sub-function of 
> it
> may collect these 4 keystream bytes without the _rs_stir_if_needed/_rs_rekey 
> checks?

_rs_stir() clears rs_buf, so a rekey is needed if the fuzz value is
taken from the keystream.

Another option is to move the _rs_stir_if_needed() calls from
_rs_random_u32() and _rs_random_buf() to arc4random() and
arc4random_buf(). The latter two are the subsystem's entry points.

Taking the fuzz value directly from getentropy() would be a clear
approach that does not add odd hoops, though some might consider it an
uneconomic use of system entropy. ;)
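
A sketch of that getentropy() variant, extending the current _rs_stir()
(sizes as suggested above; illustrative only):

	u_char rnd[KEYSZ + IVSZ + sizeof(uint32_t)];
	uint32_t rekey_fuzz;

	if (getentropy(rnd, sizeof rnd) == -1)
		_getentropy_fail();

	/* key/IV setup and rs_buf invalidation as in the current code */

	memcpy(&rekey_fuzz, rnd + KEYSZ + IVSZ, sizeof(rekey_fuzz));
	rs->rs_count = REKEY_BASE + (rekey_fuzz % REKEY_BASE);
	explicit_bzero(rnd, sizeof(rnd));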



Re: randomise arc4random() rekey interval

2022-07-29 Thread Visa Hankala
On Thu, Jul 28, 2022 at 11:00:12AM +1000, Damien Miller wrote:
> + rs->rs_count = REKEY_BASE;
> + /* rekey interval should not be predictable */
> + _rs_random_u32(_fuzz);
> + rs->rs_count += rekey_fuzz % REKEY_BASE;

The randomization looks good.

However, might it cause a problem (in the future) that _rs_random_u32()
calls _rs_stir_if_needed()? rs_count has a largish value so a recursive
invocation of _rs_stir() should not happen, but anyway.



More splbio() with struct vnode

2022-07-26 Thread Visa Hankala
This diff puts more fields of struct vnode under splbio(). splbio()
looks necessary for the fields that are modified through the buffer
cache because the access can happen in interrupt context.

Wrapping LIST_EMPTY() with splbio() is probably overzealous.
However, the added splbio() calls might serve as hints for locking.

OK?

Index: kern/vfs_bio.c
===
RCS file: src/sys/kern/vfs_bio.c,v
retrieving revision 1.208
diff -u -p -r1.208 vfs_bio.c
--- kern/vfs_bio.c  12 Dec 2021 09:14:59 -  1.208
+++ kern/vfs_bio.c  26 Jul 2022 15:36:42 -
@@ -1554,7 +1554,10 @@ bufcache_getcleanbuf(int cachenum, int d
 
 
 void
-discard_buffer(struct buf *bp) {
+discard_buffer(struct buf *bp)
+{
+   splassert(IPL_BIO);
+
bufcache_take(bp);
if (bp->b_vp) {
RBT_REMOVE(buf_rb_bufs,
Index: kern/vfs_subr.c
===
RCS file: src/sys/kern/vfs_subr.c,v
retrieving revision 1.315
diff -u -p -r1.315 vfs_subr.c
--- kern/vfs_subr.c 27 Mar 2022 16:19:39 -  1.315
+++ kern/vfs_subr.c 26 Jul 2022 15:36:42 -
@@ -662,16 +662,16 @@ vget(struct vnode *vp, int flags)
}
mtx_leave(_mtx);
 
+   s = splbio();
onfreelist = vp->v_bioflag & VBIOONFREELIST;
if (vp->v_usecount == 0 && onfreelist) {
-   s = splbio();
if (vp->v_holdcnt > 0)
TAILQ_REMOVE(_hold_list, vp, v_freelist);
else
TAILQ_REMOVE(_free_list, vp, v_freelist);
vp->v_bioflag &= ~VBIOONFREELIST;
-   splx(s);
}
+   splx(s);
 
vp->v_usecount++;
if (flags & LK_TYPE_MASK) {
@@ -749,6 +749,7 @@ void
 vput(struct vnode *vp)
 {
struct proc *p = curproc;
+   int s;
 
 #ifdef DIAGNOSTIC
if (vp == NULL)
@@ -777,8 +778,10 @@ vput(struct vnode *vp)
 
VOP_INACTIVE(vp, p);
 
+   s = splbio();
if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
vputonfreelist(vp);
+   splx(s);
 }
 
 /*
@@ -790,6 +793,7 @@ int
 vrele(struct vnode *vp)
 {
struct proc *p = curproc;
+   int s;
 
 #ifdef DIAGNOSTIC
if (vp == NULL)
@@ -822,8 +826,10 @@ vrele(struct vnode *vp)
 
VOP_INACTIVE(vp, p);
 
+   s = splbio();
if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
vputonfreelist(vp);
+   splx(s);
return (1);
 }
 
@@ -831,6 +837,10 @@ vrele(struct vnode *vp)
 void
 vhold(struct vnode *vp)
 {
+   int s;
+
+   s = splbio();
+
/*
 * If it is on the freelist and the hold count is currently
 * zero, move it to the hold list.
@@ -841,12 +851,18 @@ vhold(struct vnode *vp)
TAILQ_INSERT_TAIL(_hold_list, vp, v_freelist);
}
vp->v_holdcnt++;
+
+   splx(s);
 }
 
 /* Lose interest in a vnode. */
 void
 vdrop(struct vnode *vp)
 {
+   int s;
+
+   s = splbio();
+
 #ifdef DIAGNOSTIC
if (vp->v_holdcnt == 0)
panic("vdrop: zero holdcnt");
@@ -863,6 +879,8 @@ vdrop(struct vnode *vp)
TAILQ_REMOVE(_hold_list, vp, v_freelist);
TAILQ_INSERT_TAIL(_free_list, vp, v_freelist);
}
+
+   splx(s);
 }
 
 /*
@@ -909,6 +927,7 @@ vflush_vnode(struct vnode *vp, void *arg
 {
struct vflush_args *va = arg;
struct proc *p = curproc;
+   int empty, s;
 
if (vp == va->skipvp) {
return (0);
@@ -958,8 +977,11 @@ vflush_vnode(struct vnode *vp, void *arg
 * XXX Might be nice to check per-fs "inode" flags, but
 * generally the filesystem is sync'd already, right?
 */
-   if ((va->flags & IGNORECLEAN) &&
-   LIST_EMPTY(>v_dirtyblkhd))
+   s = splbio();
+   empty = (va->flags & IGNORECLEAN) && LIST_EMPTY(>v_dirtyblkhd);
+   splx(s);
+
+   if (empty)
return (0);
 
 #ifdef DEBUG_SYSCTL
@@ -992,6 +1014,7 @@ void
 vclean(struct vnode *vp, int flags, struct proc *p)
 {
int active, do_wakeup = 0;
+   int s;
 
/*
 * Check to see if the vnode is in use.
@@ -1066,9 +1089,11 @@ vclean(struct vnode *vp, int flags, stru
if (active) {
vp->v_usecount--;
if (vp->v_usecount == 0) {
+   s = splbio();
if (vp->v_holdcnt > 0)
panic("vclean: not clean");
vputonfreelist(vp);
+   splx(s);
}
}
cache_purge(vp);
@@ -1125,6 +1150,7 @@ vgonel(struct vnode *vp, struct proc *p)
 {
struct vnode *vq;
struct vnode *vx;
+   int s;
 
KASSERT(vp->v_uvcount == 0);
 
@@ -1192,12 +1218,9 @@ vgonel(struct vnode *vp, struct proc *p)
 * Move onto the free list, unless we were called from
   

Remove nselcoll from vmstat(8) output

2022-07-25 Thread Visa Hankala
Remove the obsolete "select collisions" count from vmstat(8) output.

Also, remove the reference to the now-nonexistent kernel variable
"nselcoll", and terminate the name list with a NULL as required by
kvm_nlist(3).

OK?

Index: vmstat.c
===
RCS file: src/usr.bin/vmstat/vmstat.c,v
retrieving revision 1.153
diff -u -p -r1.153 vmstat.c
--- vmstat.c22 Feb 2022 17:35:01 -  1.153
+++ vmstat.c25 Jul 2022 15:22:34 -
@@ -72,13 +72,11 @@ struct nlist namelist[] = {
{ "_bucket" },
 #defineX_FORKSTAT  5   /* sysctl */
{ "_forkstat" },
-#define X_NSELCOLL 6   /* sysctl */
-   { "_nselcoll" },
-#define X_POOLHEAD 7   /* sysctl */
+#define X_POOLHEAD 6   /* sysctl */
{ "_pool_head" },
-#defineX_NAPTIME   8
+#defineX_NAPTIME   7
{ "_naptime" },
-   { "" },
+   { NULL },
 };
 
 /* Objects defined in dkstats.c */
@@ -485,7 +483,7 @@ void
 dosum(void)
 {
struct nchstats nchstats;
-   int mib[2], nselcoll;
+   int mib[2];
long long nchtotal;
size_t size;
 
@@ -571,19 +569,6 @@ dosum(void)
pct(nchstats.ncs_badhits, nchtotal),
pct(nchstats.ncs_falsehits, nchtotal),
pct(nchstats.ncs_long, nchtotal));
-
-   if (nlistf == NULL && memf == NULL) {
-   size = sizeof(nselcoll);
-   mib[0] = CTL_KERN;
-   mib[1] = KERN_NSELCOLL;
-   if (sysctl(mib, 2, , , NULL, 0) == -1) {
-   warn("could not read kern.nselcoll");
-   nselcoll = 0;
-   }
-   } else {
-   kread(X_NSELCOLL, , sizeof(nselcoll));
-   }
-   (void)printf("%11d select collisions\n", nselcoll);
 }
 
 void



Allocate if_index before queue init

2022-07-16 Thread Visa Hankala
The index of a network interface is assigned in if_idxmap_insert().
ifq_init() and ifiq_init() use if_index before it has its final value.
As a consequence, interfaces tend to use net_tq(0) as their first
softnet task queue even though the other softnet task queues could be
used as well. To fix this, allocate if_index before queue setup.

The patch tracks index allocations using a separate bitmap so that
if_get() does not need extra logic.

Because the memory allocation of the map array can sleep, replace the
kernel lock with an rwlock to serialize idxmap updates more robustly.

Also, correct the "too many interfaces" condition because the valid
index range is from 1 to USHRT_MAX.
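
To illustrate the symptom, a minimal sketch (not the actual ifq_init()
code; the exact expression there may differ): the softnet task queue is
derived from the interface index, so an index that is still zero at
queue-init time sends every interface to the first queue.

        /* sketch of the problem only */
        struct taskq *tq = net_tq(ifp->if_index);  /* if_index is still 0 here */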

OK?

Index: net/if.c
===
RCS file: src/sys/net/if.c,v
retrieving revision 1.659
diff -u -p -r1.659 if.c
--- net/if.c14 Jul 2022 11:03:15 -  1.659
+++ net/if.c16 Jul 2022 10:07:14 -
@@ -217,9 +217,12 @@ struct if_idxmap {
unsigned int serial;
unsigned int count;
struct srp   map;
+   struct rwlocklock;
+   unsigned char   *usedidx;   /* bitmap of indices in use */
 };
 
 void   if_idxmap_init(unsigned int);
+void   if_idxmap_alloc(struct ifnet *);
 void   if_idxmap_insert(struct ifnet *);
 void   if_idxmap_remove(struct ifnet *);
 
@@ -273,7 +276,9 @@ ifinit(void)
 static struct if_idxmap if_idxmap = {
0,
0,
-   SRP_INITIALIZER()
+   SRP_INITIALIZER(),
+   RWLOCK_INITIALIZER("idxmaplk"),
+   NULL,
 };
 
 struct srp_gc if_ifp_gc = SRP_GC_INITIALIZER(if_ifp_dtor, NULL);
@@ -298,12 +303,15 @@ if_idxmap_init(unsigned int limit)
for (i = 0; i < limit; i++)
srp_init([i]);
 
+   if_idxmap.usedidx = malloc(howmany(limit, NBBY),
+   M_IFADDR, M_WAITOK | M_ZERO);
+
/* this is called early so there's nothing to race with */
srp_update_locked(_map_gc, _idxmap.map, if_map);
 }
 
 void
-if_idxmap_insert(struct ifnet *ifp)
+if_idxmap_alloc(struct ifnet *ifp)
 {
struct if_map *if_map;
struct srp *map;
@@ -311,10 +319,9 @@ if_idxmap_insert(struct ifnet *ifp)
 
refcnt_init(>if_refcnt);
 
-   /* the kernel lock guarantees serialised modifications to if_idxmap */
-   KERNEL_ASSERT_LOCKED();
+   rw_enter_write(_idxmap.lock);
 
-   if (++if_idxmap.count > USHRT_MAX)
+   if (++if_idxmap.count >= USHRT_MAX)
panic("too many interfaces");
 
if_map = srp_get_locked(_idxmap.map);
@@ -327,6 +334,7 @@ if_idxmap_insert(struct ifnet *ifp)
struct srp *nmap;
unsigned int nlimit;
struct ifnet *nifp;
+   unsigned char *nusedidx;
 
nlimit = if_map->limit * 2;
nif_map = malloc(sizeof(*nif_map) + nlimit * sizeof(*nmap),
@@ -348,6 +356,14 @@ if_idxmap_insert(struct ifnet *ifp)
i++;
}
 
+   nusedidx = malloc(howmany(nlimit, NBBY),
+   M_IFADDR, M_WAITOK | M_ZERO);
+   memcpy(nusedidx, if_idxmap.usedidx,
+   howmany(if_map->limit, NBBY));
+   free(if_idxmap.usedidx, M_IFADDR,
+   howmany(if_map->limit, NBBY));
+   if_idxmap.usedidx = nusedidx;
+
srp_update_locked(_map_gc, _idxmap.map, nif_map);
if_map = nif_map;
map = nmap;
@@ -355,15 +371,39 @@ if_idxmap_insert(struct ifnet *ifp)
 
/* pick the next free index */
for (i = 0; i < USHRT_MAX; i++) {
-   if (index != 0 && srp_get_locked([index]) == NULL)
+   if (index != 0 && isclr(if_idxmap.usedidx, index))
break;
 
index = if_idxmap.serial++ & USHRT_MAX;
}
+   KASSERT(index != 0 && index < if_map->limit);
+   KASSERT(isclr(if_idxmap.usedidx, index));
 
-   /* commit */
+   setbit(if_idxmap.usedidx, index);
ifp->if_index = index;
+
+   rw_exit_write(_idxmap.lock);
+}
+
+void
+if_idxmap_insert(struct ifnet *ifp)
+{
+   struct if_map *if_map;
+   struct srp *map;
+   unsigned int index = ifp->if_index;
+
+   rw_enter_write(_idxmap.lock);
+
+   if_map = srp_get_locked(_idxmap.map);
+   map = (struct srp *)(if_map + 1);
+
+   KASSERT(index != 0 && index < if_map->limit);
+   KASSERT(isset(if_idxmap.usedidx, index));
+
+   /* commit */
srp_update_locked(_ifp_gc, [index], if_ref(ifp));
+
+   rw_exit_write(_idxmap.lock);
 }
 
 void
@@ -375,8 +415,7 @@ if_idxmap_remove(struct ifnet *ifp)
 
index = ifp->if_index;
 
-   /* the kernel lock guarantees serialised modifications to if_idxmap */
-   KERNEL_ASSERT_LOCKED();
+   rw_enter_write(_idxmap.lock);
 
if_map = srp_get_locked(_idxmap.map);
KASSERT(index < if_map->limit);
@@ -386,7 +425,12 @@ 

Re: unp_solock_peer() and READ_ONCE()

2022-07-13 Thread Visa Hankala
On Thu, Jul 14, 2022 at 04:39:33AM +0300, Vitaliy Makkoveev wrote:
> It looks like sometimes `unp_conn' is not reloaded in
> the "unp->unp_conn != unp2" check, and READ_ONCE() should prevent this.

Are you sure about the non-reloading of unp->unp_conn? I did a quick
look in the compiled output on amd64, mips64 and riscv64, and did not
spot any obvious change (there is no change on mips64 and riscv64).

The loads are surrounded by lock/reference operations that imply
memory clobbering. The compiler should not assume that unp->unp_conn
is unchanged.
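
Condensed illustration of the point above (a fragment of the code quoted
below, not a patch): the opaque calls act as compiler barriers, so the
comparison re-reads unp->unp_conn from memory.

        unp_ref(unp2);
        sounlock(so);
        solock(so2);
        solock(so);
        if (unp->unp_conn != unp2) {    /* value is re-read after the calls above */
                /* ... retry ... */
        }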

> unp_solock_peer(struct socket *so)
> {
> struct unpcb *unp, *unp2;
> struct socket *so2;
> 
> unp = so->so_pcb;
> 
> again:
> if ((unp2 = unp->unp_conn) == NULL)
> return NULL;
> 
> so2 = unp2->unp_socket;
> 
> if (so < so2)
> solock(so2);
> else if (so > so2){
> unp_ref(unp2);
> sounlock(so);
> solock(so2);
> solock(so);
> 
> /* Datagram socket could be reconnected due to re-lock. */
> if (unp->unp_conn != unp2) {
> sounlock(so2);
> unp_rele(unp2);
> goto again;
> }
> 
> unp_rele(unp2);
> }
> 
> return so2;
> }
> 
> Index: sys/kern/uipc_usrreq.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
> retrieving revision 1.167
> diff -u -p -r1.167 uipc_usrreq.c
> --- sys/kern/uipc_usrreq.c2 Jul 2022 11:49:23 -   1.167
> +++ sys/kern/uipc_usrreq.c14 Jul 2022 01:08:22 -
> @@ -154,7 +154,7 @@ unp_solock_peer(struct socket *so)
>   unp = so->so_pcb;
>  
>  again:
> - if ((unp2 = unp->unp_conn) == NULL)
> + if ((unp2 = READ_ONCE(unp->unp_conn)) == NULL)
>   return NULL;
>  
>   so2 = unp2->unp_socket;
> @@ -168,7 +168,7 @@ again:
>   solock(so);
>  
>   /* Datagram socket could be reconnected due to re-lock. */
> - if (unp->unp_conn != unp2) {
> + if (READ_ONCE(unp->unp_conn) != unp2) {
>   sounlock(so2);
>   unp_rele(unp2);
>   goto again;
> 



Replace selwakeup() with KNOTE() in audio(4)

2022-07-11 Thread Visa Hankala
Replace selwakeup() with KNOTE() in audio(4).

KNOTE() can be used up to IPL_SCHED. Since IPL_SCHED is higher than
IPL_AUDIO, deferring the wakeups through soft interrupts is no longer
necessary.

In audio_detach(), the selwakeup() calls should not need replacing.
Any remaining kevent/poll/select waiters are woken up by
klist_invalidate().

OK?

Index: dev/audio.c
===
RCS file: src/sys/dev/audio.c,v
retrieving revision 1.199
diff -u -p -r1.199 audio.c
--- dev/audio.c 2 Jul 2022 08:50:41 -   1.199
+++ dev/audio.c 11 Jul 2022 15:14:09 -
@@ -20,6 +20,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -75,8 +77,7 @@ struct audio_buf {
size_t used;/* bytes used in the FIFO */
size_t blksz;   /* DMA block size */
unsigned int nblks; /* number of blocks */
-   struct selinfo sel; /* to record & wakeup poll(2) */
-   void *softintr; /* context to call selwakeup() */
+   struct klist klist; /* list of knotes */
unsigned int pos;   /* bytes transferred */
unsigned int xrun;  /* bytes lost by xruns */
int blocking;   /* read/write blocking */
@@ -136,10 +137,9 @@ struct audio_softc {
int mix_nent;   /* size of mixer state */
int mix_isopen; /* mixer open for reading */
int mix_blocking;   /* read() blocking */
-   struct selinfo mix_sel; /* wakeup poll(2) */
+   struct klist mix_klist; /* list of knotes */
struct mixer_ev *mix_evbuf; /* per mixer-control event */
struct mixer_ev *mix_pending;   /* list of changed controls */
-   void *mix_softintr; /* context to call selwakeup() */
 #if NWSKBD > 0
struct wskbd_vol spkr, mic;
struct task wskbd_task;
@@ -153,6 +153,8 @@ int audio_activate(struct device *, int)
 int audio_detach(struct device *, int);
 void audio_pintr(void *);
 void audio_rintr(void *);
+void audio_buf_wakeup(struct audio_buf *);
+void audio_mixer_wakeup(struct audio_softc *);
 #if NWSKBD > 0
 void wskbd_mixer_init(struct audio_softc *);
 void wskbd_mixer_cb(void *);
@@ -275,53 +277,33 @@ audio_blksz_bytes(int mode,
 }
 
 void
-audio_mixer_wakeup(void *addr)
+audio_mixer_wakeup(struct audio_softc *sc)
 {
-   struct audio_softc *sc = addr;
+   MUTEX_ASSERT_LOCKED(_lock);
 
if (sc->mix_blocking) {
wakeup(>mix_blocking);
sc->mix_blocking = 0;
}
-   /*
-* As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
-* already held here to avoid lock ordering problems with `audio_lock'
-*/
-   KERNEL_ASSERT_LOCKED();
-   mtx_enter(_lock);
-   selwakeup(>mix_sel);
-   mtx_leave(_lock);
+   KNOTE(>mix_klist, 0);
 }
 
 void
-audio_buf_wakeup(void *addr)
+audio_buf_wakeup(struct audio_buf *buf)
 {
-   struct audio_buf *buf = addr;
+   MUTEX_ASSERT_LOCKED(_lock);
 
if (buf->blocking) {
wakeup(>blocking);
buf->blocking = 0;
}
-   /*
-* As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
-* already held here to avoid lock ordering problems with `audio_lock'
-*/
-   KERNEL_ASSERT_LOCKED();
-   mtx_enter(_lock);
-   selwakeup(>sel);
-   mtx_leave(_lock);
+   KNOTE(>klist, 0);
 }
 
 int
 audio_buf_init(struct audio_softc *sc, struct audio_buf *buf, int dir)
 {
-   klist_init_mutex(>sel.si_note, _lock);
-   buf->softintr = softintr_establish(IPL_SOFTAUDIO,
-   audio_buf_wakeup, buf);
-   if (buf->softintr == NULL) {
-   printf("%s: can't establish softintr\n", DEVNAME(sc));
-   goto bad;
-   }
+   klist_init_mutex(>klist, _lock);
if (sc->ops->round_buffersize) {
buf->datalen = sc->ops->round_buffersize(sc->arg,
dir, AUDIO_BUFSZ);
@@ -333,13 +315,10 @@ audio_buf_init(struct audio_softc *sc, s
} else
buf->data = malloc(buf->datalen, M_DEVBUF, M_WAITOK);
if (buf->data == NULL) {
-   softintr_disestablish(buf->softintr);
-   goto bad;
+   klist_free(>klist);
+   return ENOMEM;
}
return 0;
-bad:
-   klist_free(>sel.si_note);
-   return ENOMEM;
 }
 
 void
@@ -349,8 +328,7 @@ audio_buf_done(struct audio_softc *sc, s
sc->ops->freem(sc->arg, buf->data, M_DEVBUF);
else
free(buf->data, M_DEVBUF, buf->datalen);
-   softintr_disestablish(buf->softintr);
-   klist_free(>sel.si_note);
+   klist_free(>klist);
 }
 
 /*
@@ -563,13 +541,7 @@ audio_pintr(void *addr)
if (sc->play.used < sc->play.len) {
DPRINTFN(1, 

vldcp(4): Fix event filters

2022-07-09 Thread Visa Hankala
vldcp(4)'s read and write event filters have a discrepancy relative to
the old poll code. They should report the filter as inactive when the
condition (err == 0 && state == LDC_CHANNEL_UP && head != tail) is false.
As it stands, they can return a stale value, causing spurious wakeups of
kevent/poll/select. This can be fixed by clearing kn->kn_data.

Could someone test this with ldomd?

Index: arch/sparc64/dev/vldcp.c
===
RCS file: src/sys/arch/sparc64/dev/vldcp.c,v
retrieving revision 1.23
diff -u -p -r1.23 vldcp.c
--- arch/sparc64/dev/vldcp.c2 Jul 2022 08:50:41 -   1.23
+++ arch/sparc64/dev/vldcp.c9 Jul 2022 13:39:10 -
@@ -617,6 +617,7 @@ filt_vldcpread(struct knote *kn, long hi
} else {
cbus_intr_setenabled(sc->sc_bustag, sc->sc_rx_ino,
INTR_ENABLED);
+   kn->kn_data = 0;
}
splx(s);
 
@@ -641,6 +642,7 @@ filt_vldcwrite(struct knote *kn, long hi
} else {
cbus_intr_setenabled(sc->sc_bustag, sc->sc_tx_ino,
INTR_ENABLED);
+   kn->kn_data = 0;
}
splx(s);
 



vldcp(4): Add missing device_unref() calls

2022-07-09 Thread Visa Hankala
Add missing device_unref() calls to vldcpkqfilter().

The knote that vldcpkqfilter() sets up does not take a device reference.

OK?

Index: arch/sparc64/dev/vldcp.c
===
RCS file: src/sys/arch/sparc64/dev/vldcp.c,v
retrieving revision 1.23
diff -u -p -r1.23 vldcp.c
--- arch/sparc64/dev/vldcp.c2 Jul 2022 08:50:41 -   1.23
+++ arch/sparc64/dev/vldcp.c9 Jul 2022 13:39:10 -
@@ -683,6 +685,7 @@ vldcpkqfilter(dev_t dev, struct knote *k
break;
 
default:
+   device_unref(>sc_dv);
return (EINVAL);
}
 
@@ -692,5 +695,6 @@ vldcpkqfilter(dev_t dev, struct knote *k
klist_insert_locked(klist, kn);
splx(s);
 
+   device_unref(>sc_dv);
return (0);
 }



Replace struct selinfo with klist in bpf, kqueue and pipes

2022-07-08 Thread Visa Hankala
Replace struct selinfo with direct use of struct klist in bpf, kqueue
and pipes. These subsystems no longer utilize selwakeup().

OK?

Index: kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.191
diff -u -p -r1.191 kern_event.c
--- kern/kern_event.c   27 Jun 2022 13:35:21 -  1.191
+++ kern/kern_event.c   8 Jul 2022 15:32:26 -
@@ -38,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -221,7 +220,7 @@ KQRELE(struct kqueue *kq)
free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
sizeof(struct knlist));
hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
-   klist_free(>kq_sel.si_note);
+   klist_free(>kq_klist);
pool_put(_pool, kq);
 }
 
@@ -257,7 +256,7 @@ kqueue_kqfilter(struct file *fp, struct 
return (EINVAL);
 
kn->kn_fop = _filtops;
-   klist_insert(>kq_sel.si_note, kn);
+   klist_insert(>kq_klist, kn);
return (0);
 }
 
@@ -266,7 +265,7 @@ filt_kqdetach(struct knote *kn)
 {
struct kqueue *kq = kn->kn_fp->f_data;
 
-   klist_remove(>kq_sel.si_note, kn);
+   klist_remove(>kq_klist, kn);
 }
 
 int
@@ -849,7 +848,7 @@ kqueue_alloc(struct filedesc *fdp)
TAILQ_INIT(>kq_head);
mtx_init(>kq_lock, IPL_HIGH);
task_set(>kq_task, kqueue_task, kq);
-   klist_init_mutex(>kq_sel.si_note, _klist_lock);
+   klist_init_mutex(>kq_klist, _klist_lock);
 
return (kq);
 }
@@ -1580,7 +1579,7 @@ kqueue_terminate(struct proc *p, struct 
 * Any knotes that were attached to this kqueue were deleted
 * by knote_fdclose() when this kqueue's file descriptor was closed.
 */
-   KASSERT(klist_empty(>kq_sel.si_note));
+   KASSERT(klist_empty(>kq_klist));
if (state & KQ_TASK)
taskq_del_barrier(systqmp, >kq_task);
 }
@@ -1606,7 +1605,7 @@ kqueue_task(void *arg)
struct kqueue *kq = arg;
 
mtx_enter(_klist_lock);
-   KNOTE(>kq_sel.si_note, 0);
+   KNOTE(>kq_klist, 0);
mtx_leave(_klist_lock);
 }
 
@@ -1619,7 +1618,7 @@ kqueue_wakeup(struct kqueue *kq)
kq->kq_state &= ~KQ_SLEEP;
wakeup(kq);
}
-   if (!klist_empty(>kq_sel.si_note)) {
+   if (!klist_empty(>kq_klist)) {
/* Defer activation to avoid recursion. */
kq->kq_state |= KQ_TASK;
task_add(systqmp, >kq_task);
Index: kern/sys_pipe.c
===
RCS file: src/sys/kern/sys_pipe.c,v
retrieving revision 1.140
diff -u -p -r1.140 sys_pipe.c
--- kern/sys_pipe.c 20 Jun 2022 01:39:44 -  1.140
+++ kern/sys_pipe.c 8 Jul 2022 15:32:26 -
@@ -371,7 +371,7 @@ pipeselwakeup(struct pipe *cpipe)
 {
rw_assert_wrlock(cpipe->pipe_lock);
 
-   KNOTE(>pipe_sel.si_note, 0);
+   KNOTE(>pipe_klist, 0);
 
if (cpipe->pipe_state & PIPE_ASYNC)
pgsigio(>pipe_sigio, SIGIO, 0);
@@ -854,7 +854,7 @@ pipe_kqfilter(struct file *fp, struct kn
case EVFILT_READ:
kn->kn_fop = _rfiltops;
kn->kn_hook = rpipe;
-   klist_insert_locked(>pipe_sel.si_note, kn);
+   klist_insert_locked(>pipe_klist, kn);
break;
case EVFILT_WRITE:
if (wpipe == NULL) {
@@ -864,7 +864,7 @@ pipe_kqfilter(struct file *fp, struct kn
}
kn->kn_fop = _wfiltops;
kn->kn_hook = wpipe;
-   klist_insert_locked(>pipe_sel.si_note, kn);
+   klist_insert_locked(>pipe_klist, kn);
break;
case EVFILT_EXCEPT:
if (kn->kn_flags & __EV_SELECT) {
@@ -879,7 +879,7 @@ pipe_kqfilter(struct file *fp, struct kn
}
kn->kn_fop = _efiltops;
kn->kn_hook = rpipe;
-   klist_insert_locked(>pipe_sel.si_note, kn);
+   klist_insert_locked(>pipe_klist, kn);
break;
default:
error = EINVAL;
@@ -895,7 +895,7 @@ filt_pipedetach(struct knote *kn)
 {
struct pipe *cpipe = kn->kn_hook;
 
-   klist_remove(>pipe_sel.si_note, kn);
+   klist_remove(>pipe_klist, kn);
 }
 
 int
@@ -1011,8 +1011,8 @@ pipe_pair_create(void)
pp->pp_wpipe.pipe_lock = >pp_lock;
pp->pp_rpipe.pipe_lock = >pp_lock;
 
-   klist_init_rwlock(>pp_wpipe.pipe_sel.si_note, >pp_lock);
-   klist_init_rwlock(>pp_rpipe.pipe_sel.si_note, >pp_lock);
+   klist_init_rwlock(>pp_wpipe.pipe_klist, >pp_lock);
+   klist_init_rwlock(>pp_rpipe.pipe_klist, >pp_lock);
 
if (pipe_create(>pp_wpipe) || pipe_create(>pp_rpipe))
goto err;
@@ -1026,7 +1026,7 @@ err:
 void
 pipe_pair_destroy(struct pipe_pair *pp)
 {
-   klist_free(>pp_wpipe.pipe_sel.si_note);
-   

Remove leftovers of old poll/select

2022-07-02 Thread Visa Hankala
Remove the leftovers of the old poll/select mechanism.
This includes the fields si_seltid and si_flags in struct selinfo.
They should now always be zero because nothing calls selrecord().

selwakeup() becomes a wrapper for KNOTE(). I do not want to change
it further in this patch. This code can be subtle, as shown by the
socket selwakeup() experiment in May. ;)

The selwakeup() call in bpf_wakeup_cb() can be removed as the kqueue
event filter ignores it.

The patch makes the kern.nselcoll sysctl constant. However, is there any
actual need to keep this sysctl? vmstat(1) is the only user of
KERN_NSELCOLL in base. Debian code search does not reveal uses that
would affect ports.

Index: bin/ps/ps.1
===
RCS file: src/bin/ps/ps.1,v
retrieving revision 1.125
diff -u -p -r1.125 ps.1
--- bin/ps/ps.1 31 Mar 2022 17:27:14 -  1.125
+++ bin/ps/ps.1 2 Jul 2022 13:51:19 -
@@ -222,7 +222,6 @@ P_PROFPEND0x2 this thread needs 
 P_ALRMPEND0x4 this thread needs SIGVTALRM
 P_SIGSUSPEND  0x8 need to restore before-suspend mask
 P_CANTSLEEP  0x10 this thread is not permitted to sleep
-P_SELECT 0x40 selecting; wakeup/waiting danger
 P_SINTR  0x80 sleep is interruptible
 P_SYSTEM0x200 system process: no sigs, stats, or
   swapping
Index: sys/dev/wscons/wsdisplay.c
===
RCS file: src/sys/dev/wscons/wsdisplay.c,v
retrieving revision 1.146
diff -u -p -r1.146 wsdisplay.c
--- sys/dev/wscons/wsdisplay.c  2 Jul 2022 08:50:42 -   1.146
+++ sys/dev/wscons/wsdisplay.c  2 Jul 2022 13:51:45 -
@@ -1441,7 +1441,7 @@ wsdisplaystart(struct tty *tp)
splx(s);
return;
}
-   if (tp->t_outq.c_cc == 0 && tp->t_wsel.si_seltid == 0)
+   if (tp->t_outq.c_cc == 0)
goto low;
 
if ((scr = sc->sc_scr[WSDISPLAYSCREEN(tp->t_dev)]) == NULL) {
Index: sys/kern/kern_sysctl.c
===
RCS file: src/sys/kern/kern_sysctl.c,v
retrieving revision 1.402
diff -u -p -r1.402 kern_sysctl.c
--- sys/kern/kern_sysctl.c  21 Mar 2022 09:12:34 -  1.402
+++ sys/kern/kern_sysctl.c  2 Jul 2022 13:51:45 -
@@ -120,7 +120,7 @@
 
 extern struct forkstat forkstat;
 extern struct nchstats nchstats;
-extern int nselcoll, fscale;
+extern int fscale;
 extern fixpt_t ccpu;
 extern long numvnodes;
 extern int allowdt;
@@ -298,7 +298,7 @@ const struct sysctl_bounded_args kern_va
{KERN_NFILES, , SYSCTL_INT_READONLY},
{KERN_TTYCOUNT, _count, SYSCTL_INT_READONLY},
{KERN_ARGMAX, _max, SYSCTL_INT_READONLY},
-   {KERN_NSELCOLL, , SYSCTL_INT_READONLY},
+   {KERN_NSELCOLL, _zero, SYSCTL_INT_READONLY},
{KERN_POSIX1, _version, SYSCTL_INT_READONLY},
{KERN_NGROUPS, _max, SYSCTL_INT_READONLY},
{KERN_JOB_CONTROL, _one, SYSCTL_INT_READONLY},
Index: sys/kern/sys_generic.c
===
RCS file: src/sys/kern/sys_generic.c,v
retrieving revision 1.147
diff -u -p -r1.147 sys_generic.c
--- sys/kern/sys_generic.c  8 Feb 2022 08:56:41 -   1.147
+++ sys/kern/sys_generic.c  2 Jul 2022 13:51:45 -
@@ -89,7 +89,6 @@ int dopselect(struct proc *, int, fd_set
 struct timespec *, const sigset_t *, register_t *);
 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
 const sigset_t *, register_t *);
-void doselwakeup(struct selinfo *);
 
 int
 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
@@ -522,8 +521,6 @@ out:
return (error);
 }
 
-intselwait, nselcoll;
-
 /*
  * Select system call.
  */
@@ -840,41 +837,6 @@ pselcollect(struct proc *p, struct keven
return (0);
 }
 
-int
-seltrue(dev_t dev, int events, struct proc *p)
-{
-
-   return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
-}
-
-int
-selfalse(dev_t dev, int events, struct proc *p)
-{
-
-   return (0);
-}
-
-/*
- * Record a select request.
- */
-void
-selrecord(struct proc *selector, struct selinfo *sip)
-{
-   struct proc *p;
-   pid_t mytid;
-
-   KERNEL_ASSERT_LOCKED();
-
-   mytid = selector->p_tid;
-   if (sip->si_seltid == mytid)
-   return;
-   if (sip->si_seltid && (p = tfind(sip->si_seltid)) &&
-   p->p_wchan == (caddr_t))
-   sip->si_flags |= SI_COLL;
-   else
-   sip->si_seltid = mytid;
-}
-
 /*
  * Do a wakeup when a selectable event occurs.
  */
@@ -883,34 +845,9 @@ selwakeup(struct selinfo *sip)
 {
KERNEL_LOCK();
KNOTE(>si_note, NOTE_SUBMIT);
-   doselwakeup(sip);
KERNEL_UNLOCK();
 }
 
-void
-doselwakeup(struct selinfo *sip)
-{
-   struct proc *p;
-
-   KERNEL_ASSERT_LOCKED();
-
-   if (sip->si_seltid == 0)
-   return;
-   if (sip->si_flags & 

Re: Remove device poll functions

2022-07-01 Thread Visa Hankala
On Sat, Jul 02, 2022 at 03:23:30PM +1000, Jonathan Gray wrote:
> a few more could be removed?
> 
> arch/sh/dev/scif.c:scifpoll(dev_t dev, int events, struct proc *p)
> 
> kern/tty_tty.c:cttypoll(dev_t dev, int events, struct proc *p)
> sys/tty.h:int   cttypoll(dev_t, int, struct proc *);

Augmented the patch with these, thanks.

> kern/sys_generic.c:seltrue(dev_t dev, int events, struct proc *p)
> sys/systm.h:int seltrue(dev_t dev, int which, struct proc *);
> 
> kern/sys_generic.c:selfalse(dev_t dev, int events, struct proc *p)
> sys/systm.h:int selfalse(dev_t dev, int which, struct proc *);

These I have in a subsequent cleanup patch.



Re: Use SMR instead of SRP list in rtsock.c

2022-07-01 Thread Visa Hankala
On Fri, Jul 01, 2022 at 09:59:11AM +0200, Claudio Jeker wrote:
> On Thu, Jun 30, 2022 at 03:46:35PM +0000, Visa Hankala wrote:
> > On Thu, Jun 30, 2022 at 11:51:52AM +0200, Claudio Jeker wrote:
> > > After discussing this with mpi@ and jmatthew@ we came to the conclusion
> > > that we need to smr_barrier() before refcnt_finalize() to ensure that no
> > > other CPU is between the SMR_TAILQ_FOREACH, refcnt_take() and
> > > smr_read_leave().
> > 
> > [...]
> > 
> > > @@ -509,7 +487,8 @@ route_input(struct mbuf *m0, struct sock
> > >   return;
> > >   }
> > >  
> > > - SRPL_FOREACH(rop, , _list, rop_list) {
> > > + smr_read_enter();
> > > + SMR_TAILQ_FOREACH(rop, _list, rop_list) {
> > >   /*
> > >* If route socket is bound to an address family only send
> > >* messages that match the address family. Address family
> > > @@ -519,7 +498,8 @@ route_input(struct mbuf *m0, struct sock
> > >   rop->rop_proto != sa_family)
> > >   continue;
> > >  
> > > -
> > > + refcnt_take(>rop_refcnt);
> > > + smr_read_leave();
> > >   so = rop->rop_socket;
> > >   solock(so);
> > >  
> > > @@ -579,8 +559,10 @@ route_input(struct mbuf *m0, struct sock
> > >   rtm_sendup(so, m);
> > >  next:
> > >   sounlock(so);
> > > + smr_read_enter();
> > > + refcnt_rele_wake(>rop_refcnt);
> > 
> > This does not look correct.
> > 
> > smr_barrier() can proceed after smr_read_leave(), so refcnt_rele_wake()
> > might drop the final reference and this thread can no longer access
> > rop safely (SMR_TAILQ_NEXT() inside SMR_TAILQ_FOREACH()).
> > 
> > Also, SMR_TAILQ_NEXT() of rop becomes potentially dangling after
> > smr_read_leave(). After this thread leaves the read-side critical
> > section, another thread might free rop's successor.
> 
> So we need to either smr_barrier() before and after the refcnt_finalize()
> to make sure that the rop pointer remains stable in both cases or we alter
> the SMR_TAILQ_FOREACH() loop so that SMR_TAILQ_NEXT can be grabbed before
> refcnt_rele_wake().
> 
> While the double smr_barrier() is trivial it is not ideal and I think it
> is better to adjust the loop since SMR loops with sleep points is a
> somewhat common issue and so we should have a good clear way on how to
> solve it.

Adjusting SMR_TAILQ_FOREACH() will not help.

In general, a reader cannot resume a lockless iteration after it has
left the read-side critical section and crossed a sleep point. The
guarantee of consistent(-looking) forward linkage is gone. The reader
no longer knows if the value of SMR_TAILQ_NEXT() is valid. If the
reader wants to continue with the list, it has to re-enter the read-side
critical section and restart the iteration.
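
A minimal sketch of the restart pattern (the list head name and the
already-handled test are placeholders, not the final rtsock code):

        smr_read_enter();
 restart:
        SMR_TAILQ_FOREACH(rop, &rop_list_head, rop_list) {
                if (already_handled(rop))       /* placeholder test */
                        continue;
                refcnt_take(&rop->rop_refcnt);
                smr_read_leave();

                /* sleeping work with the reference held */

                refcnt_rele_wake(&rop->rop_refcnt); /* rop must not be used after this */
                smr_read_enter();
                goto restart;   /* forward linkage may be stale, start over */
        }
        smr_read_leave();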


I guess I should finish the sleepable variant of SMR that I was
tinkering with long ago...



Remove device poll functions

2022-07-01 Thread Visa Hankala
Remove unused device poll functions.

This also removes unneeded includes of  and 
from the kernel. Some includes of  are removed as well,
but a further cleanup related to that header will follow.

After this, most of the remnants of the old poll machinery
can be removed.

OK?

Index: arch/macppc/dev/adb.c
===
RCS file: src/sys/arch/macppc/dev/adb.c,v
retrieving revision 1.45
diff -u -p -r1.45 adb.c
--- arch/macppc/dev/adb.c   13 Mar 2022 12:33:01 -  1.45
+++ arch/macppc/dev/adb.c   1 Jul 2022 13:49:09 -
@@ -87,8 +87,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
Index: arch/macppc/dev/akbd_machdep.c
===
RCS file: src/sys/arch/macppc/dev/akbd_machdep.c,v
retrieving revision 1.2
diff -u -p -r1.2 akbd_machdep.c
--- arch/macppc/dev/akbd_machdep.c  3 Sep 2019 17:51:52 -   1.2
+++ arch/macppc/dev/akbd_machdep.c  1 Jul 2022 13:49:09 -
@@ -36,8 +36,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
Index: arch/sparc64/dev/vldcp.c
===
RCS file: src/sys/arch/sparc64/dev/vldcp.c,v
retrieving revision 1.22
diff -u -p -r1.22 vldcp.c
--- arch/sparc64/dev/vldcp.c24 Oct 2021 17:05:04 -  1.22
+++ arch/sparc64/dev/vldcp.c1 Jul 2022 13:49:09 -
@@ -19,7 +19,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -576,49 +575,6 @@ vldcpioctl(dev_t dev, u_long cmd, caddr_
 
device_unref(>sc_dv);
return (0);
-}
-
-int
-vldcppoll(dev_t dev, int events, struct proc *p)
-{
-   struct vldcp_softc *sc;
-   struct ldc_conn *lc;
-   uint64_t head, tail, state;
-   int revents = 0;
-   int s, err;
-
-   sc = vldcp_lookup(dev);
-   if (sc == NULL)
-   return (POLLERR);
-   lc = >sc_lc;
-
-   s = spltty();
-   if (events & (POLLIN | POLLRDNORM)) {
-   err = hv_ldc_rx_get_state(lc->lc_id, , , );
-
-   if (err == 0 && state == LDC_CHANNEL_UP && head != tail)
-   revents |= events & (POLLIN | POLLRDNORM);
-   }
-   if (events & (POLLOUT | POLLWRNORM)) {
-   err = hv_ldc_tx_get_state(lc->lc_id, , , );
-
-   if (err == 0 && state == LDC_CHANNEL_UP && head != tail)
-   revents |= events & (POLLOUT | POLLWRNORM);
-   }
-   if (revents == 0) {
-   if (events & (POLLIN | POLLRDNORM)) {
-   cbus_intr_setenabled(sc->sc_bustag, sc->sc_rx_ino,
-   INTR_ENABLED);
-   selrecord(p, >sc_rsel);
-   }
-   if (events & (POLLOUT | POLLWRNORM)) {
-   cbus_intr_setenabled(sc->sc_bustag, sc->sc_tx_ino,
-   INTR_ENABLED);
-   selrecord(p, >sc_wsel);
-   }
-   }
-   splx(s);
-   return revents;
 }
 
 void
Index: dev/audio.c
===
RCS file: src/sys/dev/audio.c,v
retrieving revision 1.198
diff -u -p -r1.198 audio.c
--- dev/audio.c 21 Mar 2022 19:22:39 -  1.198
+++ dev/audio.c 1 Jul 2022 13:49:09 -
@@ -19,7 +19,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -2051,22 +2050,6 @@ audio_mixer_read(struct audio_softc *sc,
 }
 
 int
-audio_mixer_poll(struct audio_softc *sc, int events, struct proc *p)
-{
-   int revents = 0;
-
-   mtx_enter(_lock);
-   if (sc->mix_isopen && sc->mix_pending)
-   revents |= events & (POLLIN | POLLRDNORM);
-   if (revents == 0) {
-   if (events & (POLLIN | POLLRDNORM))
-   selrecord(p, >mix_sel);
-   }
-   mtx_leave(_lock);
-   return revents;
-}
-
-int
 audio_mixer_open(struct audio_softc *sc, int flags)
 {
DPRINTF("%s: flags = 0x%x\n", __func__, flags);
@@ -2099,26 +2082,6 @@ audio_mixer_close(struct audio_softc *sc
 }
 
 int
-audio_poll(struct audio_softc *sc, int events, struct proc *p)
-{
-   int revents = 0;
-
-   mtx_enter(_lock);
-   if ((sc->mode & AUMODE_RECORD) && sc->rec.used > 0)
-   revents |= events & (POLLIN | POLLRDNORM);
-   if ((sc->mode & AUMODE_PLAY) && sc->play.used < sc->play.len)
-   revents |= events & (POLLOUT | POLLWRNORM);
-   if (revents == 0) {
-   if (events & (POLLIN | POLLRDNORM))
-   selrecord(p, >rec.sel);
-   if (events & (POLLOUT | POLLWRNORM))
-   selrecord(p, >play.sel);
-   }
-   mtx_leave(_lock);
-   return revents;
-}
-
-int
 audioopen(dev_t dev, int flags, int mode, struct proc *p)
 {
struct audio_softc *sc;
@@ -2248,30 +2211,6 @@ audioioctl(dev_t dev, u_long cmd, caddr_
}
  

Re: Use SMR instead of SRP list in rtsock.c

2022-06-30 Thread Visa Hankala
On Thu, Jun 30, 2022 at 11:51:52AM +0200, Claudio Jeker wrote:
> After discussing this with mpi@ and jmatthew@ we came to the conclusion
> that we need to smr_barrier() before refcnt_finalize() to ensure that no
> other CPU is between the SMR_TAILQ_FOREACH, refcnt_take() and
> smr_read_leave().

[...]

> @@ -509,7 +487,8 @@ route_input(struct mbuf *m0, struct sock
>   return;
>   }
>  
> - SRPL_FOREACH(rop, , _list, rop_list) {
> + smr_read_enter();
> + SMR_TAILQ_FOREACH(rop, _list, rop_list) {
>   /*
>* If route socket is bound to an address family only send
>* messages that match the address family. Address family
> @@ -519,7 +498,8 @@ route_input(struct mbuf *m0, struct sock
>   rop->rop_proto != sa_family)
>   continue;
>  
> -
> + refcnt_take(>rop_refcnt);
> + smr_read_leave();
>   so = rop->rop_socket;
>   solock(so);
>  
> @@ -579,8 +559,10 @@ route_input(struct mbuf *m0, struct sock
>   rtm_sendup(so, m);
>  next:
>   sounlock(so);
> + smr_read_enter();
> + refcnt_rele_wake(>rop_refcnt);

This does not look correct.

smr_barrier() can proceed after smr_read_leave(), so refcnt_rele_wake()
might drop the final reference and this thread can no longer access
rop safely (SMR_TAILQ_NEXT() inside SMR_TAILQ_FOREACH()).

Also, SMR_TAILQ_NEXT() of rop becomes potentially dangling after
smr_read_leave(). After this thread leaves the read-side critical
section, another thread might free rop's successor.



Remove d_poll from struct cdevsw

2022-06-27 Thread Visa Hankala
Remove the now-unused d_poll field from struct cdevsw.

This diff adjusts the various conf.{c,h} bits. To avoid making this
patch too unwieldy, I leave the removal of the device poll functions
for another patch.

(Compile-)tested on amd64, arm64, armv7, i386, loongson, macppc, octeon,
powerpc64, riscv64 and sparc64.

OK?

Index: arch/amd64/amd64/conf.c
===
RCS file: src/sys/arch/amd64/amd64/conf.c,v
retrieving revision 1.74
diff -u -p -r1.74 conf.c
--- arch/amd64/amd64/conf.c 11 Nov 2021 10:03:08 -  1.74
+++ arch/amd64/amd64/conf.c 27 Jun 2022 14:19:10 -
@@ -79,14 +79,14 @@ int nblkdev = nitems(bdevsw);
 #define cdev_ocis_init(c,n) { \
 dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
 (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-(dev_type_stop((*))) enodev, 0,  seltrue, \
+(dev_type_stop((*))) enodev, 0, \
 (dev_type_mmap((*))) enodev, 0, 0, seltrue_kqfilter }
 
 /* open, close, read */
 #define cdev_nvram_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
(dev_type_write((*))) enodev, (dev_type_ioctl((*))) enodev, \
-   (dev_type_stop((*))) enodev, 0, seltrue, \
+   (dev_type_stop((*))) enodev, 0, \
(dev_type_mmap((*))) enodev, 0, 0, seltrue_kqfilter }
 
 /* open, close, ioctl */
@@ -95,7 +95,7 @@ int   nblkdev = nitems(bdevsw);
(dev_type_read((*))) enodev, \
(dev_type_write((*))) enodev, \
 dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) enodev, 0, seltrue, \
+   (dev_type_stop((*))) enodev, 0, \
(dev_type_mmap((*))) enodev, 0, 0, seltrue_kqfilter }
 
 #definemmread  mmrw
Index: arch/amd64/include/conf.h
===
RCS file: src/sys/arch/amd64/include/conf.h,v
retrieving revision 1.8
diff -u -p -r1.8 conf.h
--- arch/amd64/include/conf.h   13 May 2020 08:32:43 -  1.8
+++ arch/amd64/include/conf.h   27 Jun 2022 14:19:10 -
@@ -46,7 +46,7 @@ cdev_decl(bios);
 #definecdev_acpi_init(c,n) {\
dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) enodev, 0, selfalse, \
+   (dev_type_stop((*))) enodev, 0, \
(dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) }
 cdev_decl(acpi);
 
Index: arch/arm/include/conf.h
===
RCS file: src/sys/arch/arm/include/conf.h,v
retrieving revision 1.11
diff -u -p -r1.11 conf.h
--- arch/arm/include/conf.h 21 May 2016 21:24:36 -  1.11
+++ arch/arm/include/conf.h 27 Jun 2022 14:19:10 -
@@ -61,7 +61,7 @@ cdev_decl(fd);
 #define cdev_apm_init(c,n) { \
 dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
 (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) enodev, 0, selfalse, \
+   (dev_type_stop((*))) enodev, 0, \
(dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) }
 
 cdev_decl(com);
@@ -74,7 +74,7 @@ cdev_decl(spkr);
 #define cdev_openprom_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) nullop, 0, selfalse, \
+   (dev_type_stop((*))) nullop, 0, \
(dev_type_mmap((*))) enodev }
 
 cdev_decl(openprom);
Index: arch/arm64/include/conf.h
===
RCS file: src/sys/arch/arm64/include/conf.h,v
retrieving revision 1.3
diff -u -p -r1.3 conf.h
--- arch/arm64/include/conf.h   23 Jan 2019 09:57:36 -  1.3
+++ arch/arm64/include/conf.h   27 Jun 2022 14:19:10 -
@@ -43,7 +43,7 @@ cdev_decl(mm);
 #define cdev_openprom_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) nullop, 0, selfalse, \
+   (dev_type_stop((*))) nullop, 0, \
(dev_type_mmap((*))) enodev }
 
 cdev_decl(openprom);
@@ -52,7 +52,7 @@ cdev_decl(openprom);
 #define cdev_acpiapm_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), (dev_type_read((*))) enodev, \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
-   (dev_type_stop((*))) enodev, 0, selfalse, \
+   (dev_type_stop((*))) enodev, 0, \
(dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) }
 
 cdev_decl(apm);
Index: arch/i386/i386/conf.c
===
RCS file: src/sys/arch/i386/i386/conf.c,v
retrieving revision 1.172
diff -u -p -r1.172 conf.c
--- arch/i386/i386/conf.c   11 Nov 2021 10:03:09 -  1.172
+++ arch/i386/i386/conf.c   27 Jun 2022 14:19:10 -
@@ -81,21 +81,21 @@ int nblkdev 

Remove switch(4) leftovers

2022-06-27 Thread Visa Hankala
Remove some switch(4) leftovers.

OK?

Index: etc/etc.hppa/MAKEDEV.md
===
RCS file: src/etc/etc.hppa/MAKEDEV.md,v
retrieving revision 1.68
diff -u -p -r1.68 MAKEDEV.md
--- etc/etc.hppa/MAKEDEV.md 11 Nov 2021 09:47:33 -  1.68
+++ etc/etc.hppa/MAKEDEV.md 27 Jun 2022 13:41:53 -
@@ -108,4 +108,3 @@ target(all, rd, 0)dnl
 target(all, cd, 0, 1)dnl
 target(all, sd, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)dnl
 target(all, vnd, 0, 1, 2, 3)dnl
-target(all, switch, 0, 1, 2, 3)dnl
Index: sys/sys/conf.h
===
RCS file: src/sys/sys/conf.h,v
retrieving revision 1.156
diff -u -p -r1.156 conf.h
--- sys/sys/conf.h  23 Jan 2021 05:08:36 -  1.156
+++ sys/sys/conf.h  27 Jun 2022 13:41:53 -
@@ -257,13 +257,6 @@ extern struct cdevsw cdevsw[];
0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev, \
0, 0, dev_init(c,n,kqfilter) }
 
-/* open, close, read, write, ioctl, poll, kqfilter -- XXX should be generic 
device */
-#define cdev_switch_init(c,n) {
\
-   dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read),\
-   dev_init(c,n,write), dev_init(c,n,ioctl), (dev_type_stop((*))) enodev, \
-   0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev, \
-   0, 0, dev_init(c,n,kqfilter) }
-
 /* open, close, ioctl, poll, kqfilter -- XXX should be generic device */
 #define cdev_vscsi_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), \
@@ -607,7 +600,6 @@ cdev_decl(pf);
 
 cdev_decl(tun);
 cdev_decl(tap);
-cdev_decl(switch);
 cdev_decl(pppx);
 cdev_decl(pppac);
 



Remove VOP_POLL()

2022-06-25 Thread Visa Hankala
Remove unused VOP_POLL().

The patch cuts cttypoll()'s dependency on VOP_POLL(). cttypoll()
has been called through spec_poll() and is unused. The function
cannot be removed yet as the build expects the symbol.

(Next, remove d_poll from struct cdevsw.)

OK?

Index: share/man/man9/VOP_LOOKUP.9
===
RCS file: src/share/man/man9/VOP_LOOKUP.9,v
retrieving revision 1.45
diff -u -p -r1.45 VOP_LOOKUP.9
--- share/man/man9/VOP_LOOKUP.9 12 Dec 2021 09:14:58 -  1.45
+++ share/man/man9/VOP_LOOKUP.9 25 Jun 2022 12:34:58 -
@@ -48,7 +48,6 @@
 .Nm VOP_MKNOD ,
 .Nm VOP_OPEN ,
 .Nm VOP_PATHCONF ,
-.Nm VOP_POLL ,
 .Nm VOP_PRINT ,
 .Nm VOP_READ ,
 .Nm VOP_READDIR ,
@@ -194,13 +193,6 @@
 .Fa "register_t *retval"
 .Fc
 .Ft int
-.Fo VOP_POLL
-.Fa "struct vnode *vp"
-.Fa "int fflag"
-.Fa "int events"
-.Fa "struct proc *p"
-.Fc
-.Ft int
 .Fo VOP_PRINT
 .Fa "struct vnode *vp"
 .Fc
@@ -724,28 +716,6 @@ The result is placed in
 .Fa *retval .
 Upon success, zero is returned; otherwise, an appropriate error code is
 returned.
-.Pp
-.It Fn VOP_POLL vp fflag events p
-Determine whether the vnode
-.Fa vp
-is ready to perform the operations specified by
-.Fa events
-(see
-.Xr poll 2 )
-with file flags
-.Fa fflag
-for the calling process
-.Fa p .
-The
-.Fn selrecord
-routine may be used to detect selection collisions for multiple
-processes sleeping on the same file, waiting for I/O
-to become possible, although all file systems currently assume that
-I/O is always possible.
-The return value specifies which operations from
-.Fa events
-were found to be ready, which may be performed without the need for
-blocking.
 .Pp
 .It Fn VOP_PRINT vp
 Print information about the vnode to the kernel message buffer.
Index: sys/isofs/cd9660/cd9660_node.h
===
RCS file: src/sys/isofs/cd9660/cd9660_node.h,v
retrieving revision 1.21
diff -u -p -r1.21 cd9660_node.h
--- sys/isofs/cd9660/cd9660_node.h  20 Jan 2019 16:09:41 -  1.21
+++ sys/isofs/cd9660/cd9660_node.h  25 Jun 2022 12:35:54 -
@@ -99,7 +99,6 @@ int   cd9660_getattr(void *);
 intcd9660_setattr(void *);
 intcd9660_read(void *);
 intcd9660_ioctl(void *);
-intcd9660_poll(void *);
 intcd9660_mmap(void *);
 intcd9660_seek(void *);
 intcd9660_readdir(void *);
Index: sys/isofs/cd9660/cd9660_vnops.c
===
RCS file: src/sys/isofs/cd9660/cd9660_vnops.c,v
retrieving revision 1.92
diff -u -p -r1.92 cd9660_vnops.c
--- sys/isofs/cd9660/cd9660_vnops.c 12 Dec 2021 09:14:59 -  1.92
+++ sys/isofs/cd9660/cd9660_vnops.c 25 Jun 2022 12:35:54 -
@@ -54,7 +54,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -290,18 +289,6 @@ cd9660_ioctl(void *v)
return (ENOTTY);
 }
 
-/* ARGSUSED */
-int
-cd9660_poll(void *v)
-{
-   struct vop_poll_args *ap = v;
-
-   /*
-* We should really check to see if I/O is possible.
-*/
-   return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
-}
-
 /*
  * Mmap a file
  *
@@ -825,7 +812,6 @@ const struct vops cd9660_vops = {
.vop_read   = cd9660_read,
.vop_write  = eopnotsupp,
.vop_ioctl  = cd9660_ioctl,
-   .vop_poll   = cd9660_poll,
.vop_kqfilter   = cd9660_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync  = nullop,
@@ -872,7 +858,6 @@ const struct vops cd9660_specvops = {
.vop_read   = spec_read,
.vop_write  = spec_write,
.vop_ioctl  = spec_ioctl,
-   .vop_poll   = spec_poll,
.vop_kqfilter   = spec_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync  = spec_fsync,
@@ -914,7 +899,6 @@ const struct vops cd9660_fifovops = {
.vop_read   = fifo_read,
.vop_write  = fifo_write,
.vop_ioctl  = fifo_ioctl,
-   .vop_poll   = fifo_poll,
.vop_kqfilter   = fifo_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync  = nullop,
Index: sys/kern/spec_vnops.c
===
RCS file: src/sys/kern/spec_vnops.c,v
retrieving revision 1.107
diff -u -p -r1.107 spec_vnops.c
--- sys/kern/spec_vnops.c   11 Dec 2021 09:28:26 -  1.107
+++ sys/kern/spec_vnops.c   25 Jun 2022 12:35:54 -
@@ -48,7 +48,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -74,7 +73,6 @@ const struct vops spec_vops = {
.vop_read   = spec_read,
.vop_write  = spec_write,
.vop_ioctl  = spec_ioctl,
-   .vop_poll   = spec_poll,
.vop_kqfilter   = spec_kqfilter,
.vop_revoke = vop_generic_revoke,
.vop_fsync  = spec_fsync,
@@ -379,21 +377,6 @@ spec_ioctl(void *v)
}
 }
 
-int
-spec_poll(void *v)

Fix deadlock with FFS quotas

2022-06-25 Thread Visa Hankala
The kernel can deadlock if it tries to allocate blocks for a quota data
file that is stored in the same filesystem that has the quotas enabled.
This happens because of recursion within the quota subsystem:

sleep_finish(8000225d16c0,1) at sleep_finish+0xfe
tsleep(807f0880,9,81f8b8d8,0) at tsleep+0xb2
# waits for the struct dquot lock that was acquired earlier
ufs_quota_alloc_blocks2(fd82631dbbc0,4,fd843f7e3c60,0) at ufs_quota_alloc_blocks2+0x149
ffs_alloc(fd82631dbbc0,0,ce5,800,fd843f7e3c60,8000225d1940) at ffs_alloc+0x114
ffs2_balloc(fd82631dbbc0,0,20,fd843f7e3c60,1,8000225d1a28) at ffs2_balloc+0x50d
ffs_write(8000225d1aa8) at ffs_write+0x229
VOP_WRITE(fd83357662f8,8000225d1b18,0,fd843f7e3c60) at VOP_WRITE+0x6a
# has acquired struct dquot lock
dqsync(fd83357662f8,807f0880) at dqsync+0x133
qsync_vnode(fd83357662f8,0) at qsync_vnode+0x68
vfs_mount_foreach_vnode(806ae000,81a92120,0) at vfs_mount_foreach_vnode+0x4a
qsync(806ae000) at qsync+0x37
ffs_sync(806ae000,3,0,fd843f7e3ea0,80002259b508) at ffs_sync+0x114
sync_fsync(8000225d1d60) at sync_fsync+0x144
VOP_FSYNC(fd8406af5bc8,fd843f7e3ea0,3,80002259b508) at VOP_FSYNC+0x83
syncer_thread(80002259b508) at syncer_thread+0x1b2

One way to fix this is to exclude quota data files from the accounting,
as shown in the following patch. The quota files are root's property
and special to the system, so the omission should not be a problem
in general.

A slight variation of the patch is to skip only the files that are used
to store quotas in the same filesystem. However, I think the categorical
skipping is clearer.

The patch makes quotacheck(8) ignore the quota data files in the
same-filesystem case. Note that there is no adjustment for externally
stored quota files. External storage probably does not use quotas.

Also, the VSYSTEM flag should be cleared when quotas are disabled.
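
A minimal sketch of the kernel-side exclusion (the placement at the top
of the quota accounting routines is an assumption for illustration; the
diff below is the authoritative change):

        /* Quota data files carry VSYSTEM while quotas are enabled; skip them. */
        if (ITOV(ip)->v_flag & VSYSTEM)
                return (0);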

(One might still invoke curious patterns by setting up a filesystem
with quotas on a vnd device that is backed by a sparse file on
a quota-enabled filesystem. However, I think this is a slightly
different thing that relates to the subsystem's shared data structures.)

OK?

Index: sbin/quotacheck/quotacheck.c
===
RCS file: src/sbin/quotacheck/quotacheck.c,v
retrieving revision 1.41
diff -u -p -r1.41 quotacheck.c
--- sbin/quotacheck/quotacheck.c28 Jun 2019 13:32:45 -  1.41
+++ sbin/quotacheck/quotacheck.c25 Jun 2022 10:46:22 -
@@ -263,10 +263,13 @@ int
 chkquota(const char *vfstype, const char *fsname, const char *mntpt,
 void *auxarg, pid_t *pidp)
 {
+   struct stat sb;
struct quotaname *qnp = auxarg;
struct fileusage *fup;
union dinode *dp;
int cg, i, mode, errs = 0, status;
+   dev_t groupdev = 0, userdev = 0, dev = 0;
+   ino_t groupino = 0, userino = 0;
ino_t ino, inosused;
pid_t pid;
char *cp;
@@ -276,8 +279,20 @@ chkquota(const char *vfstype, const char
warn("fork");
return 1;
case 0: /* child */
+   if ((qnp->flags & HASGRP) != 0 &&
+   stat(qnp->grpqfname, ) == 0) {
+   groupdev = sb.st_dev;
+   groupino = sb.st_ino;
+   }
+   if ((qnp->flags & HASUSR) != 0 &&
+   stat(qnp->usrqfname, ) == 0) {
+   userdev = sb.st_dev;
+   userino = sb.st_ino;
+   }
if ((fi = opendev(fsname, O_RDONLY, 0, NULL)) == -1)
err(1, "%s", fsname);
+   if (stat(mntpt, ) == 0)
+   dev = sb.st_dev;
sync();
for (i = 0; sblock_try[i] != -1; i++) {
bread(sblock_try[i] / DEV_BSIZE, (char *),
@@ -332,6 +347,13 @@ chkquota(const char *vfstype, const char
ino < ROOTINO ||
(mode = DIP(dp, di_mode) & IFMT) == 0)
continue;
+   /*
+* Skip the quota data files because
+* the kernel excludes them from accounting.
+*/
+   if ((ino == groupino && dev == groupdev) ||
+   (ino == userino && dev == userdev))
+   continue;
if (qnp->flags & HASGRP) {
fup = addid(DIP(dp, di_gid),
GRPQUOTA, NULL);
Index: sys/ufs/ufs/ufs_quota.c
===
RCS file: src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 

Re: pluart(4): change baud rate

2022-06-21 Thread Visa Hankala
On Mon, Jun 20, 2022 at 07:07:14PM +0200, Anton Lindqvist wrote:
> On Mon, Jun 20, 2022 at 02:42:52PM +0000, Visa Hankala wrote:
> > On Sun, Jun 19, 2022 at 03:06:47PM +0200, Anton Lindqvist wrote:
> > > This allows the pluart baud rate to be changed. There's one potential
> > > pitfall with this change as users will have the wrong baud rate in their
> > > /etc/ttys if not installed after revision 1.11 of dev/ic/pluart.c which
> > > landed today. This will make the serial console unusable until the
> > > expected baud rate in /etc/ttys is changed to 115200.
> > 
> > An upgrade note would be good.
> 
> I can prepare something for current.html.
> 
> > > Comments? OK?
> > > 
> > > diff --git sys/dev/fdt/pluart_fdt.c sys/dev/fdt/pluart_fdt.c
> > > index 969018eccdc..ac2467bdf47 100644
> > > --- sys/dev/fdt/pluart_fdt.c
> > > +++ sys/dev/fdt/pluart_fdt.c
> > > @@ -27,6 +27,7 @@
> > >  
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  
> > >  int  pluart_fdt_match(struct device *, void *, void *);
> > > @@ -70,8 +71,12 @@ pluart_fdt_attach(struct device *parent, struct device 
> > > *self, void *aux)
> > >   return;
> > >   }
> > >  
> > > - if (OF_is_compatible(faa->fa_node, "arm,sbsa-uart"))
> > > + if (OF_is_compatible(faa->fa_node, "arm,sbsa-uart")) {
> > >   sc->sc_hwflags |= COM_HW_SBSA;
> > > + } else {
> > > + clock_enable_all(faa->fa_node);
> > > + sc->sc_clkfreq = clock_get_frequency(faa->fa_node, "uartclk");
> > > + }
> > >  
> > >   periphid = OF_getpropint(faa->fa_node, "arm,primecell-periphid", 0);
> > >   if (periphid != 0)
> > > diff --git sys/dev/ic/pluart.c sys/dev/ic/pluart.c
> > > index 40e2b1976fb..aa4301e8fb0 100644
> > > --- sys/dev/ic/pluart.c
> > > +++ sys/dev/ic/pluart.c
> > > @@ -71,9 +71,9 @@
> > >  #define UART_ILPR0x20/* IrDA low-power 
> > > counter register */
> > >  #define UART_ILPR_ILPDVSR((x) & 0xf) /* IrDA low-power 
> > > divisor */
> > >  #define UART_IBRD0x24/* Integer baud rate 
> > > register */
> > > -#define UART_IBRD_DIVINT ((x) & 0xff)/* Integer baud rate divisor */
> > > +#define UART_IBRD_DIVINT(x)  ((x) & 0xff)/* Integer baud rate 
> > > divisor */
> > 
> > This mask should be 0x.
> 
> Thanks, fixed.
> 
> > >  #define UART_FBRD0x28/* Fractional baud rate 
> > > register */
> > > -#define UART_FBRD_DIVFRAC((x) & 0x3f)/* Fractional baud rate 
> > > divisor */
> > > +#define UART_FBRD_DIVFRAC(x) ((x) & 0x3f)/* Fractional baud rate 
> > > divisor */
> > >  #define UART_LCR_H   0x2c/* Line control 
> > > register */
> > >  #define UART_LCR_H_BRK   (1 << 0)/* Send break */
> > >  #define UART_LCR_H_PEN   (1 << 1)/* Parity enable */
> > > @@ -338,7 +338,9 @@ pluart_param(struct tty *tp, struct termios *t)
> > >   /* lower dtr */
> > >   }
> > >  
> > > - if (ospeed != 0) {
> > > + if (ospeed != 0 && sc->sc_clkfreq != 0 && tp->t_ospeed != ospeed) {
> > > + int div, lcr;
> > > +
> > >   while (ISSET(tp->t_state, TS_BUSY)) {
> > >   ++sc->sc_halt;
> > >   error = ttysleep(tp, >t_outq,
> > > @@ -349,7 +351,40 @@ pluart_param(struct tty *tp, struct termios *t)
> > >   return (error);
> > >   }
> > >   }
> > > - /* set speed */
> > > +
> > > + /*
> > > +  * Writes to IBRD and FBRD are made effective first when LCR_H
> > > +  * is written.
> > > +  */
> > > + lcr = bus_space_read_4(sc->sc_iot, sc->sc_ioh, UART_LCR_H);
> > > +
> > > + /* The UART must be disabled while changing the baud rate. */
> > > + bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_CR, 0);
> > 
> > I think this should save original CR for restoring later, and set CR
> > with UARTEN masked out.
> > 
> > cr = bus_spa

Add note about locks with taskq, timeout and SMR barriers

2022-06-20 Thread Visa Hankala
The taskq, timeout and SMR barrier routines require that whatever is
being synchronized with can still make progress. Consequently,
the callers of these routines must not hold locks that can prevent
that progress. The patch below documents this.
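
A made-up example of the taskq case (all names hypothetical): a task that
takes a write lock can never finish while the caller of taskq_barrier()
holds that same lock, so the barrier never returns.

        struct rwlock example_lock = RWLOCK_INITIALIZER("example");

        void
        example_task(void *arg)
        {
                rw_enter_write(&example_lock);  /* blocks behind example_sync() */
                /* ... */
                rw_exit_write(&example_lock);
        }

        void
        example_sync(void)
        {
                rw_enter_write(&example_lock);
                taskq_barrier(systq);   /* waits for example_task(): deadlock */
                rw_exit_write(&example_lock);
        }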

OK?

Index: smr_call.9
===
RCS file: src/share/man/man9/smr_call.9,v
retrieving revision 1.3
diff -u -p -r1.3 smr_call.9
--- smr_call.9  25 Feb 2020 16:53:25 -  1.3
+++ smr_call.9  20 Jun 2022 16:11:53 -
@@ -96,6 +96,12 @@ but the system is forced to process the 
 The use of this function is discouraged because of the heavy impact
 on system performance.
 .Pp
+To avoid deadlocks, the caller of
+.Fn smr_barrier
+or
+.Fn smr_flush
+must not hold locks that can block the processing of SMR callbacks.
+.Pp
 The SMR implementation does not limit the number of deferred calls.
 It is important to prevent arbitrary call rate of
 .Fn smr_call .
Index: task_add.9
===
RCS file: src/share/man/man9/task_add.9,v
retrieving revision 1.22
diff -u -p -r1.22 task_add.9
--- task_add.9  8 Jun 2020 00:29:51 -   1.22
+++ task_add.9  20 Jun 2022 16:11:53 -
@@ -108,6 +108,13 @@ from the list of pending tasks on the
 .Fa tq
 taskq, or waits until any running task has completed.
 .Pp
+The caller of
+.Fn taskq_barrier
+or
+.Fn taskq_del_barrier
+must not hold locks that can block the taskq.
+Otherwise, the system can deadlock.
+.Pp
 It is the responsibility of the caller to provide the
 .Fn task_set ,
 .Fn task_add ,
Index: timeout.9
===
RCS file: src/share/man/man9/timeout.9,v
retrieving revision 1.54
diff -u -p -r1.54 timeout.9
--- timeout.9   31 Mar 2022 17:27:23 -  1.54
+++ timeout.9   20 Jun 2022 16:11:53 -
@@ -195,6 +195,13 @@ ensures that any current execution of th
 .Fa to
 has completed before returning.
 .Pp
+The caller of
+.Fn timeout_barrier
+or
+.Fn timeout_del_barrier
+must not hold locks that can block processing in the timeout's context.
+Otherwise, the system can deadlock.
+.Pp
 The
 .Fn timeout_pending
 macro can be used to check if a timeout is scheduled to run.



Re: pluart(4): change baud rate

2022-06-20 Thread Visa Hankala
On Sun, Jun 19, 2022 at 03:06:47PM +0200, Anton Lindqvist wrote:
> This allows the pluart baud rate to be changed. There's one potential
> pitfall with this change as users will have the wrong baud rate in their
> /etc/ttys if not installed after revision 1.11 of dev/ic/pluart.c which
> landed today. This will make the serial console unusable until the
> expected baud rate in /etc/ttys is changed to 115200.

An upgrade note would be good.

> Comments? OK?
> 
> diff --git sys/dev/fdt/pluart_fdt.c sys/dev/fdt/pluart_fdt.c
> index 969018eccdc..ac2467bdf47 100644
> --- sys/dev/fdt/pluart_fdt.c
> +++ sys/dev/fdt/pluart_fdt.c
> @@ -27,6 +27,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  int  pluart_fdt_match(struct device *, void *, void *);
> @@ -70,8 +71,12 @@ pluart_fdt_attach(struct device *parent, struct device 
> *self, void *aux)
>   return;
>   }
>  
> - if (OF_is_compatible(faa->fa_node, "arm,sbsa-uart"))
> + if (OF_is_compatible(faa->fa_node, "arm,sbsa-uart")) {
>   sc->sc_hwflags |= COM_HW_SBSA;
> + } else {
> + clock_enable_all(faa->fa_node);
> + sc->sc_clkfreq = clock_get_frequency(faa->fa_node, "uartclk");
> + }
>  
>   periphid = OF_getpropint(faa->fa_node, "arm,primecell-periphid", 0);
>   if (periphid != 0)
> diff --git sys/dev/ic/pluart.c sys/dev/ic/pluart.c
> index 40e2b1976fb..aa4301e8fb0 100644
> --- sys/dev/ic/pluart.c
> +++ sys/dev/ic/pluart.c
> @@ -71,9 +71,9 @@
>  #define UART_ILPR0x20/* IrDA low-power counter 
> register */
>  #define UART_ILPR_ILPDVSR((x) & 0xf) /* IrDA low-power divisor */
>  #define UART_IBRD0x24/* Integer baud rate register */
> -#define UART_IBRD_DIVINT ((x) & 0xff)/* Integer baud rate divisor */
> +#define UART_IBRD_DIVINT(x)  ((x) & 0xff)/* Integer baud rate divisor */

This mask should be 0x.

>  #define UART_FBRD0x28/* Fractional baud rate 
> register */
> -#define UART_FBRD_DIVFRAC((x) & 0x3f)/* Fractional baud rate divisor 
> */
> +#define UART_FBRD_DIVFRAC(x) ((x) & 0x3f)/* Fractional baud rate divisor 
> */
>  #define UART_LCR_H   0x2c/* Line control register */
>  #define UART_LCR_H_BRK   (1 << 0)/* Send break */
>  #define UART_LCR_H_PEN   (1 << 1)/* Parity enable */
> @@ -338,7 +338,9 @@ pluart_param(struct tty *tp, struct termios *t)
>   /* lower dtr */
>   }
>  
> - if (ospeed != 0) {
> + if (ospeed != 0 && sc->sc_clkfreq != 0 && tp->t_ospeed != ospeed) {
> + int div, lcr;
> +
>   while (ISSET(tp->t_state, TS_BUSY)) {
>   ++sc->sc_halt;
>   error = ttysleep(tp, >t_outq,
> @@ -349,7 +351,40 @@ pluart_param(struct tty *tp, struct termios *t)
>   return (error);
>   }
>   }
> - /* set speed */
> +
> + /*
> +  * Writes to IBRD and FBRD are made effective first when LCR_H
> +  * is written.
> +  */
> + lcr = bus_space_read_4(sc->sc_iot, sc->sc_ioh, UART_LCR_H);
> +
> + /* The UART must be disabled while changing the baud rate. */
> + bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_CR, 0);

I think this should save original CR for restoring later, and set CR
with UARTEN masked out.

cr = bus_space_read_4(sc->sc_iot, sc->sc_ioh, UART_CR);
bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_CR,
cr & ~UART_CR_UARTEN);

The PL011 manual says that reserved bits in CR should not be modified.

> +
> + /*
> +  * The baud rate divisor is expressed relative to the UART clock
> +  * frequency where IBRD represents the quotient using 16 bits
> +  * and FBRD the remainder using 6 bits. The PL011 specification
> +  * provides the following formula:
> +  *
> +  *  uartclk/(16 * baudrate)
> +  *
> +  * The formula can be estimated by scaling it with the
> +  * precision 64 (2^6) and letting the resulting upper 16 bits
> +  * represents the quotient and the lower 6 bits the remainder:
> +  *
> +  *  64 * uartclk/(16 * baudrate) = 4 * uartclk/baudrate
> +  */
> + div = 4 * sc->sc_clkfreq / ospeed;
> + bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_IBRD,
> + UART_IBRD_DIVINT(div >> 6));
> + bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_FBRD,
> + UART_FBRD_DIVFRAC(div));
> + /* Commit baud rate change. */
> + bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_LCR_H, lcr);
> + /* Enable UART. */
> + bus_space_write_4(sc->sc_iot, sc->sc_ioh,
> +   
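
For reference, a worked instance of the divisor formula quoted above,
assuming a 24 MHz uartclk and 115200 baud (the clock value is an
assumption, not taken from the thread):

        div  = 4 * 24000000 / 115200 = 833
        IBRD = 833 >> 6   = 13          (24000000 / (16 * 115200) = 13.02...)
        FBRD = 833 & 0x3f = 1           (0.02... * 64, truncated)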

Re: Fix lock order reversal in nfs_inactive()

2022-06-19 Thread Visa Hankala
On Sun, Jun 19, 2022 at 11:05:38AM +0200, Jeremie Courreges-Anglas wrote:
> On Fri, Jun 17 2022, Jeremie Courreges-Anglas  wrote:
> > On Thu, Jun 16 2022, Visa Hankala  wrote:
> >> nfs_inactive() has a lock order reversal. When it removes the silly
> >> file, it locks the directory vnode while it already holds the lock
> >> of the argument file vnode. This clashes for example with name lookups
> >> where directory vnodes are locked before file vnodes.
> >>
> >> The reversal can cause a deadlock when an NFS client has multiple
> >> processes that create, modify and remove files in the same
> >> NFS directory.
> >>
> >> The following patch makes the silly file removal happen after
> >> nfs_inactive() has released the file vnode lock. This should be safe
> >> because the silly file removal is independent of nfs_inactive()'s
> >> argument vnode.
> 
> The diff makes sense to me.  Did you spot it reviewing the code, or
> using WITNESS?

I noticed it by code review.

WITNESS is somewhat helpless with vnode locks because they can involve
multiple levels of lock nesting. In fact, the order checking between
vnodes has been disabled by initializing the locks with RWL_IS_VNODE.
To fix this, the kernel would have to pass nesting information around
the filesystem code.
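
For context, the vnode locks opt out of the ordering checks when they are
initialized, roughly like this (a sketch; the exact call site and flag
combination vary per filesystem):

        rrw_init_flags(&ip->i_lock, "inode", RWL_DUPOK | RWL_IS_VNODE);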

This particular deadlock can be triggered for example by quickly
writing and removing temporary files in an NFS directory using one
process while another process lists the directory contents repeatedly.

> >> Could some NFS users test this?
> >
> > I'm running this diff on the riscv64 build cluster, since 1h25mn with no
> > hang.  Let's see how it goes.
> 
> This run did finish properly yesterday.
> 
> > This cluster doesn't use NFS as much as it could (build logs stored
> > locally) but I can try that in the next build.
> 
> So I have restarted a build with this diff and dpb(1) logging on an
> NFS-mounted /usr/ports/logs.  I get a dpb(1) hang after 1400+ packages
> built.  Any other attempt to access the NFS-mounted filesystem results
> in a hang.  Let me know if I can extract more data from the system.

No need this time. Those wait messages give some pointers.

> shannon ~$ grep nfs riscv64/nfs-hang.txt
>  97293   72036  49335  0  30x91  nfs_fsync perl
>  69045   83700  64026 55  30x82  nfs_fsync c++
>  80365   37354  15104 55  30x100082  nfs_fsync make
>  28876  139812  59322 55  30x100082  nfs_fsync make
>   6160  193238  61541   1000  30x13  nfsnode   ksh
>   7535  421732  0  0  3 0x14280  nfsrcvlk  nfsio
>  70437  237308  0  0  3 0x14280  nfsrcvlk  nfsio
>  97073  406345  0  0  3 0x14200  nfsrcvlk  nfsio
>  88487  390804  0  0  3 0x14200  nfsrcvlk  nfsio
>  58945   91139  92962  0  30x80  nfsd  nfsd
>  75619  357314  92962  0  30x80  nfsd  nfsd
>  39027  137228  92962  0  30x80  nfsd  nfsd
>  22028  406380  92962  0  30x80  nfsd  nfsd
>  92962   11420  1  0  30x80  netconnfsd
>  90467  310188  0  0  3 0x14280  nfsrcvlk  update



Re: pluart(4): hardware console baudrate

2022-06-19 Thread Visa Hankala
On Wed, Jun 15, 2022 at 07:30:09AM +0200, Anton Lindqvist wrote:
> pluart(4) does not report the correct baudrate for the hardware console
> but instead defaults to 38400. This in turn causes the same baudrate to
> end up in /etc/ttys during installation. Note that this is not a problem
> as of now since pluart does not support changing the baudrate just yet,
> I have another subsequent diff for that.
> 
> Instead, honor and propagate the baudrate given to pluartcnattach()
> while attaching the hardware console. Similar to what com(4) already
> does.
> 
> Comments? OK?

OK visa@



Fix lock order reversal in nfs_inactive()

2022-06-16 Thread Visa Hankala
nfs_inactive() has a lock order reversal. When it removes the silly
file, it locks the directory vnode while it already holds the lock
of the argument file vnode. This clashes for example with name lookups
where directory vnodes are locked before file vnodes.

The reversal can cause a deadlock when an NFS client has multiple
processes that create, modify and remove files in the same
NFS directory.

The following patch makes the silly file removal happen after
nfs_inactive() has released the file vnode lock. This should be safe
because the silly file removal is independent of nfs_inactive()'s
argument vnode.

Could some NFS users test this?

Index: nfs/nfs_node.c
===
RCS file: src/sys/nfs/nfs_node.c,v
retrieving revision 1.74
diff -u -p -r1.74 nfs_node.c
--- nfs/nfs_node.c  20 Oct 2021 06:35:39 -  1.74
+++ nfs/nfs_node.c  16 Jun 2022 14:46:36 -
@@ -183,20 +183,23 @@ nfs_inactive(void *v)
np->n_sillyrename = NULL;
} else
sp = NULL;
-   if (sp) {
+   if (sp != NULL)
+   nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, curproc);
+   np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT);
+
+   VOP_UNLOCK(ap->a_vp);
+
+   if (sp != NULL) {
/*
 * Remove the silly file that was rename'd earlier
 */
-   nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, curproc);
vn_lock(sp->s_dvp, LK_EXCLUSIVE | LK_RETRY);
nfs_removeit(sp);
crfree(sp->s_cred);
vput(sp->s_dvp);
free(sp, M_NFSREQ, sizeof(*sp));
}
-   np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT);
 
-   VOP_UNLOCK(ap->a_vp);
return (0);
 }
 



Re: pluart(4): fifo support

2022-06-10 Thread Visa Hankala
On Wed, Jun 08, 2022 at 06:50:18AM +0200, Anton Lindqvist wrote:
> On Sun, May 01, 2022 at 04:17:34PM +0000, Visa Hankala wrote:
> > On Sat, Apr 30, 2022 at 09:40:24AM +0200, Anton Lindqvist wrote:
> > > On Sun, Mar 13, 2022 at 04:17:07PM +0100, Mark Kettenis wrote:
> > > > > Date: Fri, 11 Mar 2022 07:53:13 +0100
> > > > > From: Anton Lindqvist 
> > > > > 
> > > > > > On Tue, Mar 08, 2022 at 01:44:47PM +0000, Visa Hankala wrote:
> > > > > > On Tue, Mar 08, 2022 at 08:04:36AM +0100, Anton Lindqvist wrote:
> > > > > > > > On Mon, Mar 07, 2022 at 07:36:35AM +0000, Visa Hankala wrote:
> > > > > > > > I still think that checking TXFF and using the same code for 
> > > > > > > > both
> > > > > > > > SBSA and true PL011 UARTs would be the best choice. This would 
> > > > > > > > avoid
> > > > > > > > fragmenting the code and improve robustness by relying on 
> > > > > > > > functionality
> > > > > > > > that is common to the different controller variants.
> > > > > > > 
> > > > > > > Fair enough, new diff.
> > > > > > 
> > > > > > Maybe the comments should omit the FIFO space description and just
> > > > > > mention the lack of the level control register in the SBSA UART
> > > > > > register interface.
> > > > > 
> > > > > I ended up tweaking the comments before committing. Thanks for all the
> > > > > feedback.
> > > > > 
> > > > 
> > > > Hi Anton,
> > > > 
> > > > This diff seems to break things.  When I boot my rpi4 it now prints:
> > > > 
> > > >   pluart0 at simplebus0: rev 0, 16 byte fifo
> > > >   pluart0: console
> > > >   bcmbsc0 at simplebus0
> > > >   iic0 at bcmbsc0
> > > > 
> > > > so it appears that a carriage return character is lost here.
> > > > 
> > > > Later on output stops at:
> > > > 
> > > >   reordering libraries: done.
> > > > 
> > > > and only when I reboot the machine the login prompt appears, but with
> > > > some wierd respawning:
> > > > 
> > > >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > > > 
> > > >   login: init: getty repeating too quickly on port /dev/console, 
> > > > sleeping
> > > >   init: getty repeating too quickly on port /dev/console, sleeping
> > > > 
> > > >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > > > 
> > > >   login:
> > > >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > > > 
> > > >   login:
> > > > 
> > > > If you don't have a quick fix for this, may I suggest reverting the
> > > > commit?  We're heading towards release and we don't want the serial
> > > > console on the rpi4 to be broken.
> > > 
> > > Circling back to this: what happens is that no tx interrupt is raised
> > > when sending less data than the configured interrupt fifo level, causing
> > > the tty to end up in a forever busy state. Clearing the busy flag after
> > > a successful transmission of all queued data solves the problem.
> > 
> > Are you sure about the behaviour of the interrupt?
> > 
> > One possible problem is that pluart_intr() uses the raw, unmasked,
> > interrupt status to clear interrupts. Your previous patch always
> > disabled the Tx interrupt whenever the raw status indicated a Tx FIFO
> > level event.
> > 
> > This new patch might very well be correct. However, it feels strange
> > if the hardware raises the Tx interrupt only at one specific level of
> > FIFO state change.
> > 
> > It would be nice to know if a comstart()-style arrangement of interrupt
> > masking worked in pluart_start().
> 
> What did work was to not clear the tx interrupt in pluart_intr().
> Updated diff with some additional changes:
> 
> * Flush any pending transmission before configuring the device during
>   attachment. Prevents the next dmesg line from being mangled.
> 
> * Make pluart_start() mimic comstart() as proposed by visa@.
> 
> * While entering ddb (i.e. poll mode), disable interrupts.



> @@ -792,4 +870,13 @@ pluartcnputc(dev_t dev, int c)
>  void
>  pluartcnpollc(dev_t dev, int on)
>  {
> + int s;
> +
> + s = splhigh();
> + if (on)
> + bus_space_write_4(pluartconsiot, pluartconsioh, UART_IMSC, 0);
> + else
> + bus_space_write_4(pluartconsiot, pluartconsioh, UART_IMSC,
> + UART_IMSC_RXIM | UART_IMSC_RTIM);
> + splx(s);
>  }

Does this fix an actual issue? If not, I would leave it out. ddb entry
can happen in unexpected places. There is a risk that the mask gets
messed up in particular when leaving the debugger.

Otherwise, OK visa@
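
If the masking is kept, one way to reduce that risk would be to save
and restore the previous mask instead of writing fixed values, roughly
like this (sketch only; the static variable for the saved mask does not
exist in the driver):

void
pluartcnpollc(dev_t dev, int on)
{
	static uint32_t saved_imsc;
	int s;

	s = splhigh();
	if (on) {
		saved_imsc = bus_space_read_4(pluartconsiot, pluartconsioh,
		    UART_IMSC);
		bus_space_write_4(pluartconsiot, pluartconsioh, UART_IMSC, 0);
	} else {
		bus_space_write_4(pluartconsiot, pluartconsioh, UART_IMSC,
		    saved_imsc);
	}
	splx(s);
}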



Re: Fix clearing of sleep timeouts

2022-06-06 Thread Visa Hankala
On Mon, Jun 06, 2022 at 06:47:32AM +1000, David Gwynne wrote:
> On Sun, Jun 05, 2022 at 03:57:39PM +0000, Visa Hankala wrote:
> > On Sun, Jun 05, 2022 at 12:27:32PM +0200, Martin Pieuchot wrote:
> > > On 05/06/22(Sun) 05:20, Visa Hankala wrote:
> > > > Encountered the following panic:
> > > > 
> > > > panic: kernel diagnostic assertion "(p->p_flag & P_TIMEOUT) == 0" 
> > > > failed: file "/usr/src/sys/kern/kern_synch.c", line 373
> > > > Stopped at  db_enter+0x10:  popq%rbp
> > > > TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
> > > >  423109  57118 55 0x3  02  link
> > > >  330695  30276 55 0x3  03  link
> > > > * 46366  85501 55  0x1003  0x40804001  link
> > > >  188803  85501 55  0x1003  0x40820000K link
> > > > db_enter() at db_enter+0x10
> > > > panic(81f25d2b) at panic+0xbf
> > > > __assert(81f9a186,81f372c8,175,81f87c6c) at 
> > > > __assert+0x25
> > > > sleep_setup(800022d64bf8,800022d64c98,20,81f66ac6,0) at 
> > > > sleep_setup+0x1d8
> > > > cond_wait(800022d64c98,81f66ac6) at cond_wait+0x46
> > > > timeout_barrier(8000228a28b0) at timeout_barrier+0x109
> > > > timeout_del_barrier(8000228a28b0) at timeout_del_barrier+0xa2
> > > > sleep_finish(800022d64d90,1) at sleep_finish+0x16d
> > > > tsleep(823a5130,120,81f0b730,2) at tsleep+0xb2
> > > > sys_nanosleep(8000228a27f0,800022d64ea0,800022d64ef0) at 
> > > > sys_nanosleep+0x12d
> > > > syscall(800022d64f60) at syscall+0x374
> > > > 
> > > > The panic is a regression of sys/kern/kern_timeout.c r1.84. Previously,
> > > > soft-interrupt-driven timeouts could be deleted synchronously without
> > > > blocking. Now, timeout_del_barrier() can sleep regardless of the type
> > > > of the timeout.
> > > > 
> > > > It looks like, with small adjustments, timeout_del_barrier() can sleep
> > > > in sleep_finish(). The management of run queues is not affected because
> > > > the timeout clearing happens after it. As timeout_del_barrier() does not
> > > > rely on a timeout or signal catching, there should be no risk of
> > > > unbounded recursion or unwanted signal side effects within the sleep
> > > > machinery. In a way, a sleep with a timeout is higher-level than
> > > > one without.
> > > 
> > > I trust you on the analysis.  However this looks very fragile to me.
> > > 
> > > The use of timeout_del_barrier() which can sleep using the global sleep
> > > queue is worrying me.  
> > 
> > I think the queue handling ends in sleep_finish() when SCHED_LOCK()
> > is released. The timeout clearing is done outside of it.
> 
> That's ok.
> 
> > The extra sleeping point inside sleep_finish() is subtle. It should not
> > matter in typical use. But is it permissible with the API? Also, if
> > timeout_del_barrier() sleeps, the thread's priority can change.
> 
> What other options do we have at this point? Spin? Allocate the timeout
> dynamically so sleep_finish doesn't have to wait for it and let the
> handler clean up? How would you stop the timeout handler waking up the
> thread if it's gone back to sleep again for some other reason?

In principle, each thread could have a sleep serial number. If the
serial number was somehow associated with the timeout, the handler
could bail out if the thread has moved on. However, implementing that
association looks tricky. Dynamic allocation could provide a suitable
context struct, but memory allocation in this code would be awful.
Maybe there should be an extended version of timeout_add() that allows
controlled updating of the timeout argument... That does not sound appealing.

Spinning might be an option in the future. The kernel lock complicates
things, however. The spinning wait would have to release the kernel lock
to avoid possible deadlocking.
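
Roughly, a spinning variant in sleep_finish() would have to look
something like this (sketch only; timeout_running() is a hypothetical
"the handler is executing right now" predicate that does not exist
today):

	if (timeout_del(&p->p_sleep_to) == 0) {
		int held = _kernel_lock_held();

		/* Let the handler finish; it may need the kernel lock. */
		if (held)
			KERNEL_UNLOCK();
		while (timeout_running(&p->p_sleep_to))
			CPU_BUSY_CYCLE();
		if (held)
			KERNEL_LOCK();
	}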

> Sleeping here is the least worst option.

So it seems.

> As for timeout_del_barrier, if prio is a worry we can provide an
> advanced version of it that lets you pass the prio in. I'd also
> like to change timeout_barrier so it queues the barrier task at the
> head of the pending lists rather than at the tail.

I think it is not important to preserve the priority here. Later
scheduling events will override it anyway.

> 
> > Note that sleep_finish() already can take an additional nap when

Re: Fix clearing of sleep timeouts

2022-06-05 Thread Visa Hankala
On Sun, Jun 05, 2022 at 12:27:32PM +0200, Martin Pieuchot wrote:
> On 05/06/22(Sun) 05:20, Visa Hankala wrote:
> > Encountered the following panic:
> > 
> > panic: kernel diagnostic assertion "(p->p_flag & P_TIMEOUT) == 0" failed: 
> > file "/usr/src/sys/kern/kern_synch.c", line 373
> > Stopped at  db_enter+0x10:  popq%rbp
> > TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
> >  423109  57118 55 0x3  02  link
> >  330695  30276 55 0x3  03  link
> > * 46366  85501 55  0x1003  0x40804001  link
> >  188803  85501 55  0x1003  0x40820000K link
> > db_enter() at db_enter+0x10
> > panic(81f25d2b) at panic+0xbf
> > __assert(81f9a186,81f372c8,175,81f87c6c) at 
> > __assert+0x25
> > sleep_setup(800022d64bf8,800022d64c98,20,81f66ac6,0) at 
> > sleep_setup+0x1d8
> > cond_wait(800022d64c98,81f66ac6) at cond_wait+0x46
> > timeout_barrier(8000228a28b0) at timeout_barrier+0x109
> > timeout_del_barrier(8000228a28b0) at timeout_del_barrier+0xa2
> > sleep_finish(800022d64d90,1) at sleep_finish+0x16d
> > tsleep(823a5130,120,81f0b730,2) at tsleep+0xb2
> > sys_nanosleep(8000228a27f0,800022d64ea0,800022d64ef0) at 
> > sys_nanosleep+0x12d
> > syscall(800022d64f60) at syscall+0x374
> > 
> > The panic is a regression of sys/kern/kern_timeout.c r1.84. Previously,
> > soft-interrupt-driven timeouts could be deleted synchronously without
> > blocking. Now, timeout_del_barrier() can sleep regardless of the type
> > of the timeout.
> > 
> > It looks like, with small adjustments, timeout_del_barrier() can sleep
> > in sleep_finish(). The management of run queues is not affected because
> > the timeout clearing happens after it. As timeout_del_barrier() does not
> > rely on a timeout or signal catching, there should be no risk of
> > unbounded recursion or unwanted signal side effects within the sleep
> > machinery. In a way, a sleep with a timeout is higher-level than
> > one without.
> 
> I trust you on the analysis.  However this looks very fragile to me.
> 
> The use of timeout_del_barrier() which can sleep using the global sleep
> queue is worrying me.  

I think the queue handling ends in sleep_finish() when SCHED_LOCK()
is released. The timeout clearing is done outside of it.

The extra sleeping point inside sleep_finish() is subtle. It should not
matter in typical use. But is it permissible with the API? Also, if
timeout_del_barrier() sleeps, the thread's priority can change.

Note that sleep_finish() already can take an additional nap when
signal catching is enabled.

> > Note that endtsleep() can run and set P_TIMEOUT during
> > timeout_del_barrier() when the thread is blocked in cond_wait().
> > To avoid unnecessary atomic read-modify-write operations, the clearing
> > of P_TIMEOUT could be conditional, but maybe that is an unnecessary
> > optimization at this point.
> 
> I agree this optimization seems unnecessary at the moment.
> 
> > While it should be possible to make the code use timeout_del() instead
> > of timeout_del_barrier(), the outcome might not be outright better. For
> > example, sleep_setup() and endtsleep() would have to coordinate so that
> > a late-running timeout from previous sleep cycle would not disturb the
> > new cycle.
> 
> So that's the price for not having to sleep in sleep_finish(), right?

That is correct. Some synchronization is needed in any case.

> > To test the barrier path reliably, I made the code call
> > timeout_del_barrier() twice in a row. The second call is guaranteed
> > to sleep. Of course, this is not part of the patch.
> 
> ok mpi@
> 
> > Index: kern/kern_synch.c
> > ===
> > RCS file: src/sys/kern/kern_synch.c,v
> > retrieving revision 1.187
> > diff -u -p -r1.187 kern_synch.c
> > --- kern/kern_synch.c   13 May 2022 15:32:00 -  1.187
> > +++ kern/kern_synch.c   5 Jun 2022 05:04:45 -
> > @@ -370,8 +370,8 @@ sleep_setup(struct sleep_state *sls, con
> > p->p_slppri = prio & PRIMASK;
> > TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
> >  
> > -   KASSERT((p->p_flag & P_TIMEOUT) == 0);
> > if (timo) {
> > +   KASSERT((p->p_flag & P_TIMEOUT) == 0);
> > sls->sls_timeout = 1;
> > timeout_add(&p->p_sleep_to, timo);
> > }
> > @@ -432,13 +432,12 @

Fix clearing of sleep timeouts

2022-06-04 Thread Visa Hankala
Encountered the following panic:

panic: kernel diagnostic assertion "(p->p_flag & P_TIMEOUT) == 0" failed: file 
"/usr/src/sys/kern/kern_synch.c", line 373
Stopped at  db_enter+0x10:  popq%rbp
TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
 423109  57118 55 0x3  02  link
 330695  30276 55 0x3  03  link
* 46366  85501 55  0x1003  0x40804001  link
 188803  85501 55  0x1003  0x40820000K link
db_enter() at db_enter+0x10
panic(81f25d2b) at panic+0xbf
__assert(81f9a186,81f372c8,175,81f87c6c) at 
__assert+0x25
sleep_setup(800022d64bf8,800022d64c98,20,81f66ac6,0) at 
sleep_setup+0x1d8
cond_wait(800022d64c98,81f66ac6) at cond_wait+0x46
timeout_barrier(8000228a28b0) at timeout_barrier+0x109
timeout_del_barrier(8000228a28b0) at timeout_del_barrier+0xa2
sleep_finish(800022d64d90,1) at sleep_finish+0x16d
tsleep(823a5130,120,81f0b730,2) at tsleep+0xb2
sys_nanosleep(8000228a27f0,800022d64ea0,800022d64ef0) at 
sys_nanosleep+0x12d
syscall(800022d64f60) at syscall+0x374

The panic is a regression of sys/kern/kern_timeout.c r1.84. Previously,
soft-interrupt-driven timeouts could be deleted synchronously without
blocking. Now, timeout_del_barrier() can sleep regardless of the type
of the timeout.

It looks like, with small adjustments, timeout_del_barrier() can sleep
in sleep_finish(). The management of run queues is not affected because
the timeout clearing happens after it. As timeout_del_barrier() does not
rely on a timeout or signal catching, there should be no risk of
unbounded recursion or unwanted signal side effects within the sleep
machinery. In a way, a sleep with a timeout is higher-level than
one without.

Note that endtsleep() can run and set P_TIMEOUT during
timeout_del_barrier() when the thread is blocked in cond_wait().
To avoid unnecessary atomic read-modify-write operations, the clearing
of P_TIMEOUT could be conditional, but maybe that is an unnecessary
optimization at this point.

While it should be possible to make the code use timeout_del() instead
of timeout_del_barrier(), the outcome might not be outright better. For
example, sleep_setup() and endtsleep() would have to coordinate so that
a late-running timeout from previous sleep cycle would not disturb the
new cycle.

To test the barrier path reliably, I made the code call
timeout_del_barrier() twice in a row. The second call is guaranteed
to sleep. Of course, this is not part of the patch.
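
For clarity, the test hack amounts to this in the patched sleep_finish()
(illustration only, not in the diff below):

	} else {
		/* This can sleep. It must not use timeouts. */
		timeout_del_barrier(&p->p_sleep_to);
		/* Test hack: the second call always sleeps in the
		 * barrier path. */
		timeout_del_barrier(&p->p_sleep_to);
	}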

OK?

Index: kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.187
diff -u -p -r1.187 kern_synch.c
--- kern/kern_synch.c   13 May 2022 15:32:00 -  1.187
+++ kern/kern_synch.c   5 Jun 2022 05:04:45 -
@@ -370,8 +370,8 @@ sleep_setup(struct sleep_state *sls, con
p->p_slppri = prio & PRIMASK;
TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
 
-   KASSERT((p->p_flag & P_TIMEOUT) == 0);
if (timo) {
+   KASSERT((p->p_flag & P_TIMEOUT) == 0);
sls->sls_timeout = 1;
timeout_add(&p->p_sleep_to, timo);
}
@@ -432,13 +432,12 @@ sleep_finish(struct sleep_state *sls, in
 
if (sls->sls_timeout) {
if (p->p_flag & P_TIMEOUT) {
-   atomic_clearbits_int(&p->p_flag, P_TIMEOUT);
error1 = EWOULDBLOCK;
} else {
-   /* This must not sleep. */
+   /* This can sleep. It must not use timeouts. */
timeout_del_barrier(&p->p_sleep_to);
-   KASSERT((p->p_flag & P_TIMEOUT) == 0);
}
+   atomic_clearbits_int(&p->p_flag, P_TIMEOUT);
}
 
/* Check if thread was woken up because of a unwind or signal */



Re: Make pipes and sockets use KNOTE() instead of selwakeup()

2022-05-23 Thread Visa Hankala
On Tue, Feb 08, 2022 at 09:12:11AM +0000, Visa Hankala wrote:
> Now that poll(2) is based on kqueue, the old, non-MP-safe poll backend
> is not used any longer. Event sources can call KNOTE() directly instead
> of selwakeup().
> 
> This diff does the KNOTE() conversion for pipes and sockets, removing
> a kernel-locked section from a frequently used code path. The related
> event filters do not use the hint value, hence passing 0 rather than
> NOTE_SUBMIT.
 
This patch had an unexpected effect of triggering NFS server hangs.
These hangs were possibly caused by a synchronization issue that was
exposed by the patch. The NFS subsystem is not MP-safe, but the NFS
socket upcall was run without the kernel lock. Now that this has been
fixed, the selwakeup-to-KNOTE conversion might work fine.

Could someone who saw the NFS problem test this patch?

Index: kern/sys_pipe.c
===
RCS file: src/sys/kern/sys_pipe.c,v
retrieving revision 1.133
diff -u -p -r1.133 sys_pipe.c
--- kern/sys_pipe.c 13 Dec 2021 14:56:55 -  1.133
+++ kern/sys_pipe.c 8 Feb 2022 08:59:05 -
@@ -381,12 +381,7 @@ pipeselwakeup(struct pipe *cpipe)
 {
rw_assert_wrlock(cpipe->pipe_lock);
 
-   if (cpipe->pipe_state & PIPE_SEL) {
-   cpipe->pipe_state &= ~PIPE_SEL;
-   selwakeup(&cpipe->pipe_sel);
-   } else {
-   KNOTE(&cpipe->pipe_sel.si_note, 0);
-   }
+   KNOTE(&cpipe->pipe_sel.si_note, 0);
 
if (cpipe->pipe_state & PIPE_ASYNC)
pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
Index: kern/uipc_socket.c
===
RCS file: src/sys/kern/uipc_socket.c,v
retrieving revision 1.271
diff -u -p -r1.271 uipc_socket.c
--- kern/uipc_socket.c  24 Dec 2021 06:50:16 -  1.271
+++ kern/uipc_socket.c  8 Feb 2022 08:59:05 -
@@ -2049,7 +2049,7 @@ void
 sohasoutofband(struct socket *so)
 {
pgsigio(&so->so_sigio, SIGURG, 0);
-   selwakeup(&so->so_rcv.sb_sel);
+   KNOTE(&so->so_rcv.sb_sel.si_note, 0);
 }
 
 int
Index: kern/uipc_socket2.c
===
RCS file: src/sys/kern/uipc_socket2.c,v
retrieving revision 1.116
diff -u -p -r1.116 uipc_socket2.c
--- kern/uipc_socket2.c 6 Nov 2021 05:26:33 -   1.116
+++ kern/uipc_socket2.c 8 Feb 2022 08:59:06 -
@@ -423,7 +423,7 @@ sowakeup(struct socket *so, struct sockb
}
if (sb->sb_flags & SB_ASYNC)
pgsigio(&so->so_sigio, SIGIO, 0);
-   selwakeup(&sb->sb_sel);
+   KNOTE(&sb->sb_sel.si_note, 0);
 }
 
 /*



Lock kernel in nfsrv_rcv()

2022-05-13 Thread Visa Hankala
The NFS subsystem is not MP-safe yet. Take this into account
in the NFS server socket upcall by locking the kernel.

This might help with the NFS server hanging that was seen recently
as a result of the now-reverted selwakeup()-to-KNOTE() conversion.
Unfortunately, I have not been able to confirm this myself.

OK?

Index: nfs/nfs_socket.c
===
RCS file: src/sys/nfs/nfs_socket.c,v
retrieving revision 1.140
diff -u -p -r1.140 nfs_socket.c
--- nfs/nfs_socket.c17 Mar 2022 14:23:34 -  1.140
+++ nfs/nfs_socket.c13 May 2022 15:38:48 -
@@ -1561,8 +1561,10 @@ nfsrv_rcv(struct socket *so, caddr_t arg
struct uio auio;
int flags, error;
 
+   KERNEL_LOCK();
+
if ((slp->ns_flag & SLP_VALID) == 0)
-   return;
+   goto out;
 
/* Defer soreceive() to an nfsd. */
if (waitflag == M_DONTWAIT) {
@@ -1644,6 +1646,9 @@ dorecs:
if (waitflag == M_DONTWAIT &&
(slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN
nfsrv_wakenfsd(slp);
+
+out:
+   KERNEL_UNLOCK();
 }
 
 /*



Remove fo_poll from struct fileops

2022-05-07 Thread Visa Hankala
Remove unused struct fileops field fo_poll and callbacks.

(After this, VOP_POLL() is next in line for removal.)

OK?

Index: sys/dev/pci/drm/drm_linux.c
===
RCS file: src/sys/dev/pci/drm/drm_linux.c,v
retrieving revision 1.92
diff -u -p -r1.92 drm_linux.c
--- sys/dev/pci/drm/drm_linux.c 1 Mar 2022 11:50:37 -   1.92
+++ sys/dev/pci/drm/drm_linux.c 7 May 2022 10:57:52 -
@@ -2261,12 +2261,6 @@ dmabuf_ioctl(struct file *fp, u_long com
 }
 
 int
-dmabuf_poll(struct file *fp, int events, struct proc *p)
-{
-   return (0);
-}
-
-int
 dmabuf_kqfilter(struct file *fp, struct knote *kn)
 {
return (EINVAL);
@@ -2326,7 +2320,6 @@ const struct fileops dmabufops = {
.fo_read= dmabuf_read,
.fo_write   = dmabuf_write,
.fo_ioctl   = dmabuf_ioctl,
-   .fo_poll= dmabuf_poll,
.fo_kqfilter= dmabuf_kqfilter,
.fo_stat= dmabuf_stat,
.fo_close   = dmabuf_close,
@@ -2849,12 +2842,6 @@ syncfile_ioctl(struct file *fp, u_long c
 }
 
 int
-syncfile_poll(struct file *fp, int events, struct proc *p)
-{
-   return 0;
-}
-
-int
 syncfile_kqfilter(struct file *fp, struct knote *kn)
 {
return EINVAL;
@@ -2908,7 +2895,6 @@ const struct fileops syncfileops = {
.fo_read= syncfile_read,
.fo_write   = syncfile_write,
.fo_ioctl   = syncfile_ioctl,
-   .fo_poll= syncfile_poll,
.fo_kqfilter= syncfile_kqfilter,
.fo_stat= syncfile_stat,
.fo_close   = syncfile_close,
Index: sys/kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.187
diff -u -p -r1.187 kern_event.c
--- sys/kern/kern_event.c   6 May 2022 13:12:16 -   1.187
+++ sys/kern/kern_event.c   7 May 2022 10:57:53 -
@@ -49,7 +50,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -79,7 +79,6 @@ int   kqueue_read(struct file *, struct ui
 intkqueue_write(struct file *, struct uio *, int);
 intkqueue_ioctl(struct file *fp, u_long com, caddr_t data,
struct proc *p);
-intkqueue_poll(struct file *fp, int events, struct proc *p);
 intkqueue_kqfilter(struct file *fp, struct knote *kn);
 intkqueue_stat(struct file *fp, struct stat *st, struct proc *p);
 intkqueue_close(struct file *fp, struct proc *p);
@@ -107,7 +106,6 @@ const struct fileops kqueueops = {
.fo_read= kqueue_read,
.fo_write   = kqueue_write,
.fo_ioctl   = kqueue_ioctl,
-   .fo_poll= kqueue_poll,
.fo_kqfilter= kqueue_kqfilter,
.fo_stat= kqueue_stat,
.fo_close   = kqueue_close
@@ -1526,25 +1623,6 @@ kqueue_ioctl(struct file *fp, u_long com
 }
 
 int
-kqueue_poll(struct file *fp, int events, struct proc *p)
-{
-   struct kqueue *kq = (struct kqueue *)fp->f_data;
-   int revents = 0;
-
-   if (events & (POLLIN | POLLRDNORM)) {
-   mtx_enter(&kq->kq_lock);
-   if (kq->kq_count) {
-   revents |= events & (POLLIN | POLLRDNORM);
-   } else {
-   selrecord(p, &kq->kq_sel);
-   kq->kq_state |= KQ_SEL;
-   }
-   mtx_leave(&kq->kq_lock);
-   }
-   return (revents);
-}
-
-int
 kqueue_stat(struct file *fp, struct stat *st, struct proc *p)
 {
struct kqueue *kq = fp->f_data;
Index: sys/kern/sys_pipe.c
===
RCS file: src/sys/kern/sys_pipe.c,v
retrieving revision 1.137
diff -u -p -r1.137 sys_pipe.c
--- sys/kern/sys_pipe.c 6 May 2022 13:09:41 -   1.137
+++ sys/kern/sys_pipe.c 7 May 2022 10:57:53 -
@@ -40,7 +40,6 @@
 #include 
 #include 
 #include 
-#include 
 #ifdef KTRACE
 #include 
 #endif
@@ -61,7 +60,6 @@ struct pipe_pair {
 intpipe_read(struct file *, struct uio *, int);
 intpipe_write(struct file *, struct uio *, int);
 intpipe_close(struct file *, struct proc *);
-intpipe_poll(struct file *, int events, struct proc *);
 intpipe_kqfilter(struct file *fp, struct knote *kn);
 intpipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
 intpipe_stat(struct file *fp, struct stat *ub, struct proc *p);
@@ -70,7 +68,6 @@ static const struct fileops pipeops = {
.fo_read= pipe_read,
.fo_write   = pipe_write,
.fo_ioctl   = pipe_ioctl,
-   .fo_poll= pipe_poll,
.fo_kqfilter= pipe_kqfilter,
.fo_stat= pipe_stat,
.fo_close   = pipe_close
@@ -719,46 +716,6 @@ pipe_ioctl(struct file *fp, u_long cmd, 
 }
 
 int
-pipe_poll(struct file *fp, int events, struct proc *p)
-{
-   struct pipe *rpipe = fp->f_data, *wpipe;
-   struct rwlock *lock = rpipe->pipe_lock;
-   int revents = 

Re: pluart(4): fifo support

2022-05-01 Thread Visa Hankala
On Sat, Apr 30, 2022 at 09:40:24AM +0200, Anton Lindqvist wrote:
> On Sun, Mar 13, 2022 at 04:17:07PM +0100, Mark Kettenis wrote:
> > > Date: Fri, 11 Mar 2022 07:53:13 +0100
> > > From: Anton Lindqvist 
> > > 
> > > > On Tue, Mar 08, 2022 at 01:44:47PM +0000, Visa Hankala wrote:
> > > > On Tue, Mar 08, 2022 at 08:04:36AM +0100, Anton Lindqvist wrote:
> > > > > > On Mon, Mar 07, 2022 at 07:36:35AM +0000, Visa Hankala wrote:
> > > > > > I still think that checking TXFF and using the same code for both
> > > > > > SBSA and true PL011 UARTs would be the best choice. This would avoid
> > > > > > fragmenting the code and improve robustness by relying on 
> > > > > > functionality
> > > > > > that is common to the different controller variants.
> > > > > 
> > > > > Fair enough, new diff.
> > > > 
> > > > Maybe the comments should omit the FIFO space description and just
> > > > mention the lack of the level control register in the SBSA UART
> > > > register interface.
> > > 
> > > I ended up tweaking the comments before committing. Thanks for all the
> > > feedback.
> > > 
> > 
> > Hi Anton,
> > 
> > This diff seems to break things.  When I boot my rpi4 it now prints:
> > 
> >   pluart0 at simplebus0: rev 0, 16 byte fifo
> >   pluart0: console
> >   bcmbsc0 at simplebus0
> >   iic0 at bcmbsc0
> > 
> > so it appears that a carriage return character is lost here.
> > 
> > Later on output stops at:
> > 
> >   reordering libraries: done.
> > 
> > and only when I reboot the machine the login prompt appears, but with
> > some weird respawning:
> > 
> >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > 
> >   login: init: getty repeating too quickly on port /dev/console, sleeping
> >   init: getty repeating too quickly on port /dev/console, sleeping
> > 
> >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > 
> >   login:
> >   OpenBSD/arm64 (rutter.sibelius.xs4all.nl) (console)
> > 
> >   login:
> > 
> > If you don't have a quick fix for this, may I suggest reverting the
> > commit?  We're heading towards release and we don't want the serial
> > console on the rpi4 to be broken.
> 
> Circling back to this: what happens is that no tx interrupt is raised
> when sending less data than the configured interrupt fifo level, causing
> the tty to end up in a forever busy state. Clearing the busy flag after
> a successful transmission of all queued data solves the problem.

Are you sure about the behaviour of the interrupt?

One possible problem is that pluart_intr() uses the raw, unmasked,
interrupt status to clear interrupts. Your previous patch always
disabled the Tx interrupt whenever the raw status indicated a Tx FIFO
level event.

This new patch might very well be correct. However, it feels strange
if the hardware raises the Tx interrupt only at one specific level of
FIFO state change.

It would be nice to know if a comstart()-style arrangement of interrupt
masking worked in pluart_start().
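
For reference, such an arrangement would look roughly like this in
pluart_start() (sketch only; the unit lookup and an sc_imsc softc field
caching the interrupt mask are assumptions, and the FIFO would then be
filled from the interrupt handler):

void
pluart_start(struct tty *tp)
{
	struct pluart_softc *sc = pluart_cd.cd_devs[minor(tp->t_dev)];
	int s;

	s = spltty();
	if (ISSET(tp->t_state, TS_BUSY | TS_TIMEOUT | TS_TTSTOP))
		goto out;
	ttwakeupwr(tp);
	if (tp->t_outq.c_cc == 0)
		goto out;
	SET(tp->t_state, TS_BUSY);

	/* Unmask the Tx interrupt; pluart_intr() fills the FIFO. */
	sc->sc_imsc |= UART_IMSC_TXIM;
	bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_IMSC, sc->sc_imsc);
out:
	splx(s);
}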



EVFILT_USER and kevent(2)

2022-04-30 Thread Visa Hankala
It has been asked in the past if OpenBSD's kevent(2) should implement
user event filters, also known as EVFILT_USER. This filter type
originates from FreeBSD but is now available also on DragonFly BSD,
NetBSD, and macOS.

Below is an implementation of EVFILT_USER. The logic should be fairly
straightforward. However, the filter type needs a special case in
kqueue_register() to allow triggering a previously registered user
event without using EV_ADD.
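
For illustration, the intended usage from a program is to register the
event once with EV_ADD and to trigger it later with NOTE_TRIGGER alone,
which is exactly the case that needs the special handling (this example
relies on the proposed patch):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Register user event 1. */
	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");

	/* Trigger it later, without EV_ADD. */
	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: trigger");

	/* Collect the activated event. */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent: wait");
	printf("ident %lu fflags 0x%x\n", (unsigned long)kev.ident,
	    kev.fflags & NOTE_FFLAGSMASK);
	return 0;
}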

The code limits the number of user events. Otherwise the user could
allocate copious amounts of kernel memory. The limit is per process
so that programs will not interfere with each other. The current limit
is arbitrary and might need adjusting later. Hopefully a sysctl knob
will not be necessary.

I am in two minds about EVFILT_USER. On the one hand, having it on
OpenBSD might help with ports. On the other hand, it makes the kernel
perform a task that userspace can already handle using existing
interfaces.

Index: lib/libc/sys/kqueue.2
===
RCS file: src/lib/libc/sys/kqueue.2,v
retrieving revision 1.46
diff -u -p -r1.46 kqueue.2
--- lib/libc/sys/kqueue.2   31 Mar 2022 17:27:16 -  1.46
+++ lib/libc/sys/kqueue.2   30 Apr 2022 13:33:10 -
@@ -487,6 +487,44 @@ A device change event has occurred, e.g.
 On return,
 .Fa fflags
 contains the events which triggered the filter.
+.It Dv EVFILT_USER
+Establishes a user event identified by
+.Va ident
+which is not associated with any kernel mechanism but is triggered by
+user level code.
+The lower 24 bits of the
+.Va fflags
+may be used for user defined flags and manipulated using the following:
+.Bl -tag -width XXNOTE_FFLAGSMASK
+.It Dv NOTE_FFNOP
+Ignore the input
+.Va fflags .
+.It Dv NOTE_FFAND
+Bitwise AND
+.Va fflags .
+.It Dv NOTE_FFOR
+Bitwise OR
+.Va fflags .
+.It Dv NOTE_FFCOPY
+Copy
+.Va fflags .
+.It Dv NOTE_FFCTRLMASK
+Control mask for
+.Va fflags .
+.It Dv NOTE_FFLAGSMASK
+User defined flag mask for
+.Va fflags .
+.El
+.Pp
+A user event is triggered for output with the following:
+.Bl -tag -width XXNOTE_FFLAGSMASK
+.It Dv NOTE_TRIGGER
+Cause the event to be triggered.
+.El
+.Pp
+On return,
+.Va fflags
+contains the users defined flags in the lower 24 bits.
 .El
 .Sh RETURN VALUES
 .Fn kqueue
Index: regress/sys/kern/kqueue/Makefile
===
RCS file: src/regress/sys/kern/kqueue/Makefile,v
retrieving revision 1.31
diff -u -p -r1.31 Makefile
--- regress/sys/kern/kqueue/Makefile30 Mar 2022 05:11:52 -  1.31
+++ regress/sys/kern/kqueue/Makefile30 Apr 2022 13:33:12 -
@@ -4,7 +4,7 @@ PROG=   kqueue-test
 CFLAGS+=-Wall
 SRCS=  kqueue-pipe.c kqueue-fork.c main.c kqueue-process.c kqueue-random.c \
kqueue-pty.c kqueue-tun.c kqueue-signal.c kqueue-fdpass.c \
-   kqueue-flock.c kqueue-timer.c kqueue-regress.c
+   kqueue-flock.c kqueue-timer.c kqueue-regress.c kqueue-user.c
 LDADD= -levent -lutil
 DPADD= ${LIBEVENT} ${LIBUTIL}
 
@@ -50,6 +50,8 @@ kq-regress-5: ${PROG}
./${PROG} -R5
 kq-regress-6: ${PROG}
./${PROG} -R6
+kq-user: ${PROG}
+   ./${PROG} -u
 
 TESTS+=kq-fdpass
 TESTS+=kq-flock
@@ -70,6 +72,7 @@ TESTS+=   kq-reset-timer
 TESTS+=kq-signal
 TESTS+=kq-timer
 TESTS+=kq-tun
+TESTS+=kq-user
 
 REGRESS_TARGETS=${TESTS}
 REGRESS_ROOT_TARGETS=kq-pty-1
Index: regress/sys/kern/kqueue/kqueue-user.c
===
RCS file: regress/sys/kern/kqueue/kqueue-user.c
diff -N regress/sys/kern/kqueue/kqueue-user.c
--- /dev/null   1 Jan 1970 00:00:00 -
+++ regress/sys/kern/kqueue/kqueue-user.c   30 Apr 2022 13:33:12 -
@@ -0,0 +1,189 @@
+/* $OpenBSD$   */
+
+/*
+ * Copyright (c) 2022 Visa Hankala
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "main.h"
+
+int
+do_user(void)
+{
+   const struct timespec ts = { 0, 1 };
+   struct kevent kev[2];
+   int dummy, dummy2, i, kq, n;
+
+   ASS((kq = kqueue()) >= 0,
+   warn("kqueue"));
+
+   /* Set up an event. */
+   EV_SET(&kev[0], 1, EVFILT_USE

Re: Make pipes and sockets use KNOTE() instead of selwakeup()

2022-04-25 Thread Visa Hankala
On Tue, Feb 08, 2022 at 09:12:11AM +0000, Visa Hankala wrote:
> Now that poll(2) is based on kqueue, the old, non-MP-safe poll backend
> is not used any longer. Event sources can call KNOTE() directly instead
> of selwakeup().
> 
> This diff does the KNOTE() conversion for pipes and sockets, removing
> a kernel-locked section from a frequently used code path. The related
> event filters do not use the hint value, hence passing 0 rather than
> NOTE_SUBMIT.

This patch is still valid. It also paves the way for the removal
of fo_poll member from struct file.

Testers are welcome.

> Index: kern/sys_pipe.c
> ===
> RCS file: src/sys/kern/sys_pipe.c,v
> retrieving revision 1.133
> diff -u -p -r1.133 sys_pipe.c
> --- kern/sys_pipe.c   13 Dec 2021 14:56:55 -  1.133
> +++ kern/sys_pipe.c   8 Feb 2022 08:59:05 -
> @@ -381,12 +381,7 @@ pipeselwakeup(struct pipe *cpipe)
>  {
>   rw_assert_wrlock(cpipe->pipe_lock);
>  
> - if (cpipe->pipe_state & PIPE_SEL) {
> - cpipe->pipe_state &= ~PIPE_SEL;
> - selwakeup(&cpipe->pipe_sel);
> - } else {
> - KNOTE(&cpipe->pipe_sel.si_note, 0);
> - }
> + KNOTE(&cpipe->pipe_sel.si_note, 0);
>  
>   if (cpipe->pipe_state & PIPE_ASYNC)
>   pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
> Index: kern/uipc_socket.c
> ===
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.271
> diff -u -p -r1.271 uipc_socket.c
> --- kern/uipc_socket.c24 Dec 2021 06:50:16 -  1.271
> +++ kern/uipc_socket.c8 Feb 2022 08:59:05 -
> @@ -2049,7 +2049,7 @@ void
>  sohasoutofband(struct socket *so)
>  {
>   pgsigio(&so->so_sigio, SIGURG, 0);
> - selwakeup(&so->so_rcv.sb_sel);
> + KNOTE(&so->so_rcv.sb_sel.si_note, 0);
>  }
>  
>  int
> Index: kern/uipc_socket2.c
> ===
> RCS file: src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.116
> diff -u -p -r1.116 uipc_socket2.c
> --- kern/uipc_socket2.c   6 Nov 2021 05:26:33 -   1.116
> +++ kern/uipc_socket2.c   8 Feb 2022 08:59:06 -
> @@ -423,7 +423,7 @@ sowakeup(struct socket *so, struct sockb
>   }
>   if (sb->sb_flags & SB_ASYNC)
>   pgsigio(&so->so_sigio, SIGIO, 0);
> - selwakeup(&sb->sb_sel);
> + KNOTE(&sb->sb_sel.si_note, 0);
>  }
>  
>  /*
> 



Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-04-25 Thread Visa Hankala
On Thu, Apr 21, 2022 at 11:10:40PM +0200, Mark Kettenis wrote:
> > Date: Thu, 21 Apr 2022 22:17:31 +0200
> > From: Alexander Bluhm 
> > 
> > On Mon, Apr 18, 2022 at 08:33:06AM +, Visa Hankala wrote:
> > > I think the sanest solution is to add the release and acquire barriers
> > > in refcnt_rele().
> > 
> > Getting memory barriers right is too complicated for developers
> > doing MP stuff.  The existing locking and refcount primitives have
> > to implement that functionality.  I am on visa@'s side and would
> > prefer a memory barrier in refcount API instead of searching for
> > races in MP code.
> > 
> > Better waste some CPU cycles in some cases than having strange
> > behavior due to missing barries in other cases.
> 
> I don't disagree with that.  The refcount API is at the same level as
> the mutex API and explicit memory barriers should not be needed to use
> it safely.  It's just that it isn't obvious what the right memory
> ordering semantics are for a refcount API.  For some reason this
> doesn't seem to be something that's widely discussed.
> 
> The Linux include/linux/refcount.h file has the following comment at
> the start:
> 
>  * Memory ordering
>  * ===
>  *
>  * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
>  * and provide only what is strictly required for refcounts.
>  *
>  * The increments are fully relaxed; these will not provide ordering. The
>  * rationale is that whatever is used to obtain the object we're increasing 
> the
>  * reference count on will provide the ordering. For locked data structures,
>  * its the lock acquire, for RCU/lockless data structures its the dependent
>  * load.
>  *
>  * Do note that inc_not_zero() provides a control dependency which will order
>  * future stores against the inc, this ensures we'll never modify the object
>  * if we did not in fact acquire a reference.
>  *
>  * The decrements will provide release order, such that all the prior loads 
> and
>  * stores will be issued before, it also provides a control dependency, which
>  * will order us against the subsequent free().
>  *
>  * The control dependency is against the load of the cmpxchg (ll/sc) that
>  * succeeded. This means the stores aren't fully ordered, but this is fine
>  * because the 1->0 transition indicates no concurrency.
>  *
>  * Note that the allocator is responsible for ordering things between free()
>  * and alloc().
>  *
>  * The decrements dec_and_test() and sub_and_test() also provide acquire
>  * ordering on success.
> 
> That is still a bit murky, but I think it matches what visa@'s diff
> does for refcnt_rele(9).  And doing what Linux does in its refcount
> API is probably a good thing.  But I don't think it covers the
> membar_sync() added to the end of refcnt_finalize(9), so I'm not
> confident about that bit, and would like to understand that better.

The patch uses membar_sync(), and not membar_enter(), after the loop
in refcnt_finalize() because subsequent memory operations should hinge
on the load of r_refs.

membar_enter() is usable when the reference point is a store.
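
To put it in code, the intended placement is roughly the following
(sketch of the kern_synch.c side, not the literal diff):

int
refcnt_rele(struct refcnt *r)
{
	u_int refs;

	membar_exit_before_atomic();
	refs = atomic_dec_int_nv(&r->r_refs);
	KASSERT(refs != ~0);
	if (refs == 0) {
		membar_enter_after_atomic();
		return (1);
	}
	return (0);
}

void
refcnt_finalize(struct refcnt *r, const char *wmesg)
{
	struct sleep_state sls;
	u_int refs;

	membar_exit_before_atomic();
	refs = atomic_dec_int_nv(&r->r_refs);
	KASSERT(refs != ~0);
	while (refs) {
		sleep_setup(&sls, r, PWAIT, wmesg, 0);
		refs = atomic_load_int(&r->r_refs);
		sleep_finish(&sls, refs);
	}
	/* Order the destructor's accesses after the final load of r_refs. */
	membar_sync();
}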

> The other issue I have with the diff is that it documentations the
> memory ordering in terms of acquire and release which is not what we
> do in other places such as the membar_enter(9) man page.  Maybe this
> should explicitly call out the memory ordering like what the Linux
> comment does.

I have updated the documentation, though I am not sure if the outcome
is an improvement.

Index: share/man/man9/refcnt_init.9
===
RCS file: src/share/man/man9/refcnt_init.9,v
retrieving revision 1.2
diff -u -p -r1.2 refcnt_init.9
--- share/man/man9/refcnt_init.916 Mar 2022 14:13:01 -  1.2
+++ share/man/man9/refcnt_init.925 Apr 2022 14:34:05 -
@@ -74,6 +74,17 @@ There may only be one caller to
 per refcnt
 .Fa r .
 .Pp
+.Fn refcnt_rele ,
+.Fn refcnt_rele_wake
+and
+.Fn refcnt_finalize
+order prior memory loads and stores before the release of the reference.
+The functions enforce control dependency so that after the final reference
+has been released, subsequent loads and stores happen after the release.
+These ensure that concurrent accesses cease before the object's destructor
+runs and that the destructor sees all updates done during the lifetime
+of the object.
+.Pp
 .Fn refcnt_shared
 tests if the object has multiple references.
 .Pp
Index: sys/kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.185
diff -u -p -r1.185 kern_synch.c
--- sys/kern/kern_synch.c   18 Mar 2022 15:32:06 -  1.185
+++ sys/kern/k

Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-04-18 Thread Visa Hankala
On Mon, Mar 14, 2022 at 04:14:47PM +, Visa Hankala wrote:
> On Mon, Mar 14, 2022 at 02:01:07AM -0700, Philip Guenther wrote:
> > On Mon, Mar 14, 2022 at 12:47 AM Visa Hankala  wrote:
> > 
> > > On Sun, Mar 13, 2022 at 06:26:19PM -0700, Philip Guenther wrote:
> > > > On Sun, Mar 13, 2022 at 10:27 AM Visa Hankala  wrote:
> > > >
> > > > > On Sun, Mar 13, 2022 at 04:29:44PM +0100, Mark Kettenis wrote:
> > > > >
> > > > ...
> > > >
> > > > > > Under what circumstances does memory ordering matter for these
> > > > > > interfaces?
> > > > >
> > > > > Consider the following scenario:
> > > > >
> > > > > struct widget {
> > > > > struct refcnt   w_refcnt;
> > > > > /* more fields spanning many cache lines */
> > > > > ...
> > > > > int w_var;
> > > > > };
> > > > >
> > > > > First, CPU 1 executes:
> > > > >
> > > > > w->w_var = 1;
> > > > > refcnt_rele(&w->w_refcnt);  /* remains above zero */
> > > > >
> > > >
> > > > Having incremented the refcnt previous does not give this thread
> > > exclusive
> > > > access to 'w', so if it's writing to w->w_var then it must either
> > > > a) have some sort of write lock taken, which it will release after this
> > > and
> > > > which will contain the necessary member, OR
> > > > b) have the only access patch to this structure (i.e, it's not yet
> > > > 'published' into structures which can be seen by other threads), in 
> > > > which
> > > > case the operations which do that 'publishing' of the access to 'w'
> > > (adding
> > > > it to a global list, etc) must include the necessary membar.
> > >
> > > Lets change the sequence to this:
> > >
> > > local_var = atomic_load_int(&w->w_var);
> > > refcnt_rele(&w->w_refcnt);
> > >
> > > Without the release barrier, is the load guaranteed to happen before
> > > the reference count is decremented?
> > >
> > 
> > That's completely uncomparable to what you described before for "CPU 1"
> > because there's no write to anything but the refcnt.
> > 
> > If no one writes to the object that is ref counted, then there are no
> > visibility problems, no?
> > 
> > 
> > > Next, CPU 2 executes:
> > > > >
> > > > > if (refcnt_rele(&w->w_refcnt))  /* refcnt drops to zero */
> > > > > free(w);
> > > > >
> > > >
> > > > How did CPU 2 get what is now exclusive access to 'w' without any
> > > membars?
> > > > If that's possible then it was just accessing 'w' and possibly not 
> > > > seeing
> > > > the update to w->w_var even _before_ the refcnt_rele(), so putting a
> > > membar
> > > > in refcnt_rele() hides the incorrect code by suppressing the later 
> > > > crash!
> > > >
> > > > If these membars appear to help then the code is and remains broken.
> > > This
> > > > change should not be done.
> > >
> > > It is not uncommon to see something like below:
> > >
> > > Access object with read intent:
> > >
> > > mtx_enter(_lock);
> > > w = lookup_from_list();
> > > if (w != NULL)
> > > refcnt_take(&w->w_refcnt);
> > > mtx_leave(_lock);
> > > if (w == NULL)
> > > return;
> > > ...
> > >
> > 
> > No writes to *w described *OR LEGAL*.
> > 
> > 
> > 
> > > if (refcnt_rele(&w->w_refcnt))
> > > free(w);
> > 
> > Delete object:
> > >
> > > mtx_enter(_lock);
> > > w = lookup_from_list();
> > > if (w != NULL)
> > > remove_from_list(w);
> > > mtx_leave(_lock);
> > >
> > 
> > This does the 'release' on the change to w's list link(s).
> > 
> > 
> > 
> > > /* Release list's reference. */
> > > if (w != NULL && refcnt_rele(&w->w_refcnt))
> > > free(w);
> > >
> > > Above, any refcnt_rele() 

Re: vfs: document (and correct) the protection required for manipulating v_numoutput

2022-04-12 Thread Visa Hankala
On Sun, Mar 27, 2022 at 03:36:20PM +0200, Sebastien Marie wrote:
> v_numoutput is a struct member of vnode which is used to keep track the 
> number 
> of writes in progress.
> 
> in several function comments, it is marked as "Manipulates v_numoutput. Must 
> be 
> called at splbio()".
> 
> So I added a "[B]" mark in the comment to properly document the need of 
> IPL_BIO 
> protection.
> 
> Next, I audited the tree for usage. I found 2 occurrences of v_numoutput 
> (modification) without the required protection, inside dev/softraid.c. I 
> added 
> them.
> 
> Comments or OK ?

Please move the declarations of `s' next to the other variable
declarations at the starts of the functions. With that, OK visa@

> Index: dev/softraid.c
> ===
> RCS file: /cvs/src/sys/dev/softraid.c,v
> retrieving revision 1.422
> diff -u -p -r1.422 softraid.c
> --- dev/softraid.c20 Mar 2022 13:14:02 -  1.422
> +++ dev/softraid.c27 Mar 2022 13:28:55 -
> @@ -437,8 +437,12 @@ sr_rw(struct sr_softc *sc, dev_t dev, ch
>   b.b_resid = bufsize;
>   b.b_vp = vp;
>  
> - if ((b.b_flags & B_READ) == 0)
> + if ((b.b_flags & B_READ) == 0) {
> + int s;
> + s = splbio();
>   vp->v_numoutput++;
> + splx(s);
> + }
>  
>   LIST_INIT(&b.b_dep);
>   VOP_STRATEGY(vp, &b);
> @@ -2006,8 +2010,12 @@ sr_ccb_rw(struct sr_discipline *sd, int 
>   ccb->ccb_buf.b_vp = sc->src_vn;
>   ccb->ccb_buf.b_bq = NULL;
>  
> - if (!ISSET(ccb->ccb_buf.b_flags, B_READ))
> + if (!ISSET(ccb->ccb_buf.b_flags, B_READ)) {
> + int s;
> + s = splbio();
>   ccb->ccb_buf.b_vp->v_numoutput++;
> + splx(s);
> + }
>  
>   LIST_INIT(&ccb->ccb_buf.b_dep);
>  
> Index: sys/vnode.h
> ===
> RCS file: /cvs/src/sys/sys/vnode.h,v
> retrieving revision 1.163
> diff -u -p -r1.163 vnode.h
> --- sys/vnode.h   12 Dec 2021 09:14:59 -  1.163
> +++ sys/vnode.h   27 Mar 2022 13:28:56 -
> @@ -89,6 +89,7 @@ RBT_HEAD(namecache_rb_cache, namecache);
>   * Locks used to protect struct members in struct vnode:
>   *   a   atomic
>   *   V   vnode_mtx
> + *   B   IPL_BIO
>   */
>  struct uvm_vnode;
>  struct vnode {
> @@ -113,7 +114,7 @@ struct vnode {
>   struct  buf_rb_bufs v_bufs_tree;/* lookup of all bufs */
>   struct  buflists v_cleanblkhd;  /* clean blocklist head */
>   struct  buflists v_dirtyblkhd;  /* dirty blocklist head */
> - u_int   v_numoutput;/* num of writes in progress */
> + u_int   v_numoutput;/* [B] num of writes in progress */
>   LIST_ENTRY(vnode) v_synclist;   /* vnode with dirty buffers */
>   union {
>   struct mount*vu_mountedhere;/* ptr to mounted vfs (VDIR) */
> 



Re: Kill selrecord()

2022-04-12 Thread Visa Hankala
On Mon, Apr 11, 2022 at 07:14:46PM +0200, Martin Pieuchot wrote:
> Now that poll(2) & select(2) use the kqueue backend under the hood we
> can start retiring the old machinery. 
> 
> The diff below does not touch driver definitions, however it :
> 
> - kills selrecord() & doselwakeup()
> 
> - make it obvious that `kern.nselcoll' is now always 0 
> 
> - Change all poll/select hooks to return 0
> 
> - Kill a seltid check in wsdisplaystart() which is now always true
> 
> In a later step we could remove the *_poll() requirement from device
> drivers and kill seltrue() & selfalse().
> 
> ok?

I was planning to wait for two weeks after the general availability
of the 7.1 release, and then begin the dismantling in logical order
from fo_poll.

The final tier of the new code's testing will begin when release users
upgrade their production machines. Because of this, I would refrain
from burning bridges just yet.



gpio: Add missing device_unref()

2022-04-10 Thread Visa Hankala
Make gpio(4) release the device reference that device_lookup() takes.

OK?

Index: dev/gpio/gpio.c
===
RCS file: src/sys/dev/gpio/gpio.c,v
retrieving revision 1.16
diff -u -p -r1.16 gpio.c
--- dev/gpio/gpio.c 6 Apr 2022 18:59:28 -   1.16
+++ dev/gpio/gpio.c 10 Apr 2022 14:45:40 -
@@ -53,6 +53,7 @@ int   gpio_detach(struct device *, int);
 intgpio_search(struct device *, void *, void *);
 intgpio_print(void *, const char *);
 intgpio_pinbyname(struct gpio_softc *, char *gp_name);
+intgpio_ioctl(struct gpio_softc *, u_long, caddr_t, int);
 
 const struct cfattach gpio_ca = {
sizeof (struct gpio_softc),
@@ -249,16 +250,20 @@ int
 gpioopen(dev_t dev, int flag, int mode, struct proc *p)
 {
struct gpio_softc *sc;
+   int error = 0;
 
sc = (struct gpio_softc *)device_lookup(&gpio_cd, minor(dev));
if (sc == NULL)
return (ENXIO);
 
if (sc->sc_opened)
-   return (EBUSY);
-   sc->sc_opened = 1;
+   error = EBUSY;
+   else
+   sc->sc_opened = 1;
 
-   return (0);
+   device_unref(&sc->sc_dev);
+
+   return (error);
 }
 
 int
@@ -272,6 +277,8 @@ gpioclose(dev_t dev, int flag, int mode,
 
sc->sc_opened = 0;
 
+   device_unref(&sc->sc_dev);
+
return (0);
 }
 
@@ -287,9 +294,8 @@ gpio_pinbyname(struct gpio_softc *sc, ch
 }
 
 int
-gpioioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+gpio_ioctl(struct gpio_softc *sc, u_long cmd, caddr_t data, int flag)
 {
-   struct gpio_softc *sc;
gpio_chipset_tag_t gc;
struct gpio_info *info;
struct gpio_pin_op *op;
@@ -301,10 +307,6 @@ gpioioctl(dev_t dev, u_long cmd, caddr_t
struct device *dv;
int pin, value, flags, npins, found;
 
-   sc = (struct gpio_softc *)device_lookup(&gpio_cd, minor(dev));
-   if (sc == NULL)
-   return (ENXIO);
-
gc = sc->sc_gc;
 
switch (cmd) {
@@ -520,4 +522,21 @@ gpioioctl(dev_t dev, u_long cmd, caddr_t
}
 
return (0);
+}
+
+int
+gpioioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+   struct gpio_softc *sc;
+   int error;
+
+   sc = (struct gpio_softc *)device_lookup(&gpio_cd, minor(dev));
+   if (sc == NULL)
+   return (ENXIO);
+
+   error = gpio_ioctl(sc, cmd, data, flag);
+
+   device_unref(&sc->sc_dev);
+
+   return (error);
 }



Re: refcount btrace

2022-04-07 Thread Visa Hankala
On Thu, Apr 07, 2022 at 07:55:11PM +0200, Alexander Bluhm wrote:
> On Wed, Mar 23, 2022 at 06:13:27PM +0100, Alexander Bluhm wrote:
> > In my opinion tracepoints give insight at minimal cost.  It is worth
> > it to have it in GENERIC to make it easy to use.
> 
> After release I want to revive the btrace of refcounts discussion.
> 
> As mpi@ mentioned the idea of dt(4) is to have these trace points
> in GENERIC kernel.  If you want to hunt a bug, just turn it on.
> Refcounting is a common place for bugs; leaks can be detected easily.
> 
> The alternative are some defines that you can compile in and access
> from ddb.  This is more work and you would have to implement it for
> every refcount.
> https://marc.info/?l=openbsd-tech&m=163786435916039&w=2
> 
> There is no measuarable performance difference.  dt(4) is written
> in a way that is is only one additional branch.  At least my goal
> is to add trace points to useful places when we identify them.

DT_INDEX_ENTER() still checks the index first, so it has two branches
in practice.

I think dt_tracing should be checked first so that it serves as
a gateway to the trace code. Under normal operation, the check's
outcome is always the same, which is easy even for simple branch
predictors.
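
In other words, something along these lines (sketch only; the argument
list and the slow-path hook dt_index_hook() are placeholders, not the
names used in the diff):

#define DT_INDEX_ENTER(_p, _idx, _arg) do {				\
	if (__predict_false(dt_tracing) &&				\
	    __predict_false((_idx) > 0))				\
		dt_index_hook((_p), (_idx), (_arg));			\
} while (0)

This way dt_tracing alone decides whether the rest of the check is ever
evaluated, and under normal operation the branch falls through.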

I have a slight suspicion that dt(4) is now becoming a way to add code
that would be otherwise unacceptable. Also, how "durable" are trace
points perceived? Is an added trace point an achieved advantage that
is difficult to remove even when its utility has diminished? There is
a similarity to (ad hoc) debug printfs too.



__read_mostly

2022-03-20 Thread Visa Hankala
Linux uses pseudo-attribute __read_mostly to tag variables that are read
frequently but written seldom. These variables are grouped together
at link time. This improves cache usage by reducing false sharing.

__read_mostly has been adopted by FreeBSD, DragonFly BSD and NetBSD.
The former two also have __read_frequently. It aims to improve
the placement of hot variables even further.

The following patch shows an implementation of __read_mostly on amd64
and a few samples of the tag's usage. It does not add __read_frequently
as this attribute seems somewhat redundant.

A downside of this is that it becomes easier to guess the addresses
of the tagged variables.

Index: arch/amd64/conf/ld.script
===
RCS file: src/sys/arch/amd64/conf/ld.script,v
retrieving revision 1.17
diff -u -p -r1.17 ld.script
--- arch/amd64/conf/ld.script   7 Mar 2021 23:10:54 -   1.17
+++ arch/amd64/conf/ld.script   20 Mar 2022 05:49:46 -
@@ -117,6 +117,8 @@ SECTIONS
.data : AT (__kernel_data_phys)
{
__data_start = ABSOLUTE(.);
+   *(.data.read_mostly)
+   . = ALIGN(128);
*(.data .data.*)
} :data =0x
 
Index: dev/dt/dt_dev.c
===
RCS file: src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.22
diff -u -p -r1.22 dt_dev.c
--- dev/dt/dt_dev.c 27 Feb 2022 10:14:01 -  1.22
+++ dev/dt/dt_dev.c 20 Mar 2022 05:49:46 -
@@ -110,7 +110,7 @@ unsigned int dt_nprobes; /* [I] # of p
 SIMPLEQ_HEAD(, dt_probe)   dt_probe_list;  /* [I] list of probes */
 
 struct rwlock  dt_lock = RWLOCK_INITIALIZER("dtlk");
-volatile uint32_t  dt_tracing = 0; /* [K] # of processes tracing */
+volatile __read_mostly uint32_t dt_tracing = 0; /* [K] # of processes tracing */
 
 int allowdt;
 
Index: dev/pci/drm/include/linux/compiler.h
===
RCS file: src/sys/dev/pci/drm/include/linux/compiler.h,v
retrieving revision 1.8
diff -u -p -r1.8 compiler.h
--- dev/pci/drm/include/linux/compiler.h19 Jan 2022 02:49:05 -  
1.8
+++ dev/pci/drm/include/linux/compiler.h20 Mar 2022 05:49:46 -
@@ -12,7 +12,6 @@
 #define __force
 #define __acquires(x)
 #define __releases(x)
-#define __read_mostly
 #define __iomem
 #define __must_check
 #define __init
Index: kern/init_main.c
===
RCS file: src/sys/kern/init_main.c,v
retrieving revision 1.315
diff -u -p -r1.315 init_main.c
--- kern/init_main.c22 Feb 2022 01:15:01 -  1.315
+++ kern/init_main.c20 Mar 2022 05:49:47 -
@@ -131,7 +131,7 @@ extern  struct user *proc0paddr;
 
 struct vnode *rootvp, *swapdev_vp;
 intboothowto;
-intdb_active = 0;
+__read_mostly int db_active = 0;
 intncpus =  1;
 intncpusfound = 1; /* number of cpus we find */
 volatile int start_init_exec;  /* semaphore for start_init() */
Index: kern/subr_prf.c
===
RCS file: src/sys/kern/subr_prf.c,v
retrieving revision 1.105
diff -u -p -r1.105 subr_prf.c
--- kern/subr_prf.c 20 Jan 2022 17:11:30 -  1.105
+++ kern/subr_prf.c 20 Mar 2022 05:49:47 -
@@ -97,8 +97,13 @@ struct mutex kprintf_mutex =
  */
 
 extern int log_open;   /* subr_log: is /dev/klog open? */
-const  char *panicstr; /* arg to first call to panic (used as a flag
-  to indicate that panic has already been called). */
+
+/*
+ * arg to first call to panic (used as a flag
+ * to indicate that panic has already been called).
+ */
+__read_mostly const char *panicstr;
+
 #ifdef DDB
 /*
  * Enter ddb on panic.
Index: sys/systm.h
===
RCS file: src/sys/sys/systm.h,v
retrieving revision 1.155
diff -u -p -r1.155 systm.h
--- sys/systm.h 9 Dec 2021 00:26:10 -   1.155
+++ sys/systm.h 20 Mar 2022 05:49:47 -
@@ -315,6 +315,8 @@ int uiomove(void *, size_t, struct uio *
 
 #include 
 
+#define __read_mostly  __attribute__((__section__(".data.read_mostly")))
+
 extern struct rwlock netlock;
 
 /*



Re: refcount btrace

2022-03-19 Thread Visa Hankala
On Sat, Mar 19, 2022 at 12:10:11AM +0100, Alexander Bluhm wrote:
> On Thu, Mar 17, 2022 at 07:25:27AM +0000, Visa Hankala wrote:
> > On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> > > I would like to use btrace to debug refernce counting.  The idea
> > > is to a a tracepoint for every type of refcnt we have.  When it
> > > changes, print the actual object, the current counter and the change
> > > value.
> > 
> > > Do we want that feature?
> > 
> > I am against this in its current form. The code would become more
> > complex, and the trace points can affect timing. There is a risk that
> > the kernel behaves slightly differently when dt has been compiled in.
> 
> On our main architectures dt(4) is in GENERIC.  I see your timing
> point for uvm structures.

In my opinion, having dt(4) enabled by default is another reason why
there should be no carte blanche for adding trace points. Each trace
point adds a tiny amount of bloat. Few users will use the tracing
facility.

Maybe high-rate trace points could be behind a build option...

> What do you think about this?  The check starts with a
> __predict_false(index > 0) in #define DT_INDEX_ENTER.  The r_traceidx
> is very likely in the same cache line as r_refs.  So the additional
> overhead of the branch should be small compared to the atomic
> operation.  The __predict_false(dt_tracing) might take longer as
> it is a global variable.

I have no hard data to back up my claim, but I think dt_tracing should
be checked first. This would make the situation easier for branch
prediction. It is likely that dt_tracing is already in cache.



Re: refcount btrace

2022-03-18 Thread Visa Hankala
On Thu, Mar 17, 2022 at 06:16:51PM +0100, Alexander Bluhm wrote:
> On Thu, Mar 17, 2022 at 07:25:27AM +0000, Visa Hankala wrote:
> > On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> > > I would like to use btrace to debug refernce counting.  The idea
> > > is to a a tracepoint for every type of refcnt we have.  When it
> > > changes, print the actual object, the current counter and the change
> > > value.
> > 
> > > Do we want that feature?
> > 
> > I am against this in its current form. The code would become more
> > complex, and the trace points can affect timing. There is a risk that
> > the kernel behaves slightly differently when dt has been compiled in.
> 
> Can we get in this part then?
> 
> - Remove DIAGNOSTIC to keep similar in non DIAGNOSTIC case.
> - Rename refcnt to refs.  refcnt is the struct, refs contains the
>   r_refs value.
> - Add KASSERT(refs != ~0) in refcnt_finalize().
> - Always use u_int refs so I can insert my btrace diff easily.

I think this is fine.

OK visa@

> Index: kern/kern_synch.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.184
> diff -u -p -r1.184 kern_synch.c
> --- kern/kern_synch.c 16 Mar 2022 14:13:01 -  1.184
> +++ kern/kern_synch.c 17 Mar 2022 16:12:50 -
> @@ -810,25 +810,21 @@ refcnt_init(struct refcnt *r)
>  void
>  refcnt_take(struct refcnt *r)
>  {
> -#ifdef DIAGNOSTIC
> - u_int refcnt;
> + u_int refs;
>  
> - refcnt = atomic_inc_int_nv(>r_refs);
> - KASSERT(refcnt != 0);
> -#else
> - atomic_inc_int(>r_refs);
> -#endif
> + refs = atomic_inc_int_nv(>r_refs);
> + KASSERT(refs != 0);
> + (void)refs;
>  }
>  
>  int
>  refcnt_rele(struct refcnt *r)
>  {
> - u_int refcnt;
> + u_int refs;
>  
> - refcnt = atomic_dec_int_nv(>r_refs);
> - KASSERT(refcnt != ~0);
> -
> - return (refcnt == 0);
> + refs = atomic_dec_int_nv(>r_refs);
> + KASSERT(refs != ~0);
> + return (refs == 0);
>  }
>  
>  void
> @@ -842,26 +838,33 @@ void
>  refcnt_finalize(struct refcnt *r, const char *wmesg)
>  {
>   struct sleep_state sls;
> - u_int refcnt;
> + u_int refs;
>  
> - refcnt = atomic_dec_int_nv(>r_refs);
> - while (refcnt) {
> + refs = atomic_dec_int_nv(>r_refs);
> + KASSERT(refs != ~0);
> + while (refs) {
>   sleep_setup(, r, PWAIT, wmesg, 0);
> - refcnt = atomic_load_int(>r_refs);
> - sleep_finish(, refcnt);
> + refs = atomic_load_int(>r_refs);
> + sleep_finish(, refs);
>   }
>  }
>  
>  int
>  refcnt_shared(struct refcnt *r)
>  {
> - return (atomic_load_int(>r_refs) > 1);
> + u_int refs;
> +
> + refs = atomic_load_int(>r_refs);
> + return (refs > 1);
>  }
>  
>  unsigned int
>  refcnt_read(struct refcnt *r)
>  {
> - return (atomic_load_int(>r_refs));
> + u_int refs;
> +
> + refs = atomic_load_int(>r_refs);
> + return (refs);
>  }
>  
>  void
> 



Use refcnt API with struct plimit

2022-03-17 Thread Visa Hankala
Use the refcnt API with struct plimit.

OK?

Index: kern/kern_resource.c
===
RCS file: src/sys/kern/kern_resource.c,v
retrieving revision 1.71
diff -u -p -r1.71 kern_resource.c
--- kern/kern_resource.c8 Feb 2021 10:51:01 -   1.71
+++ kern/kern_resource.c17 Mar 2022 15:59:52 -
@@ -582,7 +582,7 @@ lim_startup(struct plimit *limit0)
limit0->pl_rlimit[RLIMIT_RSS].rlim_max = lim;
limit0->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = lim;
limit0->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
-   limit0->pl_refcnt = 1;
refcnt_init(&limit0->pl_refcnt);
 }
 
 /*
@@ -598,14 +598,14 @@ lim_copy(struct plimit *lim)
newlim = pool_get(_pool, PR_WAITOK);
memcpy(newlim->pl_rlimit, lim->pl_rlimit,
sizeof(struct rlimit) * RLIM_NLIMITS);
-   newlim->pl_refcnt = 1;
+   refcnt_init(>pl_refcnt);
return (newlim);
 }
 
 void
 lim_free(struct plimit *lim)
 {
-   if (atomic_dec_int_nv(>pl_refcnt) > 0)
+   if (refcnt_rele(>pl_refcnt) == 0)
return;
pool_put(_pool, lim);
 }
@@ -617,7 +617,7 @@ lim_fork(struct process *parent, struct 
 
mtx_enter(>ps_mtx);
limit = parent->ps_limit;
-   atomic_inc_int(>pl_refcnt);
+   refcnt_take(>pl_refcnt);
mtx_leave(>ps_mtx);
 
child->ps_limit = limit;
@@ -650,7 +650,7 @@ lim_write_begin(void)
 */
 
limit = p->p_p->ps_limit;
-   if (P_HASSIBLING(p) || limit->pl_refcnt > 1)
+   if (P_HASSIBLING(p) || refcnt_shared(>pl_refcnt))
limit = lim_copy(limit);
 
return (limit);
@@ -703,7 +703,7 @@ lim_read_enter(void)
if (limit != pr->ps_limit) {
mtx_enter(>ps_mtx);
limit = pr->ps_limit;
-   atomic_inc_int(>pl_refcnt);
+   refcnt_take(>pl_refcnt);
mtx_leave(>ps_mtx);
if (p->p_limit != NULL)
lim_free(p->p_limit);
Index: sys/resourcevar.h
===
RCS file: src/sys/sys/resourcevar.h,v
retrieving revision 1.24
diff -u -p -r1.24 resourcevar.h
--- sys/resourcevar.h   21 Jun 2019 09:39:48 -  1.24
+++ sys/resourcevar.h   17 Mar 2022 15:59:52 -
@@ -35,6 +35,7 @@
 #ifndef_SYS_RESOURCEVAR_H_
 #define_SYS_RESOURCEVAR_H_
 
+#include <sys/refcnt.h>
 #include 
 
 /*
@@ -44,7 +45,7 @@
  */
 struct plimit {
struct  rlimit pl_rlimit[RLIM_NLIMITS];
-   u_int   pl_refcnt;  /* number of references */
+   struct  refcnt pl_refcnt;
 };
 
 /* add user profiling from AST */



Re: refcount btrace

2022-03-17 Thread Visa Hankala
On Thu, Mar 17, 2022 at 12:42:13AM +0100, Alexander Bluhm wrote:
> I would like to use btrace to debug reference counting.  The idea
> is to add a tracepoint for every type of refcnt we have.  When it
> changes, print the actual object, the current counter and the change
> value.

> Do we want that feature?

I am against this in its current form. The code would become more
complex, and the trace points can affect timing. There is a risk that
the kernel behaves slightly differently when dt has been compiled in.



Re: Remove data dependency barrier from atomic_load_*

2022-03-17 Thread Visa Hankala
On Wed, Mar 16, 2022 at 11:09:12PM +0100, Alexander Bluhm wrote:
> On Tue, Mar 15, 2022 at 09:15:34AM +0000, Visa Hankala wrote:
> > However, some DEC Alpha CPUs have their data caches divided into cache
> > banks to improve bandwidth. These cache banks are relatively
> > independent. The system maintains coherency, but bus contention can
> > delay propagation of cache updates. If the loads spanned different cache
> > banks, the second load could deliver data which is older than the
> > initial load's value. The data dependency barrier causes an interlock
> > with cache updating, ensuring causal ordering.)
> 
> The code with the membar is copied from READ_ONCE() which is copied
> from Linux.  The membar_datadep_consumer() has an #ifdef __alpha__
> in it.  It is only used for that case.  I don't know whether we
> want to support such CPU.  But if that is the case, we need the
> membar.

Whether the membar is necessary or not depends on the use case.
READ_ONCE(), and SMR_PTR_GET(), have it built in so that loaded
pointers would work in the expected way in lockless contexts. This
is intentional, the membar has not been just copied there.

If you want to keep the memory barrier, then I suggest that the current
atomic_load_* and atomic_store_* functions are replaced with NetBSD's
atomic_load_relaxed(), atomic_load_consume(), atomic_load_acquire(),
atomic_store_relaxed() and atomic_store_release(). With these, it is
clear what ordering the operations provide and the programmer is able
to make the appropriate choice.
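
For illustration, hedged sketches of what int-sized variants could look
like on top of the existing membar primitives; the real NetBSD macros
are type-generic and the barrier choices below are indicative only:

static inline unsigned int
atomic_load_relaxed_int(volatile unsigned int *p)
{
        return *p;                      /* no ordering guarantee */
}

static inline unsigned int
atomic_load_consume_int(volatile unsigned int *p)
{
        unsigned int v = *p;

        membar_datadep_consumer();      /* orders dependent loads (Alpha) */
        return v;
}

static inline unsigned int
atomic_load_acquire_int(volatile unsigned int *p)
{
        unsigned int v = *p;

        membar_sync();                  /* full barrier; gives acquire ordering */
        return v;
}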

> What do you need refcnt_read() for?  Is it only for assert?  Then
> a refcnt_assert() without membar or atomic_load might be better.

I want to keep the API small. Even though refcnt_read() is possibly
dubious in general, it allows a degree of freedom that might be useful
for example in assertions.
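
A minimal example of that assertion-style use; the softc and field
names are borrowed from the iwm(4) hunks elsewhere in this thread:

/* e.g. at the end of a stop routine, after all tasks have drained */
KASSERT(refcnt_read(&sc->task_refs) == 0);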

The membar question will arise in many places, not just with refcnt.



Use refcnt API with struct ucred

2022-03-16 Thread Visa Hankala
Use the refcnt API with struct ucred.

OK?

Index: nfs/nfs_socket.c
===
RCS file: src/sys/nfs/nfs_socket.c,v
retrieving revision 1.139
diff -u -p -r1.139 nfs_socket.c
--- nfs/nfs_socket.c22 Feb 2022 01:15:02 -  1.139
+++ nfs/nfs_socket.c16 Mar 2022 15:42:05 -
@@ -1493,7 +1493,7 @@ nfs_getreq(struct nfsrv_descript *nd, st
nfsm_adv(nfsm_rndup(len));
nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
memset(>nd_cr, 0, sizeof (struct ucred));
-   nd->nd_cr.cr_ref = 1;
+   refcnt_init(>nd_cr.cr_refcnt);
nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
len = fxdr_unsigned(int, *tl);
Index: kern/kern_fork.c
===
RCS file: src/sys/kern/kern_fork.c,v
retrieving revision 1.238
diff -u -p -r1.238 kern_fork.c
--- kern/kern_fork.c10 Dec 2021 05:34:42 -  1.238
+++ kern/kern_fork.c16 Mar 2022 15:42:05 -
@@ -190,7 +190,8 @@ process_initialize(struct process *pr, s
/* give the process the same creds as the initial thread */
pr->ps_ucred = p->p_ucred;
crhold(pr->ps_ucred);
-   KASSERT(p->p_ucred->cr_ref >= 2);   /* new thread and new process */
+   /* new thread and new process */
+   KASSERT(p->p_ucred->cr_refcnt.r_refs >= 2);
 
LIST_INIT(>ps_children);
LIST_INIT(>ps_orphans);
Index: kern/kern_prot.c
===
RCS file: src/sys/kern/kern_prot.c,v
retrieving revision 1.78
diff -u -p -r1.78 kern_prot.c
--- kern/kern_prot.c24 Oct 2021 00:02:25 -  1.78
+++ kern/kern_prot.c16 Mar 2022 15:42:05 -
@@ -57,7 +57,7 @@
 inline void
 crset(struct ucred *newcr, const struct ucred *cr)
 {
-   KASSERT(cr->cr_ref > 0);
+   KASSERT(cr->cr_refcnt.r_refs > 0);
memcpy(
(char *)newcr+ offsetof(struct ucred, cr_startcopy),
(const char *)cr + offsetof(struct ucred, cr_startcopy),
@@ -945,7 +945,7 @@ crget(void)
struct ucred *cr;
 
cr = pool_get(_pool, PR_WAITOK|PR_ZERO);
-   cr->cr_ref = 1;
+   refcnt_init(>cr_refcnt);
return (cr);
 }
 
@@ -956,7 +956,7 @@ crget(void)
 struct ucred *
 crhold(struct ucred *cr)
 {
-   atomic_inc_int(>cr_ref);
+   refcnt_take(>cr_refcnt);
return (cr);
 }
 
@@ -967,8 +967,7 @@ crhold(struct ucred *cr)
 void
 crfree(struct ucred *cr)
 {
-
-   if (atomic_dec_int_nv(>cr_ref) == 0)
+   if (refcnt_rele(>cr_refcnt))
pool_put(_pool, cr);
 }
 
@@ -980,12 +979,12 @@ crcopy(struct ucred *cr)
 {
struct ucred *newcr;
 
-   if (cr->cr_ref == 1)
+   if (!refcnt_shared(&cr->cr_refcnt))
return (cr);
newcr = crget();
*newcr = *cr;
crfree(cr);
-   newcr->cr_ref = 1;
+   refcnt_init(>cr_refcnt);
return (newcr);
 }
 
@@ -999,7 +998,7 @@ crdup(struct ucred *cr)
 
newcr = crget();
*newcr = *cr;
-   newcr->cr_ref = 1;
+   refcnt_init(>cr_refcnt);
return (newcr);
 }
 
@@ -1011,7 +1010,7 @@ crfromxucred(struct ucred *cr, const str
 {
if (xcr->cr_ngroups < 0 || xcr->cr_ngroups > NGROUPS_MAX)
return (EINVAL);
-   cr->cr_ref = 1;
+   refcnt_init(>cr_refcnt);
cr->cr_uid = xcr->cr_uid;
cr->cr_gid = xcr->cr_gid;
cr->cr_ngroups = xcr->cr_ngroups;
Index: sys/ucred.h
===
RCS file: src/sys/sys/ucred.h,v
retrieving revision 1.13
diff -u -p -r1.13 ucred.h
--- sys/ucred.h 21 Jun 2018 13:58:21 -  1.13
+++ sys/ucred.h 16 Mar 2022 15:42:05 -
@@ -35,13 +35,14 @@
 #ifndef _SYS_UCRED_H_
 #define_SYS_UCRED_H_
 
+#include <sys/refcnt.h>
 #include 
 
 /*
  * Credentials.
  */
 struct ucred {
-   u_int   cr_ref; /* reference count */
+   struct refcnt   cr_refcnt;  /* reference count */
 
 /* The following fields are all copied by crset() */
 #definecr_startcopycr_uid



Use refcnt API in bpf

2022-03-16 Thread Visa Hankala
Use the refcnt API in bpf.

OK?

Index: net/bpf.c
===
RCS file: src/sys/net/bpf.c,v
retrieving revision 1.215
diff -u -p -r1.215 bpf.c
--- net/bpf.c   15 Feb 2022 08:43:50 -  1.215
+++ net/bpf.c   16 Mar 2022 15:42:05 -
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include <sys/refcnt.h>
 #include 
 #include 
 #include 
@@ -398,7 +399,7 @@ bpfopen(dev_t dev, int flag, int mode, s
 
bd->bd_rtout = 0;   /* no timeout by default */
 
-   bpf_get(bd);
+   refcnt_init(>bd_refcnt);
LIST_INSERT_HEAD(_d_list, bd, bd_list);
 
return (0);
@@ -1645,7 +1646,7 @@ bpf_d_smr(void *smr)
 void
 bpf_get(struct bpf_d *bd)
 {
-   atomic_inc_int(>bd_ref);
+   refcnt_take(>bd_refcnt);
 }
 
 /*
@@ -1655,7 +1656,7 @@ bpf_get(struct bpf_d *bd)
 void
 bpf_put(struct bpf_d *bd)
 {
-   if (atomic_dec_int_nv(>bd_ref) > 0)
+   if (refcnt_rele(>bd_refcnt) == 0)
return;
 
smr_call(>bd_smr, bpf_d_smr, bd);
Index: net/bpfdesc.h
===
RCS file: src/sys/net/bpfdesc.h,v
retrieving revision 1.45
diff -u -p -r1.45 bpfdesc.h
--- net/bpfdesc.h   21 Jan 2021 12:33:14 -  1.45
+++ net/bpfdesc.h   16 Mar 2022 15:42:05 -
@@ -98,7 +98,7 @@ struct bpf_d {
int bd_sig; /* signal to send upon packet reception 
*/
struct sigio_ref
bd_sigio;   /* async I/O registration */
-   u_int   bd_ref; /* reference count */
+   struct refcnt   bd_refcnt;  /* reference count */
struct selinfo  bd_sel; /* bsd select info */
int bd_unit;/* logical unit number */
LIST_ENTRY(bpf_d) bd_list;  /* descriptor list */



Use refcnt API in kqueue

2022-03-15 Thread Visa Hankala
Make kqueue use the refcnt API.

OK?

Index: sys/kern/kern_event.c
===
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.183
diff -u -p -r1.183 kern_event.c
--- sys/kern/kern_event.c   22 Feb 2022 01:15:01 -  1.183
+++ sys/kern/kern_event.c   15 Mar 2022 13:43:20 -
@@ -199,7 +199,7 @@ const struct filterops *const sysfilt_op
 void
 KQREF(struct kqueue *kq)
 {
-   atomic_inc_int(>kq_refs);
+   refcnt_take(>kq_refcnt);
 }
 
 void
@@ -207,7 +207,7 @@ KQRELE(struct kqueue *kq)
 {
struct filedesc *fdp;
 
-   if (atomic_dec_int_nv(>kq_refs) > 0)
+   if (refcnt_rele(>kq_refcnt) == 0)
return;
 
fdp = kq->kq_fdp;
@@ -837,7 +837,7 @@ kqpoll_exit(void)
 
kqueue_purge(p, p->p_kq);
kqueue_terminate(p, p->p_kq);
-   KASSERT(p->p_kq->kq_refs == 1);
+   KASSERT(p->p_kq->kq_refcnt.r_refs == 1);
KQRELE(p->p_kq);
p->p_kq = NULL;
 }
@@ -848,7 +848,7 @@ kqueue_alloc(struct filedesc *fdp)
struct kqueue *kq;
 
kq = pool_get(_pool, PR_WAITOK | PR_ZERO);
-   kq->kq_refs = 1;
+   refcnt_init(>kq_refcnt);
kq->kq_fdp = fdp;
TAILQ_INIT(>kq_head);
mtx_init(>kq_lock, IPL_HIGH);
Index: sys/sys/eventvar.h
===
RCS file: src/sys/sys/eventvar.h,v
retrieving revision 1.13
diff -u -p -r1.13 eventvar.h
--- sys/sys/eventvar.h  8 Feb 2022 08:56:41 -   1.13
+++ sys/sys/eventvar.h  15 Mar 2022 13:43:20 -
@@ -32,6 +32,7 @@
 #define _SYS_EVENTVAR_H_
 
 #include 
+#include <sys/refcnt.h>
 #include 
 
 #define KQ_NEVENTS 8   /* minimize copy{in,out} calls */
@@ -47,7 +48,7 @@ struct kqueue {
struct  mutex kq_lock;  /* lock for queue access */
TAILQ_HEAD(, knote) kq_head;/* [q] list of pending event */
int kq_count;   /* [q] # of pending events */
-   u_int   kq_refs;/* [a] # of references */
+   struct  refcnt kq_refcnt;   /* [a] # of references */
struct  selinfo kq_sel;
struct  filedesc *kq_fdp;   /* [I] fd table of this kq */
 



Re: Add refcnt_read()

2022-03-15 Thread Visa Hankala
On Tue, Mar 15, 2022 at 09:11:30AM +, Visa Hankala wrote:
> This patch adds a function for getting a snapshot of a reference
> counter. This will let code like crcopy() access the value in an
> API-observing way.

Here is a revised version.

Based on input from dlg@, the patch now adds refcnt_shared() for
testing if the object has multiple references. This interface is
possibly more robust than refcnt_read().

The patch still provides refcnt_read() for special (mis)uses.
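
As an illustration of the intended use, a copy-on-write sketch modeled
on the crcopy() change posted elsewhere in this thread (not part of
the patch below):

struct ucred *
crcopy_sketch(struct ucred *cr)
{
        struct ucred *newcr;

        if (!refcnt_shared(&cr->cr_refcnt))
                return (cr);            /* sole reference, modify in place */
        newcr = crget();                /* copy, then drop our reference */
        *newcr = *cr;
        crfree(cr);
        refcnt_init(&newcr->cr_refcnt);
        return (newcr);
}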

OK?

Index: share/man/man9/refcnt_init.9
===
RCS file: src/share/man/man9/refcnt_init.9,v
retrieving revision 1.1
diff -u -p -r1.1 refcnt_init.9
--- share/man/man9/refcnt_init.911 Sep 2015 19:13:22 -  1.1
+++ share/man/man9/refcnt_init.915 Mar 2022 13:43:20 -
@@ -23,6 +23,8 @@
 .Nm refcnt_rele ,
 .Nm refcnt_rele_wake ,
 .Nm refcnt_finalize ,
+.Nm refcnt_shared ,
+.Nm refcnt_read ,
 .Nm REFCNT_INITIALIZER
 .Nd reference count API
 .Sh SYNOPSIS
@@ -37,6 +39,10 @@
 .Fn "refcnt_rele_wake" "struct refcnt *r"
 .Ft void
 .Fn "refcnt_finalize" "struct refcnt *r" "const char *wmesg"
+.Ft int
+.Fn "refcnt_shared" "struct refcnt *r"
+.Ft unsigned int
+.Fn "refcnt_read" "struct refcnt *r"
 .Fn "REFCNT_INITIALIZER"
 .Sh DESCRIPTION
 The refcnt API provides simple reference counters that can be used
@@ -68,14 +74,26 @@ There may only be one caller to
 per refcnt
 .Fa r .
 .Pp
+.Fn refcnt_shared
+tests if the object has multiple references.
+.Pp
+.Fn refcnt_read
+returns a snapshot of the counter value.
+Its use is discouraged,
+code should use
+.Fn refcnt_shared
+whenever possible.
+.Pp
 .Fn REFCNT_INITIALIZER
 initialises a declaration of a refcnt to 1.
 .Sh CONTEXT
 .Fn refcnt_init ,
 .Fn refcnt_take ,
 .Fn refcnt_rele ,
+.Fn refcnt_rele_wake ,
+.Fn refcnt_shared
 and
-.Fn refcnt_rele_wake
+.Fn refcnt_read
 can be called during autoconf, from process context, or from interrupt
 context.
 .Pp
@@ -85,3 +103,10 @@ can be called from process context.
 .Fn refcnt_rele
 returns a non-zero value if the last reference has been released,
 otherwise 0.
+.Pp
+.Fn refcnt_shared
+returns a non-zero value if the object has multiple references,
+otherwise 0.
+.Pp
+.Fn refcnt_read
+returns a snapshot of the counter value.
Index: sys/kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.183
diff -u -p -r1.183 kern_synch.c
--- sys/kern/kern_synch.c   10 Mar 2022 15:21:08 -  1.183
+++ sys/kern/kern_synch.c   15 Mar 2022 13:43:20 -
@@ -852,6 +852,18 @@ refcnt_finalize(struct refcnt *r, const 
}
 }
 
+int
+refcnt_shared(struct refcnt *r)
+{
+   return (atomic_load_int(&r->r_refs) > 1);
+}
+
+unsigned int
+refcnt_read(struct refcnt *r)
+{
+   return (atomic_load_int(&r->r_refs));
+}
+
 void
 cond_init(struct cond *c)
 {
Index: sys/sys/refcnt.h
===
RCS file: src/sys/sys/refcnt.h,v
retrieving revision 1.5
diff -u -p -r1.5 refcnt.h
--- sys/sys/refcnt.h10 Mar 2022 15:21:08 -  1.5
+++ sys/sys/refcnt.h15 Mar 2022 13:43:20 -
@@ -37,6 +37,8 @@ void  refcnt_take(struct refcnt *);
 intrefcnt_rele(struct refcnt *);
 void   refcnt_rele_wake(struct refcnt *);
 void   refcnt_finalize(struct refcnt *, const char *);
+intrefcnt_shared(struct refcnt *);
+unsigned int   refcnt_read(struct refcnt *);
 
 #endif /* _KERNEL */
 



Remove data dependency barrier from atomic_load_*

2022-03-15 Thread Visa Hankala
This removes the data dependency consumer barrier from the atomic_load_*
functions. I think the intent was to keep these functions relaxed in
terms of CPU memory order.

This makes these functions more agreeable in code that assertions may
use, such as the suggested refcnt_read().

Removing the barrier should be safe at this point. The current callers
of atomic_load_*, that is cond_wait() and refcnt_finalize(), use the
loaded value for a control decision. These functions need to provide
a stronger ordering guarantee (memory acquire) for their callers than
what membar_datadep_consumer() gives.

(The data dependency barrier would be necessary in a setting like the
following where a memory load of non-constant data is dependent on
another load.

idx = atomic_load_int(&index);
membar_datadep_consumer();
val = atomic_load_int(&array[idx]);

Typically, even if the processor did reorder loads, the second load's
dependency on the value of the first load would prevent the load-load
reordering.

However, some DEC Alpha CPUs have their data caches divided into cache
banks to improve bandwidth. These cache banks are relatively
independent. The system maintains coherency, but bus contention can
delay propagation of cache updates. If the loads spanned different cache
banks, the second load could deliver data which is older than the
initial load's value. The data dependency barrier causes an interlock
with cache updating, ensuring causal ordering.)

OK?

Index: sys/sys/atomic.h
===
RCS file: src/sys/sys/atomic.h,v
retrieving revision 1.8
diff -u -p -r1.8 atomic.h
--- sys/sys/atomic.h11 Mar 2022 19:02:15 -  1.8
+++ sys/sys/atomic.h15 Mar 2022 07:52:39 -
@@ -201,26 +201,16 @@ atomic_sub_long_nv(volatile unsigned lon
  * atomic_load_* - read from memory
  */
 
-static inline void membar_datadep_consumer(void);
-
 static inline unsigned int
 atomic_load_int(volatile unsigned int *p)
 {
-   unsigned int v;
-
-   v = *p;
-   membar_datadep_consumer();
-   return v;
+   return *p;
 }
 
 static inline unsigned long
 atomic_load_long(volatile unsigned long *p)
 {
-   unsigned long v;
-
-   v = *p;
-   membar_datadep_consumer();
-   return v;
+   return *p;
 }
 
 /*



Add refcnt_read()

2022-03-15 Thread Visa Hankala
This patch adds a function for getting a snapshot of a reference
counter. This will let code like crcopy() access the value in an
API-observing way.

OK?

Index: share/man/man9/refcnt_init.9
===
RCS file: src/share/man/man9/refcnt_init.9,v
retrieving revision 1.1
diff -u -p -r1.1 refcnt_init.9
--- share/man/man9/refcnt_init.911 Sep 2015 19:13:22 -  1.1
+++ share/man/man9/refcnt_init.915 Mar 2022 07:52:39 -
@@ -23,6 +23,7 @@
 .Nm refcnt_rele ,
 .Nm refcnt_rele_wake ,
 .Nm refcnt_finalize ,
+.Nm refcnt_read ,
 .Nm REFCNT_INITIALIZER
 .Nd reference count API
 .Sh SYNOPSIS
@@ -37,6 +38,8 @@
 .Fn "refcnt_rele_wake" "struct refcnt *r"
 .Ft void
 .Fn "refcnt_finalize" "struct refcnt *r" "const char *wmesg"
+.Ft unsigned int
+.Fn "refcnt_read" "struct refcnt *r"
 .Fn "REFCNT_INITIALIZER"
 .Sh DESCRIPTION
 The refcnt API provides simple reference counters that can be used
@@ -68,14 +71,21 @@ There may only be one caller to
 per refcnt
 .Fa r .
 .Pp
+.Fn refcnt_read
+returns a snapshot of the counter value.
+The value can become stale immediately if other CPUs are able to change
+the counter in parallel.
+The function does not enforce any memory access order.
+.Pp
 .Fn REFCNT_INITIALIZER
 initialises a declaration of a refcnt to 1.
 .Sh CONTEXT
 .Fn refcnt_init ,
 .Fn refcnt_take ,
 .Fn refcnt_rele ,
+.Fn refcnt_rele_wake ,
 and
-.Fn refcnt_rele_wake
+.Fn refcnt_read
 can be called during autoconf, from process context, or from interrupt
 context.
 .Pp
@@ -85,3 +95,6 @@ can be called from process context.
 .Fn refcnt_rele
 returns a non-zero value if the last reference has been released,
 otherwise 0.
+.Pp
+.Fn refcnt_read
+returns a snapshot of the counter value.
Index: sys/kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.183
diff -u -p -r1.183 kern_synch.c
--- sys/kern/kern_synch.c   10 Mar 2022 15:21:08 -  1.183
+++ sys/kern/kern_synch.c   15 Mar 2022 07:52:39 -
@@ -852,6 +852,12 @@ refcnt_finalize(struct refcnt *r, const 
}
 }
 
+unsigned int
+refcnt_read(struct refcnt *r)
+{
+   return (atomic_load_int(&r->r_refs));
+}
+
 void
 cond_init(struct cond *c)
 {
Index: sys/sys/refcnt.h
===
RCS file: src/sys/sys/refcnt.h,v
retrieving revision 1.5
diff -u -p -r1.5 refcnt.h
--- sys/sys/refcnt.h10 Mar 2022 15:21:08 -  1.5
+++ sys/sys/refcnt.h15 Mar 2022 07:52:39 -
@@ -37,6 +37,7 @@ void  refcnt_take(struct refcnt *);
 intrefcnt_rele(struct refcnt *);
 void   refcnt_rele_wake(struct refcnt *);
 void   refcnt_finalize(struct refcnt *, const char *);
+unsigned int   refcnt_read(struct refcnt *);
 
 #endif /* _KERNEL */
 



Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-03-14 Thread Visa Hankala
On Mon, Mar 14, 2022 at 02:01:07AM -0700, Philip Guenther wrote:
> On Mon, Mar 14, 2022 at 12:47 AM Visa Hankala  wrote:
> 
> > On Sun, Mar 13, 2022 at 06:26:19PM -0700, Philip Guenther wrote:
> > > On Sun, Mar 13, 2022 at 10:27 AM Visa Hankala  wrote:
> > >
> > > > On Sun, Mar 13, 2022 at 04:29:44PM +0100, Mark Kettenis wrote:
> > > >
> > > ...
> > >
> > > > > Under what circumstances does memory ordering matter for these
> > > > > interfaces?
> > > >
> > > > Consider the following scenario:
> > > >
> > > > struct widget {
> > > > struct refcnt   w_refcnt;
> > > > /* more fields spanning many cache lines */
> > > > ...
> > > > int w_var;
> > > > };
> > > >
> > > > First, CPU 1 executes:
> > > >
> > > > w->w_var = 1;
> > > > refcnt_rele(&w->w_refcnt);  /* remains above zero */
> > > >
> > >
> > > Having incremented the refcnt previous does not give this thread
> > exclusive
> > > access to 'w', so if it's writing to w->w_var then it must either
> > > a) have some sort of write lock taken, which it will release after this
> > and
> > > which will contain the necessary membar, OR
> > > b) have the only access path to this structure (i.e., it's not yet
> > > 'published' into structures which can be seen by other threads), in which
> > > case the operations which do that 'publishing' of the access to 'w'
> > (adding
> > > it to a global list, etc) must include the necessary membar.
> >
> > Let's change the sequence to this:
> >
> > local_var = atomic_load_int(&w->w_var);
> > refcnt_rele(&w->w_refcnt);
> >
> > Without the release barrier, is the load guaranteed to happen before
> > the reference count is decremented?
> >
> 
> That's completely uncomparable to what you described before for "CPU 1"
> because there's no write to anything but the refcnt.
> 
> If no one writes to the object that is ref counted, then there are no
> visibility problems, no?
> 
> 
> > Next, CPU 2 executes:
> > > >
> > > > if (refcnt_rele(&w->w_refcnt))  /* refcnt drops to zero */
> > > > free(w);
> > > >
> > >
> > > How did CPU 2 get what is now exclusive access to 'w' without any
> > membars?
> > > If that's possible then it was just accessing 'w' and possibly not seeing
> > > the update to w->w_var even _before_ the refcnt_rele(), so putting a
> > membar
> > > in refcnt_rele() hides the incorrect code by suppressing the later crash!
> > >
> > > If these membars appear to help then the code is and remains broken.
> > This
> > > change should not be done.
> >
> > It is not uncommon to see something like below:
> >
> > Access object with read intent:
> >
> > mtx_enter(&w_lock);
> > w = lookup_from_list();
> > if (w != NULL)
> > refcnt_take(&w->w_refcnt);
> > mtx_leave(&w_lock);
> > if (w == NULL)
> > return;
> > ...
> >
> 
> No writes to *w described *OR LEGAL*.
> 
> 
> 
> > if (refcnt_rele(&w->w_refcnt))
> > free(w);
> 
> Delete object:
> >
> > mtx_enter(&w_lock);
> > w = lookup_from_list();
> > if (w != NULL)
> > remove_from_list(w);
> > mtx_leave(&w_lock);
> >
> 
> This does the 'release' on the change to w's list link(s).
> 
> 
> 
> > /* Release list's reference. */
> > if (w != NULL && refcnt_rele(&w->w_refcnt))
> > free(w);
> >
> > Above, any refcnt_rele() can release the final reference.
> >
> > If there is no acquire barrier after the refcnt 1->0 transition, what
> > is known about the CPU's local view of the object after refcnt_rele()?
> >
> 
> Okay, if refcnt is used with objects that have embedded list links, or
> could be, then refcnt_rele needs acquire semantics on the 1->0 transition.
> I can agree with your justification for that.
> 
> Do you have a similar example for giving it release semantics as your diff
> proposed?  What's the otherwise-unprotected-by-synchronization-primitive
> that has to be protected?

The release barrier prevents memory accesses from getting reordered
past the point where the reference is released.

Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-03-14 Thread Visa Hankala
On Sun, Mar 13, 2022 at 06:26:19PM -0700, Philip Guenther wrote:
> On Sun, Mar 13, 2022 at 10:27 AM Visa Hankala  wrote:
> 
> > On Sun, Mar 13, 2022 at 04:29:44PM +0100, Mark Kettenis wrote:
> >
> ...
> 
> > > Under what circumstances does memory ordering matter for these
> > > interfaces?
> >
> > Consider the following scenario:
> >
> > struct widget {
> > struct refcnt   w_refcnt;
> > /* more fields spanning many cache lines */
> > ...
> > int w_var;
> > };
> >
> > First, CPU 1 executes:
> >
> > w->w_var = 1;
> refcnt_rele(&w->w_refcnt);  /* remains above zero */
> >
> 
> Having incremented the refcnt previously does not give this thread exclusive
> access to 'w', so if it's writing to w->w_var then it must either
> a) have some sort of write lock taken, which it will release after this and
> which will contain the necessary membar, OR
> b) have the only access path to this structure (i.e., it's not yet
> 'published' into structures which can be seen by other threads), in which
> case the operations which do that 'publishing' of the access to 'w' (adding
> it to a global list, etc) must include the necessary membar.

Let's change the sequence to this:

local_var = atomic_load_int(&w->w_var);
refcnt_rele(&w->w_refcnt);

Without the release barrier, is the load guaranteed to happen before
the reference count is decremented?

> Next, CPU 2 executes:
> >
> > if (refcnt_rele(&w->w_refcnt))  /* refcnt drops to zero */
> > free(w);
> >
> 
> How did CPU 2 get what is now exclusive access to 'w' without any membars?
> If that's possible then it was just accessing 'w' and possibly not seeing
> the update to w->w_var even _before_ the refcnt_rele(), so putting a membar
> in refcnt_rele() hides the incorrect code by suppressing the later crash!
> 
> If these membars appear to help then the code is and remains broken.  This
> change should not be done.

It is not uncommon to see something like below:

Access object with read intent:

mtx_enter(&w_lock);
w = lookup_from_list();
if (w != NULL)
refcnt_take(&w->w_refcnt);
mtx_leave(&w_lock);
if (w == NULL)
return;
...
if (refcnt_rele(&w->w_refcnt))
free(w);

Delete object:

mtx_enter(&w_lock);
w = lookup_from_list();
if (w != NULL)
remove_from_list(w);
mtx_leave(&w_lock);
/* Release list's reference. */
if (w != NULL && refcnt_rele(&w->w_refcnt))
free(w);

Above, any refcnt_rele() can release the final reference.

If there is no acquire barrier after the refcnt 1->0 transition, what
is known about the CPU's local view of the object after refcnt_rele()?

The decrement operations in the refcount API of Linux and
the refcount(9) API of FreeBSD provide release and acquire
barriers. Are they wrong?



Re: Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-03-13 Thread Visa Hankala
On Sun, Mar 13, 2022 at 04:29:44PM +0100, Mark Kettenis wrote:
> > Date: Sun, 13 Mar 2022 12:47:13 +
> > From: Visa Hankala 
> > 
> > This makes the refcnt implementation issue memory barriers when
> > releasing references, splitting memory activity cleanly into preceding
> > and succeeding stages around refcnt 1->0 transition.
> > 
> > I think the while loop could be optimized a little by re-reading
> > r->r_refs just after sleep_finish(). That should avoid re-entering
> > the SCHED_LOCK()'ed section. However, that is not part of this patch.
> > 
> > OK?
> 
> Under what circumstances does memory ordering matter for these
> interfaces?

Consider the following scenario:

struct widget {
struct refcnt   w_refcnt;
/* more fields spanning many cache lines */
...
int w_var;
};

First, CPU 1 executes:

w->w_var = 1;
refcnt_rele(&w->w_refcnt);  /* remains above zero */

Next, CPU 2 executes:

if (refcnt_rele(&w->w_refcnt))  /* refcnt drops to zero */
free(w);

CPU 1 has to complete any stores to or loads from the object before
it decrements the reference count. Otherwise there is a risk of
a use-after-free-like situation should CPU 2 manage to release the
object quickly.

This can arise even with an in-order processor if the memory subsystem
buffers and reorders writes.

The release barrier prevents the above problems on CPU 1.

On CPU 2, the object release activity should take place only after the
zero reference count has been observed. If the processor used
speculation, it might begin executing free() before the final result of
the reference count decrement was known. Once the decrement finished
the processor could commit the speculative part.

An adjusted code sequence might clarify an aspect:

CPU 2:

refcnt_rele(&w->w_refcnt);
memset(w, 0xff, sizeof(*w));
free(w);

Now CPU 2 "poisons" the memory. The acquire barrier ensures the
poisoning is applied to the object as seen after the decrement.

If bits of the memset() were run speculatively without proper
barriers, CPU 1's store to w->w_var could overwrite the poisoning.

My talk about these speculative stores might go too far in practice.
However, at least it is not obvious if it is safe to leave the control
dependency without a barrier.
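
The same ordering contract can be written with C11 atomics as a
userland sketch (the kernel uses membar_* and atomic_dec_int_nv(),
not <stdatomic.h>; widget_drop() is made up for the example):

#include <stdatomic.h>
#include <stdlib.h>

struct widget {
        atomic_uint     w_refcnt;
        int             w_var;
};

static int
widget_rele(struct widget *w)
{
        /* Release: earlier loads and stores complete before the decrement. */
        if (atomic_fetch_sub_explicit(&w->w_refcnt, 1,
            memory_order_release) == 1) {
                /* Acquire: the destructor sees all updates made before 1->0. */
                atomic_thread_fence(memory_order_acquire);
                return 1;
        }
        return 0;
}

static void
widget_drop(struct widget *w)
{
        w->w_var = 1;                   /* CPU 1 style update */
        if (widget_rele(w))             /* whichever CPU hits 1->0 frees */
                free(w);
}
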

> 
> > Index: share/man/man9/refcnt_init.9
> > ===
> > RCS file: src/share/man/man9/refcnt_init.9,v
> > retrieving revision 1.1
> > diff -u -p -r1.1 refcnt_init.9
> > --- share/man/man9/refcnt_init.911 Sep 2015 19:13:22 -  1.1
> > +++ share/man/man9/refcnt_init.913 Mar 2022 11:40:12 -
> > @@ -68,6 +68,18 @@ There may only be one caller to
> >  per refcnt
> >  .Fa r .
> >  .Pp
> > +.Fn refcnt_rele ,
> > +.Fn refcnt_rele_wake
> > +and
> > +.Fn refcnt_finalize
> > +provide release memory ordering.
> > +The caller's prior memory loads and stores are completed
> > +before the reference is released.
> > +The functions provide acquire memory ordering after all the references
> > +have been released.
> > +This ensures the object's destructor sees all updates
> > +done during the lifetime of the object.
> > +.Pp
> >  .Fn REFCNT_INITIALIZER
> >  initialises a declaration of a refcnt to 1.
> >  .Sh CONTEXT
> > Index: sys/kern/kern_synch.c
> > ===
> > RCS file: src/sys/kern/kern_synch.c,v
> > retrieving revision 1.183
> > diff -u -p -r1.183 kern_synch.c
> > --- sys/kern/kern_synch.c   10 Mar 2022 15:21:08 -  1.183
> > +++ sys/kern/kern_synch.c   13 Mar 2022 11:40:13 -
> > @@ -825,10 +825,16 @@ refcnt_rele(struct refcnt *r)
> >  {
> > u_int refcnt;
> >  
> > +   membar_exit_before_atomic();
> > refcnt = atomic_dec_int_nv(>r_refs);
> > KASSERT(refcnt != ~0);
> >  
> > -   return (refcnt == 0);
> > +   if (refcnt == 0) {
> > +   membar_enter_after_atomic();
> > +   return (1);
> > +   }
> > +
> > +   return (0);
> >  }
> >  
> >  void
> > @@ -844,12 +850,20 @@ refcnt_finalize(struct refcnt *r, const 
> > struct sleep_state sls;
> > u_int refcnt;
> >  
> > +   membar_exit_before_atomic();
> > refcnt = atomic_dec_int_nv(>r_refs);
> > +   if (refcnt == 0) {
> > +   membar_enter_after_atomic();
> > +   return;
> > +   }
> > +
> > while (refcnt) {
> > sleep_setup(, r, PWAIT, wmesg, 0);
> > refcnt = atomic_load_int(>r_refs);
> > sleep_finish(, refcnt);
> > }
> > +   /* Provide acquire ordering after seeing refcnt == 0. */
> > +   membar_sync();
> >  }
> >  
> >  void
> > 
> > 
> 



Provide memory barriers in refcnt_rele() and refcnt_finalize()

2022-03-13 Thread Visa Hankala
This makes the refcnt implementation issue memory barriers when
releasing references, splitting memory activity cleanly into preceding
and succeeding stages around refcnt 1->0 transition.

I think the while loop could be optimized a little by re-reading
r->r_refs just after sleep_finish(). That should avoid re-entering
the SCHED_LOCK()'ed section. However, that is not part of this patch.

OK?

Index: share/man/man9/refcnt_init.9
===
RCS file: src/share/man/man9/refcnt_init.9,v
retrieving revision 1.1
diff -u -p -r1.1 refcnt_init.9
--- share/man/man9/refcnt_init.911 Sep 2015 19:13:22 -  1.1
+++ share/man/man9/refcnt_init.913 Mar 2022 11:40:12 -
@@ -68,6 +68,18 @@ There may only be one caller to
 per refcnt
 .Fa r .
 .Pp
+.Fn refcnt_rele ,
+.Fn refcnt_rele_wake
+and
+.Fn refcnt_finalize
+provide release memory ordering.
+The caller's prior memory loads and stores are completed
+before the reference is released.
+The functions provide acquire memory ordering after all the references
+have been released.
+This ensures the object's destructor sees all updates
+done during the lifetime of the object.
+.Pp
 .Fn REFCNT_INITIALIZER
 initialises a declaration of a refcnt to 1.
 .Sh CONTEXT
Index: sys/kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.183
diff -u -p -r1.183 kern_synch.c
--- sys/kern/kern_synch.c   10 Mar 2022 15:21:08 -  1.183
+++ sys/kern/kern_synch.c   13 Mar 2022 11:40:13 -
@@ -825,10 +825,16 @@ refcnt_rele(struct refcnt *r)
 {
u_int refcnt;
 
+   membar_exit_before_atomic();
refcnt = atomic_dec_int_nv(&r->r_refs);
KASSERT(refcnt != ~0);
 
-   return (refcnt == 0);
+   if (refcnt == 0) {
+   membar_enter_after_atomic();
+   return (1);
+   }
+
+   return (0);
 }
 
 void
@@ -844,12 +850,20 @@ refcnt_finalize(struct refcnt *r, const 
struct sleep_state sls;
u_int refcnt;
 
+   membar_exit_before_atomic();
refcnt = atomic_dec_int_nv(&r->r_refs);
+   if (refcnt == 0) {
+   membar_enter_after_atomic();
+   return;
+   }
+
while (refcnt) {
sleep_setup(&sls, r, PWAIT, wmesg, 0);
refcnt = atomic_load_int(&r->r_refs);
sleep_finish(&sls, refcnt);
}
+   /* Provide acquire ordering after seeing refcnt == 0. */
+   membar_sync();
 }
 
 void



Provide memory barriers in FRELE()

2022-03-13 Thread Visa Hankala
This is a slightly tweaked version of the FRELE() memory barrier patch.
The barriers aim to provide clearer memory access behaviour around the
f_count 1->0 transition. With the barriers, the transition splits the
memory activity into preceding and succeeding stages that do not
overlap.

OK?

Index: share/man/man9/file.9
===
RCS file: src/share/man/man9/file.9,v
retrieving revision 1.22
diff -u -p -r1.22 file.9
--- share/man/man9/file.9   3 Jan 2020 05:37:00 -   1.22
+++ share/man/man9/file.9   13 Mar 2022 11:40:12 -
@@ -148,6 +148,14 @@ The function
 .Fn FRELE
 decreases the use count, and releases the file descriptor if the use count
 becomes zero.
+.Pp
+.Fn FRELE
+provides release memory ordering.
+Prior memory loads and stores are completed before the use count is decreased.
+After the use count drops to zero,
+.Fn FRELE
+enforces acquire memory ordering so that the file release code sees all updates
+done during the lifetime of the file.
 .Sh CODE REFERENCES
 The majority of those functions are implemented in
 .Pa sys/kern/kern_descrip.c .
Index: sys/kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.205
diff -u -p -r1.205 kern_descrip.c
--- sys/kern/kern_descrip.c 20 Jan 2022 11:06:57 -  1.205
+++ sys/kern/kern_descrip.c 13 Mar 2022 11:40:13 -
@@ -1268,7 +1268,12 @@ fdrop(struct file *fp, struct proc *p)
 {
int error;
 
-   KASSERTMSG(fp->f_count == 0, "count (%u) != 0", fp->f_count);
+   membar_exit_before_atomic();
+   if (atomic_dec_int_nv(&fp->f_count) > 0)
+   return 0;
+
+   /* Provide acquire ordering after f_count 1->0 transition. */
+   membar_enter_after_atomic();
 
mtx_enter();
if (fp->f_iflags & FIF_INSERTED)
Index: sys/sys/file.h
===
RCS file: src/sys/sys/file.h,v
retrieving revision 1.65
diff -u -p -r1.65 file.h
--- sys/sys/file.h  20 Jan 2022 03:43:31 -  1.65
+++ sys/sys/file.h  13 Mar 2022 11:40:13 -
@@ -113,8 +113,7 @@ struct file {
atomic_inc_int(&(fp)->f_count); \
} while (0)
 
-#define FRELE(fp,p) \
-   (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0)
+#define FRELE(fp, p)   fdrop(fp, p)
 
 #define FDUP_MAX_COUNT (UINT_MAX - 2 * MAXCPUS)
 



Re: atomic read write

2022-03-11 Thread Visa Hankala
On Fri, Mar 11, 2022 at 11:51:31AM +0100, Alexander Bluhm wrote:
> On Fri, Mar 11, 2022 at 05:32:11AM +0000, Visa Hankala wrote:
> > On Thu, Mar 10, 2022 at 07:17:43PM +0100, Alexander Bluhm wrote:
> > > On Thu, Mar 10, 2022 at 04:39:49PM +0100, Alexander Bluhm wrote:
> > > > > Below is a patch that shows how to accomplish release semantics with
> > > > > the file and refcnt APIs. (The added memory barriers ensure that the
> > > > > CPU completes its loads and stores to the object before dropping the
> > > > > reference. Another CPU might delete the object immediately after.
> > > > > The barrier might not be strictly necessary with refcnt_finalize(),
> > > > > though.)
> > > 
> > > The enter and exit membars that protect the critical section should
> > > be symmetric.  Maybe this diff is better.
> > 
> > No, symmetry is not always present. See below.
> 
> In general if you want to move data from one CPU to another, you
> have to push them out and pull them in.  That is where the symetry
> comes from and it should be reflected in code.  At least that is
> my understanding, but understanding the whole topic is hard.

A skeptical take on memory barriers is that they do not push anything;
they only make certain things happen in a specific order but the exact
time frame remains open. To ensure immediacy, the code needs to use
a read-modify-write atomic operation or something similar that the
processor cannot delay.

For example, on octeon, membar_producer() prevents subsequent writes
from getting reordered with earlier writes in the write buffer.
However, the operation does not block the pipeline, it takes effect
in the background.
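
As a concrete example, the classic publish pattern membar_producer()
is meant for (a sketch; the structure and variable names are made up):

struct item {
        int     i_ready;
};

struct item *published_item;            /* read locklessly elsewhere */

void
publish_item(struct item *it)
{
        it->i_ready = 1;                /* initialise the object... */
        membar_producer();              /* ...before the pointer store is seen */
        published_item = it;            /* readers pair with a consumer barrier */
}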

> I came to the conclusion that refcnt does not need membars at all.
> It does not protect other data, it is only looking at one counter
> variable.  When using refcnt, it protects the livetime of an object.
> For the data you need another lock which brings its own barriers.

To make a reasonable API, one should consider the intended usage.
Putting the barriers inside refcnt code keeps the API sane. Otherwise
callers would have to remember to issue barriers, which would also be
wasteful on systems where RMW atomics do enforce memory order.

> Not sure about the purpose of cond.  Maybe membar_enter/exit is the
> way to go.  Does it guarantee anything about memory access?

The caller of cond_signal() has state that the caller of cond_wait()
wants to see.

cond_signal() should make the (local) state visible to other CPUs
before the clearing of c_wait becomes visible. membar_exit() does
that.

cond_wait() should prevent premature peeking into the data from
happening before the clearing of c_wait has been seen. I think
membar_sync(), and not membar_enter(), is the right choice.

My earlier suggestion about membar_enter() is wrong. This barrier
orders subsequent reads and writes relative to earlier writes. The
pivot in cond_wait() is a read!

> > > And to avoid memory barriers the nobody understands we should convert
> > > FRELE to refcnt_rele.
> > 
> > I am not sure about that. I think as long as file reference counting
> > does unusual things that refcnt does not implement, f_count handling
> > should be separate.
> 
> Let's keep FRELE out of the discussion.  Either it works as it is
> or it should use suitable primitives.  But please no membars in the
> file system code.

I believe the membars are necessary if f_count is updated using atomic
operations. An alternative is to wrap the reference count updates with
a mutex which invokes membars internally but with increased total cost.

Below is an updated patch that acknowledges the acquire semantics
aspect of FRELE() when reference count has dropped to zero. The
destructor wants to see the (aggregate) state that follows the
f_count 1->0 transition.

Index: kern/kern_descrip.c
===
RCS file: src/sys/kern/kern_descrip.c,v
retrieving revision 1.205
diff -u -p -r1.205 kern_descrip.c
--- kern/kern_descrip.c 20 Jan 2022 11:06:57 -  1.205
+++ kern/kern_descrip.c 11 Mar 2022 14:17:22 -
@@ -1268,7 +1268,16 @@ fdrop(struct file *fp, struct proc *p)
 {
int error;
 
-   KASSERTMSG(fp->f_count == 0, "count (%u) != 0", fp->f_count);
+   membar_exit_before_atomic();
+   if (atomic_dec_int_nv(&fp->f_count) > 0)
+   return 0;
+
+   /*
+* Make this CPU see the latest state relative to f_count updates.
+* Note this memory barrier is redundant because the following
+* critical section provides an acquire-release barrier.
+*/
+   /* membar_enter_after_atomic(); */
 
mtx_enter();
if (fp->f_iflags & FIF_INSERTED)
Index: sys/file.h
==

Re: atomic read write

2022-03-10 Thread Visa Hankala
On Thu, Mar 10, 2022 at 07:17:43PM +0100, Alexander Bluhm wrote:
> On Thu, Mar 10, 2022 at 04:39:49PM +0100, Alexander Bluhm wrote:
> > > Below is a patch that shows how to accomplish release semantics with
> > > the file and refcnt APIs. (The added memory barriers ensure that the
> > > CPU completes its loads and stores to the object before dropping the
> > > reference. Another CPU might delete the object immediately after.
> > > The barrier might not be strictly necessary with refcnt_finalize(),
> > > though.)
> 
> The enter and exit membars that protect the critical section should
> be symmetric.  Maybe this diff is better.

No, symmetry is not always present. See below.

> And to avoid memory barriers the nobody understands we should convert
> FRELE to refcnt_rele.

I am not sure about that. I think as long as file reference counting
does unusual things that refcnt does not implement, f_count handling
should be separate.

> Index: kern/kern_synch.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.183
> diff -u -p -r1.183 kern_synch.c
> --- kern/kern_synch.c 10 Mar 2022 15:21:08 -  1.183
> +++ kern/kern_synch.c 10 Mar 2022 18:11:39 -
> @@ -805,6 +805,7 @@ void
>  refcnt_init(struct refcnt *r)
>  {
>   atomic_store_int(>r_refs, 1);
> + membar_enter_after_atomic();
>  }

I think membar is unnecessary here. Concurrent access can only happen
after the object has been "published", and the publishing should have
the appropriate memory barriers.

Without proper publishing, the code is prone to race conditions.

Also note that membar_enter_after_atomic() is not valid with
atomic_store_* or atomic_load_*. See my comment about cond_signal().

>  
>  void
> @@ -818,6 +819,7 @@ refcnt_take(struct refcnt *r)
>  #else
>   atomic_inc_int(>r_refs);
>  #endif
> + membar_enter_after_atomic();
>  }

This is unnecessary. The caller already has a reference to the object.
refcnt_take() only has the intent of increasing the reference count.

>  
>  int
> @@ -825,6 +827,7 @@ refcnt_rele(struct refcnt *r)
>  {
>   u_int refcnt;
>  
> + membar_exit_before_atomic();
>   refcnt = atomic_dec_int_nv(>r_refs);
>   KASSERT(refcnt != ~0);
>  
> @@ -844,6 +847,7 @@ refcnt_finalize(struct refcnt *r, const 
>   struct sleep_state sls;
>   u_int refcnt;
>  
> + membar_exit_before_atomic();
>   refcnt = atomic_dec_int_nv(>r_refs);
>   while (refcnt) {
>   sleep_setup(, r, PWAIT, wmesg, 0);
> @@ -856,11 +860,13 @@ void
>  cond_init(struct cond *c)
>  {
>   atomic_store_int(>c_wait, 1);
> + membar_enter_after_atomic();
>  }

Same point here as with refcnt_init().

>  
>  void
>  cond_signal(struct cond *c)
>  {
> + membar_exit_before_atomic();
>   atomic_store_int(>c_wait, 0);

This should use membar_exit(). membar_exit_before_atomic() is valid
only when accompanied with a true read-modify-write atomic operation.

The atomic_ prefix with the store and load instructions confuses this
somewhat.

The wakeup call that follows provides a membar function, but it comes
too late as c_wait has already been cleared.

>  
>   wakeup_one(c);
> @@ -872,9 +878,11 @@ cond_wait(struct cond *c, const char *wm
>   struct sleep_state sls;
>   unsigned int wait;
>  
> + membar_exit_before_atomic();
>   wait = atomic_load_int(>c_wait);
>   while (wait) {
>   sleep_setup(, c, PWAIT, wmesg, 0);
> + membar_exit_before_atomic();
>   wait = atomic_load_int(>c_wait);
>   sleep_finish(, wait);
>   }
> 

I think this should use membar_enter() after the loop.

cond_wait() is supposed to provide acquire semantics; once cond_wait()
returns, the CPU sees a state that is at least as recent as the one
that the caller of cond_signal() saw.

In a way, cond_wait() is similar to lock acquisition, and cond_signal()
similar to lock release.



Re: atomic read write

2022-03-10 Thread Visa Hankala
On Thu, Mar 10, 2022 at 10:45:47AM +, Laurence Tratt wrote:
> On Thu, Mar 10, 2022 at 09:05:54AM +0000, Visa Hankala wrote:
> 
> Hello Visa,
> 
> > In general, atomic_* functions have not provided implicit memory
> > barriers on OpenBSD.
> 
> I've used atomics fairly extensively in other settings. Forgive me if I'm
> explaining the obvious, but I had a devil of a job making sense of this
> stuff a few years back, and so perhaps others might find it useful to expand
> on this point.
> 
> Quick background: modern CPUs come in two main flavours, weakly ordered
> (e.g. most Arm systems) and strongly ordered (e.g. x86), which determine the
> rules of when multiple cores can see the reads/writes of other cores. Weakly
> ordered systems can move/optimise loads/stores around more than strongly
> ordered systems (so code that seems to work fine on x86 can then fail on
> Arm).
> 
> There are in a sense two "safe" ways to use atomics: to assume that each
> atomic is isolated and that reading/writing to it tells you nothing about any
> other location in memory; or that every atomic is fully ordered with respect
> to every other atomic (i.e. no reorderings of atomic operations are allowed).
> The former is fast but (without additional operations) can't even express
> a mutex safely. The latter doesn't have very good performance.
> 
> C11 thus allows you to do various atomic operations with different memory
> orderings [1] so that you can choose on a case-by-case basis what you're
> prepared to tolerate. "relaxed" is the most efficient but has the least
> guarantees; "seq_cst" is the least efficient but has the most guarantees.
> 
> I would be very nervous about adding further atomic functions (as in the
> original diff) to OpenBSD that don't allow the user to specify what ordering
> they want: it's impossible to pick a memory ordering that suits all use
> cases. For example, neither READ_ONCE nor the existing atomic_* instructions
> define an ordering: I suppose I'd have to to assume they're relaxed. I worry
> that people might assume these atomic operations provide greater guarantees
> than they actually do.

My understanding is that OpenBSD's atomic_* instructions are relaxed
in terms of memory order. To accomplish a specific form of semantics,
the user adds the appropriate memory barrier instruction. Well, this
is the plan at least, I think.

Below is a patch that shows how to accomplish release semantics with
the file and refcnt APIs. (The added memory barriers ensure that the
CPU completes its loads and stores to the object before dropping the
reference. Another CPU might delete the object immediately after.
The barrier might not be strictly necessary with refcnt_finalize(),
though.)

> Fortunately since, AFAICT, we already use C11 (or C17?!) for base, and LLVM
> includes all of the relevant functions (e.g. the compare_exchange family
> [2]) I don't think we need to add any functions of our own? It might not
> even be a bad idea to deprecate the current atomic_* functions in base
> and migrate to the C11 alternatives?

Some of the architectures are still using GCC 4.2.1.

Base does not use C11 at the moment.
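
For reference only, since base does not use C11: the per-operation
ordering choice described in the quoted message looks like this with
<stdatomic.h> (a plain spinlock-style sketch, not OpenBSD code):

#include <stdatomic.h>
#include <stdbool.h>

static bool
try_lock(atomic_int *l)
{
        int expected = 0;

        /* acquire ordering on success, relaxed ordering on failure */
        return atomic_compare_exchange_weak_explicit(l, &expected, 1,
            memory_order_acquire, memory_order_relaxed);
}

static void
unlock(atomic_int *l)
{
        atomic_store_explicit(l, 0, memory_order_release);
}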


Index: kern/kern_synch.c
===
RCS file: src/sys/kern/kern_synch.c,v
retrieving revision 1.182
diff -u -p -r1.182 kern_synch.c
--- kern/kern_synch.c   19 Feb 2022 23:56:18 -  1.182
+++ kern/kern_synch.c   10 Mar 2022 13:37:50 -
@@ -825,6 +825,7 @@ refcnt_rele(struct refcnt *r)
 {
u_int refcnt;
 
+   membar_exit_before_atomic();
refcnt = atomic_dec_int_nv(&r->refs);
KASSERT(refcnt != ~0);
 
@@ -844,6 +845,7 @@ refcnt_finalize(struct refcnt *r, const 
struct sleep_state sls;
u_int refcnt;
 
+   membar_exit_before_atomic();
refcnt = atomic_dec_int_nv(&r->refs);
while (refcnt) {
sleep_setup(&sls, r, PWAIT, wmesg, 0);
Index: sys/file.h
===
RCS file: src/sys/sys/file.h,v
retrieving revision 1.65
diff -u -p -r1.65 file.h
--- sys/file.h  20 Jan 2022 03:43:31 -  1.65
+++ sys/file.h  10 Mar 2022 13:37:50 -
@@ -36,6 +36,7 @@
 #include 
 
 #else /* _KERNEL */
+#include <sys/atomic.h>
 #include 
 #include 
 #endif /* _KERNEL */
@@ -113,13 +114,21 @@ struct file {
atomic_inc_int(&(fp)->f_count); \
} while (0)
 
-#define FRELE(fp,p) \
-   (atomic_dec_int_nv(&fp->f_count) == 0 ? fdrop(fp, p) : 0)
-
 #define FDUP_MAX_COUNT (UINT_MAX - 2 * MAXCPUS)
 
 intfdrop(struct file *, struct proc *);
 
+static inline int
+FRELE(struct file *fp, struct proc *p)
+{
+   int error = 0;
+
+   membar_exit_before_atomic();
+   if (atomic_dec_int_nv(&fp->f_count) == 0)
+   error = fdrop(fp, p);
+   return (error);
+}
+
 static inline off_t
 foffset(struct file *fp)
 {



Re: atomic read write

2022-03-10 Thread Visa Hankala
On Wed, Mar 09, 2022 at 08:45:35PM +0100, Alexander Bluhm wrote:
> On Tue, Mar 08, 2022 at 04:55:56PM +0100, Alexander Bluhm wrote:
> > Once we had the discussion where we need the READ_ONCE() macro.  As
> > modern C compiler has much freedom how to access memory, I came to
> > the conclusion that it would be wise to use READ_ONCE() and
> > WRITE_ONCE() everywhere when we use atomic operations variables.
> > Using atomic operations on one side and do whatever the compiler
> > thinks at the other side of the variable feels wrong.
> > 
> > The rule use READ_ONCE, WRITE_ONCE, atomic_inc, atomic_dec consistently
> > would be easy to follow.  Thinking about where the compiler might
> > reorder things and break MP code is much more complicated.
> >
> > Do we want to go this direction?
> 
> mvs@ mentioned that FreeBSD has atomic load and store instructions
> for that.  I decided to implement them as static inline functions
> as they provide stronger type checks.  Also I add them for int and
> long only, everything else is not atomic.
> 
> > If yes, here is the change for struct refcnt and cond.  While there
> > rename the field to r_refs which is easier to grep.
> 
> Note that the _init functions do not need atomic operations.  But
> the whole idea is to make it consistent and have a simple rule.  If
> an MP variable locking is marked as atomic, use the atomic_ functions.
> 
> As a bonus alpha gets the membar it needs.

In general, atomic_* functions have not provided implicit memory
barriers on OpenBSD.

I am not sure if the data dependency barrier is needed where
atomic_load_int() and atomic_load_long() are used. The memory ordering
guarantee is very weak and does not seem useful in any of the use cases
in the patch. However, the barrier does not appear to make things worse
in terms of correctness. Except maybe in assertions where they cause
subtle side effects.

However, the patch looks good.

OK visa@

> Index: sys/dev/pci/if_iwm.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_iwm.c,v
> retrieving revision 1.391
> diff -u -p -r1.391 if_iwm.c
> --- sys/dev/pci/if_iwm.c  8 Feb 2022 14:24:36 -   1.391
> +++ sys/dev/pci/if_iwm.c  9 Mar 2022 18:52:52 -
> @@ -9975,7 +9975,7 @@ iwm_init(struct ifnet *ifp)
>  
>   generation = ++sc->sc_generation;
>  
> - KASSERT(sc->task_refs.refs == 0);
> + KASSERT(atomic_load_int(>task_refs.r_refs) == 0);
>   refcnt_init(>task_refs);
>  
>   err = iwm_preinit(sc);
> @@ -10116,7 +10116,7 @@ iwm_stop(struct ifnet *ifp)
>   iwm_del_task(sc, systq, >mac_ctxt_task);
>   iwm_del_task(sc, systq, >phy_ctxt_task);
>   iwm_del_task(sc, systq, >bgscan_done_task);
> - KASSERT(sc->task_refs.refs >= 1);
> + KASSERT(atomic_load_int(>task_refs.r_refs) >= 1);
>   refcnt_finalize(>task_refs, "iwmstop");
>  
>   iwm_stop_device(sc);
> Index: sys/dev/pci/if_iwx.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/pci/if_iwx.c,v
> retrieving revision 1.134
> diff -u -p -r1.134 if_iwx.c
> --- sys/dev/pci/if_iwx.c  21 Jan 2022 15:51:02 -  1.134
> +++ sys/dev/pci/if_iwx.c  9 Mar 2022 18:53:50 -
> @@ -8017,7 +8017,7 @@ iwx_init(struct ifnet *ifp)
>   if (sc->sc_nvm.sku_cap_11n_enable)
>   iwx_setup_ht_rates(sc);
>  
> - KASSERT(sc->task_refs.refs == 0);
> + KASSERT(atomic_load_int(>task_refs.r_refs) == 0);
>   refcnt_init(>task_refs);
>   ifq_clr_oactive(>if_snd);
>   ifp->if_flags |= IFF_RUNNING;
> @@ -8139,7 +8139,7 @@ iwx_stop(struct ifnet *ifp)
>   iwx_del_task(sc, systq, >mac_ctxt_task);
>   iwx_del_task(sc, systq, >phy_ctxt_task);
>   iwx_del_task(sc, systq, >bgscan_done_task);
> - KASSERT(sc->task_refs.refs >= 1);
> + KASSERT(atomic_load_int(>task_refs.r_refs) >= 1);
>   refcnt_finalize(>task_refs, "iwxstop");
>  
>   iwx_stop_device(sc);
> Index: sys/kern/kern_synch.c
> ===
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.182
> diff -u -p -r1.182 kern_synch.c
> --- sys/kern/kern_synch.c 19 Feb 2022 23:56:18 -  1.182
> +++ sys/kern/kern_synch.c 9 Mar 2022 18:57:53 -
> @@ -804,7 +804,7 @@ sys___thrwakeup(struct proc *p, void *v,
>  void
>  refcnt_init(struct refcnt *r)
>  {
> - r->refs = 1;
> + atomic_store_int(>r_refs, 1);
>  }
>  
>  void
> @@ -813,10 +813,10 @@ refcnt_take(struct refcnt *r)
>  #ifdef DIAGNOSTIC
>   u_int refcnt;
>  
> - refcnt = atomic_inc_int_nv(>refs);
> + refcnt = atomic_inc_int_nv(>r_refs);
>   KASSERT(refcnt != 0);
>  #else
> - atomic_inc_int(>refs);
> + atomic_inc_int(>r_refs);
>  #endif
>  }
>  
> @@ -825,7 +825,7 @@ refcnt_rele(struct refcnt *r)
>  {
>   u_int refcnt;
>  
> - refcnt = atomic_dec_int_nv(>refs);
> +   

Re: Add kernel stack trace saving for riscv64

2022-03-09 Thread Visa Hankala
On Wed, Mar 09, 2022 at 08:40:50AM +0100, Jeremie Courreges-Anglas wrote:
> On Tue, Mar 08 2022, Visa Hankala  wrote:
> > This patch adds kernel stack trace saving for riscv64, for the benefit
> > of dt(4) and witness(4).
> 
> Nice!
> 
> > The unwinder is slow because of the symbol
> > lookup, but this can be tweaked later.
> 
> A dumb approach that appears to work: add
> cpu_exception_handler_supervisor_end and cpu_exception_handler_user_end
> symbols, and perform a range check.

Yes, using extra symbols and checking ranges has been an option. It
does not look nice but seems less intrusive than making the exception
entry points preserve, or rather improvise, the chain of callframes.
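
A rough sketch of that range check; the *_end labels are the ones
proposed above and would still have to be added to the assembly:

extern char cpu_exception_handler_supervisor[];
extern char cpu_exception_handler_supervisor_end[];
extern char cpu_exception_handler_user[];
extern char cpu_exception_handler_user_end[];

static inline int
pc_is_exception_entry(vaddr_t pc)
{
        return ((pc >= (vaddr_t)cpu_exception_handler_supervisor &&
            pc < (vaddr_t)cpu_exception_handler_supervisor_end) ||
            (pc >= (vaddr_t)cpu_exception_handler_user &&
            pc < (vaddr_t)cpu_exception_handler_user_end));
}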

I have to withdraw the patch for now as it does not seem reliable.



Add kernel stack trace saving for riscv64

2022-03-08 Thread Visa Hankala
This patch adds kernel stack trace saving for riscv64, for the benefit
of dt(4) and witness(4). The unwinder is slow because of the symbol
lookup, but this can be tweaked later.

The limit variable prevents the unwinder from using user-controllable
register values. The limit has to reflect the kernel stack setup in
cpu_fork(). To ensure consistency, the stack start address is stored
in a variable in struct pcb.

OK?

Index: arch/riscv64/include/pcb.h
===
RCS file: src/sys/arch/riscv64/include/pcb.h,v
retrieving revision 1.3
diff -u -p -r1.3 pcb.h
--- arch/riscv64/include/pcb.h  30 Jun 2021 22:20:56 -  1.3
+++ arch/riscv64/include/pcb.h  8 Mar 2022 16:54:58 -
@@ -39,5 +39,6 @@ struct pcb {
 
caddr_t pcb_onfault;    // On fault handler
struct fpreg    pcb_fpstate;    // Floating Point state */
+   register_t  pcb_kstack; /* kernel stack address */
 };
 #endif /* _MACHINE_PCB_H_ */
Index: arch/riscv64/riscv64/db_trace.c
===
RCS file: src/sys/arch/riscv64/riscv64/db_trace.c,v
retrieving revision 1.5
diff -u -p -r1.5 db_trace.c
--- arch/riscv64/riscv64/db_trace.c 22 Feb 2022 07:46:04 -  1.5
+++ arch/riscv64/riscv64/db_trace.c 8 Mar 2022 16:54:58 -
@@ -141,3 +141,56 @@ db_stack_trace_print(db_expr_t addr, int
}
(*pr)("end trace frame: 0x%lx, count: %d\n", frame, count);
 }
+
+void
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
+{
+   struct callframe *frame, *lastframe, *limit;
+   struct pcb *pcb = curpcb;
+   Elf_Sym *sym;
+   db_expr_t diff;
+   vaddr_t ra, subr;
+
+   st->st_count = 0;
+
+   if (pcb == NULL)
+   return;
+
+   ra = (vaddr_t)__builtin_return_address(0);
+   frame = (struct callframe *)__builtin_frame_address(0);
+   KASSERT(INKERNEL(frame));
+   limit = (struct callframe *)((struct trapframe *)pcb->pcb_kstack - 1);
+
+   while (st->st_count < STACKTRACE_MAX) {
+   if (skip == 0)
+   st->st_pc[st->st_count++] = ra;
+   else
+   skip--;
+
+   sym = db_search_symbol(ra, DB_STGY_PROC, &diff);
+   if (sym == NULL)
+   break;
+   subr = ra - (vaddr_t)diff;
+
+   lastframe = frame;
+   if (subr == (vaddr_t)cpu_exception_handler_supervisor ||
+   subr == (vaddr_t)cpu_exception_handler_user) {
+   struct trapframe *tf = (struct trapframe *)frame;
+
+   frame = (struct callframe *)tf->tf_s[0];
+   ra = tf->tf_ra;
+   } else {
+   frame = frame[-1].f_frame;
+   if (frame == NULL)
+   break;
+   ra = frame[-1].f_ra;
+   }
+
+   if (frame <= lastframe)
+   break;
+   if (frame >= limit)
+   break;
+   if (!INKERNEL(ra))
+   break;
+   }
+}
Index: arch/riscv64/riscv64/vm_machdep.c
===
RCS file: src/sys/arch/riscv64/riscv64/vm_machdep.c,v
retrieving revision 1.10
diff -u -p -r1.10 vm_machdep.c
--- arch/riscv64/riscv64/vm_machdep.c   24 Feb 2022 14:19:10 -  1.10
+++ arch/riscv64/riscv64/vm_machdep.c   8 Mar 2022 16:54:58 -
@@ -75,13 +75,12 @@ cpu_fork(struct proc *p1, struct proc *p
 
pmap_activate(p2);
 
-   tf = (struct trapframe *)((u_long)p2->p_addr
+   pcb->pcb_kstack = STACKALIGN((u_long)p2->p_addr
+ USPACE
-   - sizeof(struct trapframe)
- sizeof(register_t)/* for holding curcpu */
- 0x10);
 
-   tf = (struct trapframe *)STACKALIGN(tf);
+   tf = (struct trapframe *)pcb->pcb_kstack - 1;
pcb->pcb_tf = tf;
*tf = *p1->p_addr->u_pcb.pcb_tf;
 



Re: pluart(4): fifo support

2022-03-08 Thread Visa Hankala
On Tue, Mar 08, 2022 at 08:04:36AM +0100, Anton Lindqvist wrote:
> On Mon, Mar 07, 2022 at 07:36:35AM +0000, Visa Hankala wrote:
> > I still think that checking TXFF and using the same code for both
> > SBSA and true PL011 UARTs would be the best choice. This would avoid
> > fragmenting the code and improve robustness by relying on functionality
> > that is common to the different controller variants.
> 
> Fair enough, new diff.

Maybe the comments should omit the FIFO space description and just
mention the lack of the level control register in the SBSA UART
register interface.

OK visa@
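
(For reference, the TXFF-based transmit loop discussed above boils
down to roughly the following. This is only a sketch, not part of
Anton's diff; p and n stand in for the output buffer pointer and byte
count, and it assumes the UART_FR/UART_FR_TXFF and UART_DR definitions
already present in pluart.c.)

	/*
	 * Fill the TX FIFO until the controller reports it full.
	 * The TXFF flag is available on both true PL011 and SBSA
	 * UARTs, so no UARTIFLS-specific knowledge is needed here.
	 */
	while (n > 0 && (bus_space_read_4(sc->sc_iot, sc->sc_ioh,
	    UART_FR) & UART_FR_TXFF) == 0) {
		bus_space_write_4(sc->sc_iot, sc->sc_ioh, UART_DR, *p++);
		n--;
	}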

> diff --git sys/dev/acpi/pluart_acpi.c sys/dev/acpi/pluart_acpi.c
> index dc8ea5e9922..08ebe13ffbc 100644
> --- sys/dev/acpi/pluart_acpi.c
> +++ sys/dev/acpi/pluart_acpi.c
> @@ -91,6 +91,8 @@ pluart_acpi_attach(struct device *parent, struct device 
> *self, void *aux)
>   return;
>   }
>  
> + sc->sc.sc_hwflags |= COM_HW_SBSA;
> +
>   pluart_attach_common(&sc->sc, pluart_acpi_is_console(sc));
>  }
>  
> diff --git sys/dev/fdt/pluart_fdt.c sys/dev/fdt/pluart_fdt.c
> index 7f17365f1d6..798250593bf 100644
> --- sys/dev/fdt/pluart_fdt.c
> +++ sys/dev/fdt/pluart_fdt.c
> @@ -69,6 +69,9 @@ pluart_fdt_attach(struct device *parent, struct device 
> *self, void *aux)
>   return;
>   }
>  
> + if (OF_is_compatible(faa->fa_node, "arm,sbsa-uart"))
> + sc->sc_hwflags |= COM_HW_SBSA;
> +
>   sc->sc_irq = fdt_intr_establish(faa->fa_node, IPL_TTY, pluart_intr,
>   sc, sc->sc_dev.dv_xname);
>  
> diff --git sys/dev/ic/pluart.c sys/dev/ic/pluart.c
> index eaa11b6c44b..457c88d8cad 100644
> --- sys/dev/ic/pluart.c
> +++ sys/dev/ic/pluart.c
> @@ -99,6 +99,13 @@
>  #define UART_CR_CTSE (1 << 14)   /* CTS hardware flow control 
> enable */
>  #define UART_CR_RTSE (1 << 15)   /* RTS hardware flow control 
> enable */
>  #define UART_IFLS    0x34    /* Interrupt FIFO level select 
> register */
> +#define UART_IFLS_RX_SHIFT   3   /* RX level in bits [5:3] */
> +#define UART_IFLS_TX_SHIFT   0   /* TX level in bits [2:0] */
> +#define UART_IFLS_1_8    0   /* FIFO 1/8 full */
> +#define UART_IFLS_1_4    1   /* FIFO 1/4 full */
> +#define UART_IFLS_1_2    2   /* FIFO 1/2 full */
> +#define UART_IFLS_3_4    3   /* FIFO 3/4 full */
> +#define UART_IFLS_7_8    4   /* FIFO 7/8 full */
>  #define UART_IMSC    0x38    /* Interrupt mask set/clear 
> register */
>  #define UART_IMSC_RIMIM  (1 << 0)
>  #define UART_IMSC_CTSMIM (1 << 1)
> @@ -115,8 +122,16 @@
>  #define UART_MIS     0x40    /* Masked interrupt status 
> register */
>  #define UART_ICR     0x44    /* Interrupt clear register */
>  #define UART_DMACR   0x48    /* DMA control register */
> +#define UART_PID0    0xfe0   /* Peripheral identification 
> register 0 */
> +#define UART_PID1    0xfe4   /* Peripheral identification 
> register 1 */
> +#define UART_PID2    0xfe8   /* Peripheral identification 
> register 2 */
> +#define UART_PID2_REV(x) (((x) & 0xf0) >> 4)
> +#define UART_PID3    0xfec   /* Peripheral identification 
> register 3 */
>  #define UART_SPACE   0x100
>  
> +#define UART_FIFO_SIZE   16
> +#define UART_FIFO_SIZE_R3    32
> +
>  void pluartcnprobe(struct consdev *cp);
>  void pluartcninit(struct consdev *cp);
>  int pluartcngetc(dev_t dev);
> @@ -150,7 +165,31 @@ struct cdevsw pluartdev =
>  void
>  pluart_attach_common(struct pluart_softc *sc, int console)
>  {
> - int maj;
> + int fifolen, maj;
> + int lcr;
> +
> + if ((sc->sc_hwflags & COM_HW_SBSA) == 0) {
> + int rev;
> +
> + rev = UART_PID2_REV(bus_space_read_4(sc->sc_iot, sc->sc_ioh,
> + UART_PID2));
> + if (rev < 3)
> + fifolen = UART_FIFO_SIZE;
> + else
> + fifolen = UART_FIFO_SIZE_R3;
> + printf(": rev %d, %d byte fifo\n", rev, fifolen);
> + } else {
> + /*
> +  * The SBSA UART is PL011 r1p5 compliant which implies revision
> +  * 3 with a 32 byte FIFO. However, we cannot expect to configure
> +  * RX/TX interrupt levels using the UARTIFLS register making it
> +  * impossible to make assumptions ab
