On Wed, Oct 27, 2021 at 09:02:08PM -0400, Dave Voutila wrote:
> 
> Dave Voutila <[email protected]> writes:
> 
> > Was tinkering on a bt(5) script for trying to debug an issue in vmm(4)
> > when I managed to start hitting a panic "wakeup: p_stat is 2" being
> > triggered by kqueue coming from the softnet kernel task.
> >
> > I'm running an amd64 kernel built from the tree today (latest CVS commit
> > id UynQo1r7kLKA0Q2p) with VMM_DEBUG option set and the defaults from
> > GENERIC.MP. Userland is from the latest amd snap.
> >
> > To reproduce, I'm running a single OpenBSD-current guest under vmd(8)
> > which I'm targeting with the following trivial btrace script I was
> > working on to use for debugging something in vmm(4):
> >
> > tracepoint:sched:sleep / pid == $1 && tid == $2 /{
> >   printf("pid %d, tid %d slept %d!\n", pid, tid, nsecs);
> > }
> >
> > tracepoint:sched:wakeup / pid == $1 && tid == $2 /{
> >   printf("pid %d, tid %d awoke %d!\n", pid, tid, nsecs);
> > }
> 
> Even easier reproduction: if you have 2 machines and can use tcpbench(1)
> between them, then while tcpbench is running target it with the above
> btrace script. I've found running the script, killing it with ctrl-c,
> and re-running it 2-3 times triggers the panic on my laptop.
> 
> >
> > Both times this happened I was trying to sysupgrade the vmd(8) guest
> > while running the above btrace script. When I don't run the script,
> > there is no panic.
> >
> > Image of the full backtrace is here: https://imgur.com/a/swW1qoj
> >
> > Simple transcript of the call stack after the panic() call looks like:
> >
> > wakeup_n
> > kqueue_wakeup
> > knote
> > selwakekup
> > tun_enqueue
> > ether_output
> > ip_output
> > ip_forward
> > ip_input_if
> > ipv4_input
> > ether_input
> > if_input_process
> >
> > The other 3 cpu cores appeared to be in ipi handlers. (Image in that
> > imgur link)

I think the problem is recursion within the scheduler. Trace points
invoke wakeup() directly, which is unsafe when done from within the
run queue routines.

One approach to avoid the recursion is to defer the wakeup() with
a soft interrupt. The scheduler runs at an elevated system priority
level that blocks soft interrupts. The deferred wakeup() is issued once
the system priority level turns low enough.

Unfortunately, the patch will get broken when someone adds trace points
to soft interrupt scheduling...

Index: dev/dt/dt_dev.c
===================================================================
RCS file: src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.16
diff -u -p -r1.16 dt_dev.c
--- dev/dt/dt_dev.c     25 Oct 2021 19:51:12 -0000      1.16
+++ dev/dt/dt_dev.c     28 Oct 2021 05:39:32 -0000
@@ -25,6 +25,8 @@
 
 #include <dev/dt/dtvar.h>
 
+#include <machine/intr.h>
+
 /*
  * Number of frames to skip in stack traces.
  *
@@ -85,6 +87,7 @@ struct dt_softc {
        pid_t                    ds_pid;        /* [I] PID of tracing program */
 
        struct mutex             ds_mtx;
+       void                    *ds_si;
 
        struct dt_pcb_list       ds_pcbs;       /* [K] list of enabled PCBs */
        struct dt_evt           *ds_bufqueue;   /* [K] copy evts to userland */
@@ -118,6 +121,7 @@ int dtread(dev_t, struct uio *, int);
 int    dtioctl(dev_t, u_long, caddr_t, int, struct proc *);
 
 struct dt_softc *dtlookup(int);
+void   dtwakeup(void *);
 
 int    dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *);
 int    dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *);
@@ -177,6 +181,10 @@ dtopen(dev_t dev, int flags, int mode, s
        sc->ds_readevt = 0;
        sc->ds_dropevt = 0;
 
+       sc->ds_si = softintr_establish(IPL_SOFTCLOCK, dtwakeup, sc);
+       if (sc->ds_si == NULL)
+               goto bad;
+
        SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next);
 
        DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid);
@@ -184,6 +192,8 @@ dtopen(dev_t dev, int flags, int mode, s
        return 0;
 
 bad:
+       free(sc->ds_bufqueue, M_DEVBUF,
+           sc->ds_bufqlen * sizeof(*sc->ds_bufqueue));
        free(sc, M_DEVBUF, sizeof(*sc));
        return ENOMEM;
 }
@@ -203,6 +213,8 @@ dtclose(dev_t dev, int flags, int mode, 
        dt_ioctl_record_stop(sc);
        dt_pcb_purge(&sc->ds_pcbs);
 
+       softintr_disestablish(sc->ds_si);
+
        free(sc->ds_bufqueue, M_DEVBUF,
            sc->ds_bufqlen * sizeof(*sc->ds_bufqueue));
        free(sc, M_DEVBUF, sizeof(*sc));
@@ -324,6 +336,14 @@ dtlookup(int unit)
        return sc;
 }
 
+void
+dtwakeup(void *arg)
+{
+       struct dt_softc *sc = arg;
+
+       wakeup(sc);
+}
+
 int
 dtioc_req_isvalid(struct dtioc_req *dtrq)
 {
@@ -686,7 +706,9 @@ dt_pcb_ring_consume(struct dt_pcb *dp, s
        mtx_enter(&dp->dp_sc->ds_mtx);
        dp->dp_sc->ds_evtcnt++;
        mtx_leave(&dp->dp_sc->ds_mtx);
-       wakeup(dp->dp_sc);
+
+       /* Defer wakeup() to avoid recursion within the scheduler. */
+       softintr_schedule(dp->dp_sc->ds_si);
 }
 
 /*

Reply via email to