On Sun, May 23, 2021 at 09:05:24AM +0000, Visa Hankala wrote:
>
> [...]
>
> Here is a revised patch with a slightly altered approach.
>
> I have replaced the per-CPU queues with system-wide queues to make
> the code simpler.
I think that's fine to start with. We don't have many softintrs. We
can go per-CPU later if there is an issue with performance.
> When a CPU starts processing a soft interrupt, it reserves the handler
> to prevent concurrent execution. If the soft interrupt gets rescheduled
> during processing, the handler is run again by the same CPU. This breaks
> FIFO ordering, though.
If you want to preserve FIFO, you can reinsert the handler at the queue
tail. That would be more fair.
If FIFO is the current behavior, I think we ought to keep it.
> softintr_disestablish() now calls sched_barrier() to drain processing.
> This should also prevent the deadlock with the kernel lock.
>
> It appeared tricky to add witness(4) integration. witness(4) treats the
> kernel lock as a sleepable lock to avoid ordering issues with rwlocks.
> This creates a dilemma in softintr_dispatch() because of nesting.
> The natural lock class for the required soft interrupt pseudo-lock is
> mutex, but that does not work if an MP-safe soft interrupt got preempted
> by a higher IPL non-MP-safe soft interrupt.
>
> The dilemma could be worked around by preventing nesting when witness(4)
> is enabled, altering the system's behaviour. Another approach is to skip
> the pseudo-lock handling when processing is nested.
Altering the system's behavior sounds like a bad idea.
> However, as soft interrupt handlers cannot use rwlocks, for deadlock
> prevention it should be sufficient to ensure that the caller of
> softintr_disestablish() does not hold spinning locks (the kernel lock
> excluded). Hence softintr_disestablish() calls assertwaitok() at the
> start. The assertion is actually implied by sched_barrier().
I like your implementation a lot better than mine.
Misc. thoughts below, though I don't have any real complaints.
> Index: arch/amd64/amd64/softintr.c
> ===================================================================
> RCS file: src/sys/arch/amd64/amd64/softintr.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 softintr.c
> --- arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -0000 1.10
> +++ arch/amd64/amd64/softintr.c 23 May 2021 08:56:28 -0000
> @@ -37,12 +37,33 @@
> #include <sys/param.h>
> #include <sys/systm.h>
> #include <sys/malloc.h>
> +#include <sys/mutex.h>
> +#include <sys/queue.h>
>
> #include <machine/intr.h>
>
> #include <uvm/uvm_extern.h>
>
> -struct x86_soft_intr x86_soft_intrs[X86_NSOFTINTR];
> +struct x86_soft_intrhand {
> + STAILQ_ENTRY(x86_soft_intrhand) sih_q;
> + void (*sih_fn)(void *);
> + void *sih_arg;
> + struct cpu_info *sih_runner;
> + int sih_which;
> + unsigned short sih_flags;
> + unsigned short sih_state;
> +};
> +
> +#define SIF_MPSAFE 0x0001
> +
> +#define SIS_DYING 0x0001
> +#define SIS_PENDING 0x0002
> +#define SIS_RESTART 0x0004
> +
> +STAILQ_HEAD(x86_soft_intr_queue, x86_soft_intrhand);
> +
> +struct x86_soft_intr_queue softintr_queue[X86_NSOFTINTR];
Why did we switch to STAILQ? I know we don't have very many
softintr_disestablish() calls, but isn't O(1) removal worth the extra
pointer?
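If sih_q were a TAILQ_ENTRY (with TAILQ_HEAD queues), the removal in
softintr_disestablish() would need no list walk, roughly:

	if (sih->sih_state & SIS_PENDING) {
		sih->sih_state &= ~SIS_PENDING;
		/* O(1): TAILQ_REMOVE() unlinks the element directly */
		TAILQ_REMOVE(&softintr_queue[sih->sih_which], sih, sih_q);
	}

whereas STAILQ_REMOVE() has to walk the queue to find the predecessor.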
> +struct mutex softintr_lock = MUTEX_INITIALIZER(IPL_HIGH);
>
> const int x86_soft_intr_to_ssir[X86_NSOFTINTR] = {
> SIR_CLOCK,
> @@ -58,15 +79,10 @@ const int x86_soft_intr_to_ssir[X86_NSOF
> void
> softintr_init(void)
> {
> - struct x86_soft_intr *si;
> int i;
>
> - for (i = 0; i < X86_NSOFTINTR; i++) {
> - si = &x86_soft_intrs[i];
> - TAILQ_INIT(&si->softintr_q);
> - mtx_init(&si->softintr_lock, IPL_HIGH);
> - si->softintr_ssir = x86_soft_intr_to_ssir[i];
> - }
> + for (i = 0; i < X86_NSOFTINTR; i++)
> + STAILQ_INIT(&softintr_queue[i]);
Nit: nitems(), if you want.
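i.e.

	for (i = 0; i < nitems(softintr_queue); i++)
		STAILQ_INIT(&softintr_queue[i]);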
> }
>
> /*
> @@ -78,31 +94,45 @@ void
> softintr_dispatch(int which)
> {
> struct cpu_info *ci = curcpu();
> - struct x86_soft_intr *si = &x86_soft_intrs[which];
> + struct x86_soft_intr_queue *queue = &softintr_queue[which];
> struct x86_soft_intrhand *sih;
> int floor;
>
> floor = ci->ci_handled_intr_level;
> ci->ci_handled_intr_level = ci->ci_ilevel;
>
> - KERNEL_LOCK();
> - for (;;) {
> - mtx_enter(&si->softintr_lock);
> - sih = TAILQ_FIRST(&si->softintr_q);
> - if (sih == NULL) {
> - mtx_leave(&si->softintr_lock);
> - break;
> + mtx_enter(&softintr_lock);
> + while ((sih = STAILQ_FIRST(queue)) != NULL) {
> + KASSERT((sih->sih_state & (SIS_PENDING | SIS_RESTART)) ==
> + SIS_PENDING);
> + KASSERT(sih->sih_runner == NULL);
> +
> + sih->sih_state &= ~SIS_PENDING;
Nit: you could use CLR(), ISSET(), etc., if you want. I won't mention
it again though.
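For example,

	CLR(sih->sih_state, SIS_PENDING);

instead of the &= ~SIS_PENDING, and ISSET() for the flag tests.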
> + STAILQ_REMOVE_HEAD(queue, sih_q);
> + sih->sih_runner = ci;
> +restart:
> + mtx_leave(&softintr_lock);
> +
> + if (sih->sih_flags & SIF_MPSAFE) {
> + (*sih->sih_fn)(sih->sih_arg);
> + } else {
> + KERNEL_LOCK();
> + (*sih->sih_fn)(sih->sih_arg);
> + KERNEL_UNLOCK();
> }
> - TAILQ_REMOVE(&si->softintr_q, sih, sih_q);
> - sih->sih_pending = 0;
>
> + mtx_enter(&softintr_lock);
> uvmexp.softs++;
>
> - mtx_leave(&si->softintr_lock);
> + KASSERT((sih->sih_state & SIS_PENDING) == 0);
> + if (sih->sih_state & SIS_RESTART) {
> + sih->sih_state &= ~SIS_RESTART;
> + goto restart;
> + }
So, here is where you would reinsert if you wanted to keep FIFO.
Something like:

		if (sih->sih_state & SIS_RESTART) {
			sih->sih_state &= ~SIS_RESTART;
			sih->sih_state |= SIS_PENDING;
			STAILQ_INSERT_TAIL(queue, sih, sih_q);
		}
>
> - (*sih->sih_fn)(sih->sih_arg);
> + sih->sih_runner = NULL;
.. of course you would need to clear sih_runner before reinsertion.
Actually, I think you ought to clear sih_runner immediately after you
reenter the mutex. The symmetry will make an audit easier.
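Putting that together with the reinsertion above, the tail of the loop
might read roughly (untested):

		mtx_enter(&softintr_lock);
		sih->sih_runner = NULL;
		uvmexp.softs++;

		KASSERT((sih->sih_state & SIS_PENDING) == 0);
		if (sih->sih_state & SIS_RESTART) {
			sih->sih_state &= ~SIS_RESTART;
			sih->sih_state |= SIS_PENDING;
			STAILQ_INSERT_TAIL(queue, sih, sih_q);
		}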
> }
> - KERNEL_UNLOCK();
> + mtx_leave(&softintr_lock);
>
> ci->ci_handled_intr_level = floor;
> }
> @@ -115,9 +145,13 @@ softintr_dispatch(int which)
> void *
> softintr_establish(int ipl, void (*func)(void *), void *arg)
> {
> - struct x86_soft_intr *si;
> struct x86_soft_intrhand *sih;
> int which;
> + unsigned short flags = 0;
> +
> + if (ipl & IPL_MPSAFE)
> + flags |= SIF_MPSAFE;
> + ipl &= ~IPL_MPSAFE;
So IPL_MPSAFE is a flag that gets OR'd into an integer IPL level? Seems
kinda messy. I guess if we already do it elsewhere it's fine.
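If it follows what the hardware interrupt code does with IPL_MPSAFE, a
caller would look something like this (driver names made up):

	/* hypothetical driver establishing an MP-safe soft interrupt */
	sc->sc_softih = softintr_establish(IPL_SOFTNET | IPL_MPSAFE,
	    mydrv_softintr, sc);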
>
> switch (ipl) {
> case IPL_SOFTCLOCK:
> @@ -137,14 +171,14 @@ softintr_establish(int ipl, void (*func)
> panic("softintr_establish");
> }
>
> - si = &x86_soft_intrs[which];
> -
> sih = malloc(sizeof(*sih), M_DEVBUF, M_NOWAIT);
> if (__predict_true(sih != NULL)) {
> - sih->sih_intrhead = si;
> sih->sih_fn = func;
> sih->sih_arg = arg;
> - sih->sih_pending = 0;
> + sih->sih_runner = NULL;
> + sih->sih_which = which;
> + sih->sih_flags = flags;
> + sih->sih_state = 0;
> }
> return (sih);
> }
> @@ -157,15 +191,52 @@ softintr_establish(int ipl, void (*func)
> void
> softintr_disestablish(void *arg)
> {
> + struct cpu_info *runner = NULL;
> struct x86_soft_intrhand *sih = arg;
> - struct x86_soft_intr *si = sih->sih_intrhead;
>
> - mtx_enter(&si->softintr_lock);
> - if (sih->sih_pending) {
> - TAILQ_REMOVE(&si->softintr_q, sih, sih_q);
> - sih->sih_pending = 0;
> + assertwaitok();
> +
> + mtx_enter(&softintr_lock);
> + sih->sih_state |= SIS_DYING;
> + if (sih->sih_state & SIS_PENDING) {
> + sih->sih_state &= ~SIS_PENDING;
> + STAILQ_REMOVE(&softintr_queue[sih->sih_which], sih,
> + x86_soft_intrhand, sih_q);
> }
> - mtx_leave(&si->softintr_lock);
> + runner = sih->sih_runner;
> + mtx_leave(&softintr_lock);
> +
> + if (runner != NULL)
> + sched_barrier(runner);
> +
> + KASSERT((sih->sih_state & (SIS_PENDING | SIS_RESTART)) == 0);
> + KASSERT(sih->sih_runner == NULL);
>
> free(sih, M_DEVBUF, sizeof(*sih));
> }
> +
> +void
> +softintr_schedule(void *arg)
> +{
> + struct x86_soft_intrhand *sih = arg;
> + struct x86_soft_intr_queue *queue = &softintr_queue[sih->sih_which];
> + int trigger = 0;
> +
> + mtx_enter(&softintr_lock);
> + KASSERT((sih->sih_state & SIS_DYING) == 0);
The SIS_DYING sanity-check is a great idea.
> + if (sih->sih_runner == NULL) {
> + KASSERT((sih->sih_state & SIS_RESTART) == 0);
> + if ((sih->sih_state & SIS_PENDING) == 0) {
> + STAILQ_INSERT_TAIL(queue, sih, sih_q);
> + sih->sih_state |= SIS_PENDING;
> + trigger = 1;
> + }
> + } else {
> + KASSERT((sih->sih_state & SIS_PENDING) == 0);
> + sih->sih_state |= SIS_RESTART;
> + }
> + mtx_leave(&softintr_lock);
> +
> + if (trigger)
> + softintr(x86_soft_intr_to_ssir[sih->sih_which]);
> +}
This aligns with your description from earlier and looks correct.
> Index: arch/amd64/include/intr.h
> ===================================================================
> RCS file: src/sys/arch/amd64/include/intr.h,v
> retrieving revision 1.32
> diff -u -p -r1.32 intr.h
> --- arch/amd64/include/intr.h 17 Jun 2020 06:14:52 -0000 1.32
> +++ arch/amd64/include/intr.h 23 May 2021 08:56:28 -0000
> @@ -231,42 +231,13 @@ extern void (*ipifunc[X86_NIPI])(struct
> #define X86_NSOFTINTR 3
>
> #ifndef _LOCORE
> -#include <sys/queue.h>
> -
> -struct x86_soft_intrhand {
> - TAILQ_ENTRY(x86_soft_intrhand)
> - sih_q;
> - struct x86_soft_intr *sih_intrhead;
> - void (*sih_fn)(void *);
> - void *sih_arg;
> - int sih_pending;
> -};
> -
> -struct x86_soft_intr {
> - TAILQ_HEAD(, x86_soft_intrhand)
> - softintr_q;
> - int softintr_ssir;
> - struct mutex softintr_lock;
> -};
>
> void *softintr_establish(int, void (*)(void *), void *);
> void softintr_disestablish(void *);
> void softintr_init(void);
> void softintr_dispatch(int);
> +void softintr_schedule(void *);
>
> -#define softintr_schedule(arg)						\
> -do { \
> - struct x86_soft_intrhand *__sih = (arg); \
> - struct x86_soft_intr *__si = __sih->sih_intrhead; \
> - \
> - mtx_enter(&__si->softintr_lock); \
> - if (__sih->sih_pending == 0) { \
> - TAILQ_INSERT_TAIL(&__si->softintr_q, __sih, sih_q); \
> - __sih->sih_pending = 1; \
> - softintr(__si->softintr_ssir); \
> - } \
> - mtx_leave(&__si->softintr_lock); \
> -} while (/*CONSTCOND*/ 0)
Thank you for moving this out of the header.
> #endif /* _LOCORE */
>
> #endif /* !_MACHINE_INTR_H_ */
>