This patch adds a mutex that serializes access to a kqueue. As a result,
most of kqueue's internals should become safe to run without the kernel
lock. In principle, the patch should allow unlocking kevent(2).

Some notes:

* The existing uses of splhigh() outline where the mutex should be held.

* The code is a true entanglement of lock operations. There are many
  spots where lock usage is far from optimal. The patch does not attempt
  to fix them, so as to keep the changeset relatively small.

* As msleep() with PCATCH requires the kernel lock, kqueue_scan() locks
  the kernel for the section that might sleep. The lock is released
  before the actual scan of events. An opportunistic implementation
  could do a precheck to determine whether the scan can start right
  away; a rough sketch of the idea follows, though it is not part of
  the diff.
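
  A minimal version of that precheck might look like this (sketch only;
  the name have_events is local to the example, and the locked,
  sleeping path would stay as in the diff):

        mtx_enter(&kq->kq_lock);
        have_events = (kq->kq_count > 0);
        mtx_leave(&kq->kq_lock);
        if (!have_events) {
                /* Only the path that may sleep needs the kernel lock. */
                KERNEL_LOCK();
        }
        /*
         * If the queue drains between this check and the scan, the
         * existing retry loop still ends up on the locked, sleeping path.
         */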

* knote_acquire() has a gap where it might miss a wakeup. This is an
  unlikely situation that may arise with klist_invalidate(). It should
  not happen during normal operation, and the code should recover thanks
  to the one-second timeout. The loss of a wakeup could be avoided with
  serial numbering, for example, as sketched below.
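
  A rough sketch of the serial-number idea, using a hypothetical counter
  that the waker increments under the same mutex before calling
  wakeup():

        /* Waker, with the mutex held: */
        obj->wakeup_serial++;
        wakeup(obj);

        /* Sleeper: */
        mtx_enter(&mtx);
        serial = obj->wakeup_serial;
        mtx_leave(&mtx);
        /* ... window where a plain wakeup could otherwise be lost ... */
        mtx_enter(&mtx);
        while (obj->wakeup_serial == serial)
                msleep_nsec(obj, &mtx, 0, "wait", INFSLP);
        mtx_leave(&mtx);

  With the race closed this way, the one-second timeout would no longer
  be needed for correctness.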

* The timeout in knote_acquire() makes the function try-lock-like, which
  is essential in klist_invalidate(). The normal order of operations is
  that knote_acquire() comes before klist_lock(). klist_invalidate() has
  to violate this order; the timeout, combined with retrying, prevents
  the system from deadlocking, as shown below.
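
  Condensed from the diff below, the resulting pattern in
  klist_invalidate() is:

        ls = klist_lock(list);          /* klist lock is taken first here */
        while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
                kq = kn->kn_kq;
                mtx_enter(&kq->kq_lock);
                if (!knote_acquire(kn, list, ls)) {
                        /* Timed out or raced; kq_lock and the klist
                         * lock have been dropped. */
                        ls = klist_lock(list);
                        continue;       /* retry instead of deadlocking */
                }
                ...
        }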

* At the moment, all event sources still require the kernel lock.
  kqueue will lock the kernel when it invokes the filterops callbacks
  if FILTEROP_MPSAFE is not set.
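
  For a filter without FILTEROP_MPSAFE, the callback is wrapped with the
  kernel lock, roughly like this (sketch of the pattern used by the
  filter_*() helpers in kern_event.c):

        int
        filter_attach(struct knote *kn)
        {
                int error;

                if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
                        error = kn->kn_fop->f_attach(kn);
                } else {
                        /* Not MP safe; call it under the kernel lock. */
                        KERNEL_LOCK();
                        error = kn->kn_fop->f_attach(kn);
                        KERNEL_UNLOCK();
                }
                return (error);
        }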


Please test!


Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_event.c
--- kern/kern_event.c   22 Apr 2021 15:30:12 -0000      1.163
+++ kern/kern_event.c   20 May 2021 13:45:32 -0000
@@ -124,7 +124,8 @@ void        knote_dequeue(struct knote *kn);
 int    knote_acquire(struct knote *kn, struct klist *, int);
 void   knote_release(struct knote *kn);
 void   knote_activate(struct knote *kn);
-void   knote_remove(struct proc *p, struct knlist *list, int purge);
+void   knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list,
+           int purge);
 
 void   filt_kqdetach(struct knote *kn);
 int    filt_kqueue(struct knote *kn, long hint);
@@ -265,7 +266,7 @@ filt_kqueue(struct knote *kn, long hint)
 {
        struct kqueue *kq = kn->kn_fp->f_data;
 
-       kn->kn_data = kq->kq_count;
+       kn->kn_data = kq->kq_count;     /* unlocked read */
        return (kn->kn_data > 0);
 }
 
@@ -739,28 +740,31 @@ kqpoll_dequeue(struct proc *p)
 {
        struct knote *kn;
        struct kqueue *kq = p->p_kq;
-       int s;
 
-       s = splhigh();
+       mtx_enter(&kq->kq_lock);
        while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
                /* This kqueue should not be scanned by other threads. */
                KASSERT(kn->kn_filter != EVFILT_MARKER);
 
-               if (!knote_acquire(kn, NULL, 0))
+               if (!knote_acquire(kn, NULL, 0)) {
+                       /* knote_acquire() has released kq_lock. */
+                       mtx_enter(&kq->kq_lock);
                        continue;
+               }
 
                kqueue_check(kq);
                TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
                kn->kn_status &= ~KN_QUEUED;
                kq->kq_count--;
+               mtx_leave(&kq->kq_lock);
 
-               splx(s);
-               kn->kn_fop->f_detach(kn);
+               filter_detach(kn);
                knote_drop(kn, p);
-               s = splhigh();
+
+               mtx_enter(&kq->kq_lock);
                kqueue_check(kq);
        }
-       splx(s);
+       mtx_leave(&kq->kq_lock);
 }
 
 struct kqueue *
@@ -772,6 +776,7 @@ kqueue_alloc(struct filedesc *fdp)
        kq->kq_refs = 1;
        kq->kq_fdp = fdp;
        TAILQ_INIT(&kq->kq_head);
+       mtx_init(&kq->kq_lock, IPL_HIGH);
        task_set(&kq->kq_task, kqueue_task, kq);
 
        return (kq);
@@ -933,8 +938,7 @@ kqueue_do_check(struct kqueue *kq, const
        struct knote *kn;
        int count = 0, nmarker = 0;
 
-       KERNEL_ASSERT_LOCKED();
-       splassert(IPL_HIGH);
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
        TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
                if (kn->kn_filter == EVFILT_MARKER) {
@@ -973,7 +977,7 @@ kqueue_register(struct kqueue *kq, struc
        struct file *fp = NULL;
        struct knote *kn = NULL, *newkn = NULL;
        struct knlist *list = NULL;
-       int s, error = 0;
+       int error = 0;
 
        if (kev->filter < 0) {
                if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -1005,11 +1009,13 @@ again:
                        error = EBADF;
                        goto done;
                }
+               mtx_enter(&kq->kq_lock);
                if (kev->flags & EV_ADD)
                        kqueue_expand_list(kq, kev->ident);
                if (kev->ident < kq->kq_knlistsize)
                        list = &kq->kq_knlist[kev->ident];
        } else {
+               mtx_enter(&kq->kq_lock);
                if (kev->flags & EV_ADD)
                        kqueue_expand_hash(kq);
                if (kq->kq_knhashmask != 0) {
@@ -1021,16 +1027,15 @@ again:
                SLIST_FOREACH(kn, list, kn_link) {
                        if (kev->filter == kn->kn_filter &&
                            kev->ident == kn->kn_id) {
-                               s = splhigh();
                                if (!knote_acquire(kn, NULL, 0)) {
-                                       splx(s);
+                                       /* knote_acquire() has released
+                                        * kq_lock. */
                                        if (fp != NULL) {
                                                FRELE(fp, p);
                                                fp = NULL;
                                        }
                                        goto again;
                                }
-                               splx(s);
                                break;
                        }
                }
@@ -1038,14 +1043,13 @@ again:
        KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);
 
        if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+               mtx_leave(&kq->kq_lock);
                error = ENOENT;
                goto done;
        }
 
        /*
         * kn now contains the matching knote, or NULL if no match.
-        * If adding a new knote, sleeping is not allowed until the knote
-        * has been inserted.
         */
        if (kev->flags & EV_ADD) {
                if (kn == NULL) {
@@ -1069,6 +1073,8 @@ again:
                        kn->kn_kevent = *kev;
 
                        knote_attach(kn);
+                       mtx_leave(&kq->kq_lock);
+
                        error = filter_attach(kn);
                        if (error != 0) {
                                knote_drop(kn, p);
@@ -1097,45 +1103,47 @@ again:
                        /* Check if there is a pending event. */
                        if (filter_process(kn, NULL))
                                knote_activate(kn);
+
+                       mtx_enter(&kq->kq_lock);
                } else {
                        /*
                         * The user may change some filter values after the
                         * initial EV_ADD, but doing so will not reset any
                         * filters which have already been triggered.
                         */
+                       mtx_leave(&kq->kq_lock);
                        if (filter_modify(kev, kn))
                                knote_activate(kn);
+                       mtx_enter(&kq->kq_lock);
                        if (kev->flags & EV_ERROR) {
                                error = kev->data;
                                goto release;
                        }
                }
        } else if (kev->flags & EV_DELETE) {
+               mtx_leave(&kq->kq_lock);
                filter_detach(kn);
                knote_drop(kn, p);
                goto done;
        }
 
-       if ((kev->flags & EV_DISABLE) &&
-           ((kn->kn_status & KN_DISABLED) == 0)) {
-               s = splhigh();
+       if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
                kn->kn_status |= KN_DISABLED;
-               splx(s);
-       }
 
        if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-               s = splhigh();
                kn->kn_status &= ~KN_DISABLED;
-               splx(s);
+               mtx_leave(&kq->kq_lock);
+
                /* Check if there is a pending event. */
                if (filter_process(kn, NULL))
                        knote_activate(kn);
+
+               mtx_enter(&kq->kq_lock);
        }
 
 release:
-       s = splhigh();
        knote_release(kn);
-       splx(s);
+       mtx_leave(&kq->kq_lock);
 done:
        if (fp != NULL)
                FRELE(fp, p);
@@ -1151,14 +1159,15 @@ kqueue_sleep(struct kqueue *kq, struct t
        uint64_t nsecs;
        int error;
 
-       splassert(IPL_HIGH);
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
        if (tsp != NULL) {
                getnanouptime(&start);
                nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
        } else
                nsecs = INFSLP;
-       error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
+       error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
+           "kqread", nsecs);
        if (tsp != NULL) {
                getnanouptime(&stop);
                timespecsub(&stop, &start, &elapsed);
@@ -1181,7 +1190,7 @@ kqueue_scan(struct kqueue_scan_state *sc
 {
        struct kqueue *kq = scan->kqs_kq;
        struct knote *kn;
-       int s, error = 0, nkev = 0;
+       int error = 0, nkev = 0;
 
        if (maxevents == 0)
                goto done;
@@ -1190,12 +1199,18 @@ retry:
 
        error = 0;
 
+       /* msleep() with PCATCH requires kernel lock. */
+       KERNEL_LOCK();
+
+       mtx_enter(&kq->kq_lock);
+
        if (kq->kq_state & KQ_DYING) {
+               mtx_leave(&kq->kq_lock);
+               KERNEL_UNLOCK();
                error = EBADF;
                goto done;
        }
 
-       s = splhigh();
        if (kq->kq_count == 0) {
                /*
                 * Successive loops are only necessary if there are more
@@ -1203,13 +1218,15 @@ retry:
                 */
                if ((tsp != NULL && !timespecisset(tsp)) ||
                    scan->kqs_nevent != 0) {
-                       splx(s);
+                       mtx_leave(&kq->kq_lock);
+                       KERNEL_UNLOCK();
                        error = 0;
                        goto done;
                }
                kq->kq_state |= KQ_SLEEP;
                error = kqueue_sleep(kq, tsp);
-               splx(s);
+               /* kqueue_sleep() has released kq_lock. */
+               KERNEL_UNLOCK();
                if (error == 0 || error == EWOULDBLOCK)
                        goto retry;
                /* don't restart after signals... */
@@ -1218,6 +1235,9 @@ retry:
                goto done;
        }
 
+       /* The actual scan does not sleep on kq, so unlock the kernel. */
+       KERNEL_UNLOCK();
+
        /*
         * Put the end marker in the queue to limit the scan to the events
         * that are currently active.  This prevents events from being
@@ -1249,8 +1269,11 @@ retry:
                        continue;
                }
 
-               if (!knote_acquire(kn, NULL, 0))
+               if (!knote_acquire(kn, NULL, 0)) {
+                       /* knote_acquire() has released kq_lock. */
+                       mtx_enter(&kq->kq_lock);
                        continue;
+               }
 
                kqueue_check(kq);
                TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
@@ -1263,11 +1286,11 @@ retry:
                        continue;
                }
 
-               splx(s);
+               mtx_leave(&kq->kq_lock);
 
                memset(kevp, 0, sizeof(*kevp));
                if (filter_process(kn, kevp) == 0) {
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                        if ((kn->kn_status & KN_QUEUED) == 0)
                                kn->kn_status &= ~KN_ACTIVE;
                        knote_release(kn);
@@ -1281,9 +1304,9 @@ retry:
                if (kevp->flags & EV_ONESHOT) {
                        filter_detach(kn);
                        knote_drop(kn, p);
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                } else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                        if (kevp->flags & EV_DISPATCH)
                                kn->kn_status |= KN_DISABLED;
                        if ((kn->kn_status & KN_QUEUED) == 0)
@@ -1291,7 +1314,7 @@ retry:
                        KASSERT(kn->kn_status & KN_ATTACHED);
                        knote_release(kn);
                } else {
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                        if ((kn->kn_status & KN_QUEUED) == 0) {
                                kqueue_check(kq);
                                kq->kq_count++;
@@ -1308,7 +1331,7 @@ retry:
                scan->kqs_nevent++;
        }
        TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
-       splx(s);
+       mtx_leave(&kq->kq_lock);
        if (scan->kqs_nevent == 0)
                goto retry;
 done:
@@ -1333,7 +1356,6 @@ void
 kqueue_scan_finish(struct kqueue_scan_state *scan)
 {
        struct kqueue *kq = scan->kqs_kq;
-       int s;
 
        KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
        KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
@@ -1342,9 +1364,9 @@ kqueue_scan_finish(struct kqueue_scan_st
 
        if (scan->kqs_queued) {
                scan->kqs_queued = 0;
-               s = splhigh();
+               mtx_enter(&kq->kq_lock);
                TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
-               splx(s);
+               mtx_leave(&kq->kq_lock);
        }
        KQRELE(kq);
 }
@@ -1376,17 +1398,17 @@ kqueue_poll(struct file *fp, int events,
 {
        struct kqueue *kq = (struct kqueue *)fp->f_data;
        int revents = 0;
-       int s = splhigh();
 
        if (events & (POLLIN | POLLRDNORM)) {
+               mtx_enter(&kq->kq_lock);
                if (kq->kq_count) {
                        revents |= events & (POLLIN | POLLRDNORM);
                } else {
                        selrecord(p, &kq->kq_sel);
                        kq->kq_state |= KQ_SEL;
                }
+               mtx_leave(&kq->kq_lock);
        }
-       splx(s);
        return (revents);
 }
 
@@ -1396,7 +1418,7 @@ kqueue_stat(struct file *fp, struct stat
        struct kqueue *kq = fp->f_data;
 
        memset(st, 0, sizeof(*st));
-       st->st_size = kq->kq_count;
+       st->st_size = kq->kq_count;     /* unlocked read */
        st->st_blksize = sizeof(struct kevent);
        st->st_mode = S_IFIFO;
        return (0);
@@ -1407,14 +1429,14 @@ kqueue_purge(struct proc *p, struct kque
 {
        int i;
 
-       KERNEL_ASSERT_LOCKED();
-
+       mtx_enter(&kq->kq_lock);
        for (i = 0; i < kq->kq_knlistsize; i++)
-               knote_remove(p, &kq->kq_knlist[i], 1);
+               knote_remove(p, kq, &kq->kq_knlist[i], 1);
        if (kq->kq_knhashmask != 0) {
                for (i = 0; i < kq->kq_knhashmask + 1; i++)
-                       knote_remove(p, &kq->kq_knhash[i], 1);
+                       knote_remove(p, kq, &kq->kq_knhash[i], 1);
        }
+       mtx_leave(&kq->kq_lock);
 }
 
 void
@@ -1422,6 +1444,8 @@ kqueue_terminate(struct proc *p, struct 
 {
        struct knote *kn;
 
+       mtx_enter(&kq->kq_lock);
+
        /*
         * Any remaining entries should be scan markers.
         * They are removed when the ongoing scans finish.
@@ -1432,6 +1456,7 @@ kqueue_terminate(struct proc *p, struct 
 
        kq->kq_state |= KQ_DYING;
        kqueue_wakeup(kq);
+       mtx_leave(&kq->kq_lock);
 
        KASSERT(klist_empty(&kq->kq_sel.si_note));
        task_del(systq, &kq->kq_task);
@@ -1443,15 +1468,13 @@ kqueue_close(struct file *fp, struct pro
 {
        struct kqueue *kq = fp->f_data;
 
-       KERNEL_LOCK();
+       fp->f_data = NULL;
+
        kqueue_purge(p, kq);
        kqueue_terminate(p, kq);
-       fp->f_data = NULL;
 
        KQRELE(kq);
 
-       KERNEL_UNLOCK();
-
        return (0);
 }
 
@@ -1460,10 +1483,16 @@ kqueue_task(void *arg)
 {
        struct kqueue *kq = arg;
 
+       /* Kernel lock is needed inside selwakeup(). */
+       KERNEL_ASSERT_LOCKED();
+
+       mtx_enter(&kq->kq_lock);
        if (kq->kq_state & KQ_SEL) {
                kq->kq_state &= ~KQ_SEL;
+               mtx_leave(&kq->kq_lock);
                selwakeup(&kq->kq_sel);
        } else {
+               mtx_leave(&kq->kq_lock);
                KNOTE(&kq->kq_sel.si_note, 0);
        }
        KQRELE(kq);
@@ -1472,6 +1501,7 @@ kqueue_task(void *arg)
 void
 kqueue_wakeup(struct kqueue *kq)
 {
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
        if (kq->kq_state & KQ_SLEEP) {
                kq->kq_state &= ~KQ_SLEEP;
@@ -1491,14 +1521,20 @@ kqueue_expand_hash(struct kqueue *kq)
        struct knlist *hash;
        u_long hashmask;
 
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
        if (kq->kq_knhashmask == 0) {
+               mtx_leave(&kq->kq_lock);
                hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
+               mtx_enter(&kq->kq_lock);
                if (kq->kq_knhashmask == 0) {
                        kq->kq_knhash = hash;
                        kq->kq_knhashmask = hashmask;
                } else {
                        /* Another thread has allocated the hash. */
+                       mtx_leave(&kq->kq_lock);
                        hashfree(hash, KN_HASHSIZE, M_KEVENT);
+                       mtx_enter(&kq->kq_lock);
                }
        }
 }
@@ -1506,26 +1542,35 @@ kqueue_expand_hash(struct kqueue *kq)
 static void
 kqueue_expand_list(struct kqueue *kq, int fd)
 {
-       struct knlist *list;
-       int size;
+       struct knlist *list, *olist;
+       int size, osize;
+
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
        if (kq->kq_knlistsize <= fd) {
                size = kq->kq_knlistsize;
+               mtx_leave(&kq->kq_lock);
                while (size <= fd)
                        size += KQEXTENT;
                list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
+               mtx_enter(&kq->kq_lock);
                if (kq->kq_knlistsize <= fd) {
                        memcpy(list, kq->kq_knlist,
                            kq->kq_knlistsize * sizeof(*list));
                        memset(&list[kq->kq_knlistsize], 0,
                            (size - kq->kq_knlistsize) * sizeof(*list));
-                       free(kq->kq_knlist, M_KEVENT,
-                           kq->kq_knlistsize * sizeof(*list));
+                       olist = kq->kq_knlist;
+                       osize = kq->kq_knlistsize;
                        kq->kq_knlist = list;
                        kq->kq_knlistsize = size;
+                       mtx_leave(&kq->kq_lock);
+                       free(olist, M_KEVENT, osize * sizeof(*list));
+                       mtx_enter(&kq->kq_lock);
                } else {
                        /* Another thread has expanded the list. */
+                       mtx_leave(&kq->kq_lock);
                        free(list, M_KEVENT, size * sizeof(*list));
+                       mtx_enter(&kq->kq_lock);
                }
        }
 }
@@ -1543,14 +1588,22 @@ kqueue_expand_list(struct kqueue *kq, in
 int
 knote_acquire(struct knote *kn, struct klist *klist, int ls)
 {
-       splassert(IPL_HIGH);
+       struct kqueue *kq = kn->kn_kq;
+
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_filter != EVFILT_MARKER);
 
        if (kn->kn_status & KN_PROCESSING) {
                kn->kn_status |= KN_WAITING;
-               if (klist != NULL)
+               if (klist != NULL) {
+                       mtx_leave(&kq->kq_lock);
                        klist_unlock(klist, ls);
-               tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+                       /* XXX Timeout resolves potential loss of wakeup. */
+                       tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+               } else {
+                       msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
+                           SEC_TO_NSEC(1));
+               }
                /* knote may be stale now */
                return (0);
        }
@@ -1564,7 +1617,9 @@ knote_acquire(struct knote *kn, struct k
 void
 knote_release(struct knote *kn)
 {
-       splassert(IPL_HIGH);
+       struct kqueue *kq = kn->kn_kq;
+
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_filter != EVFILT_MARKER);
        KASSERT(kn->kn_status & KN_PROCESSING);
 
@@ -1582,13 +1637,13 @@ knote_release(struct knote *kn)
 void
 knote_activate(struct knote *kn)
 {
-       int s;
+       struct kqueue *kq = kn->kn_kq;
 
-       s = splhigh();
+       mtx_enter(&kq->kq_lock);
        kn->kn_status |= KN_ACTIVE;
        if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
                knote_enqueue(kn);
-       splx(s);
+       mtx_leave(&kq->kq_lock);
 }
 
 /*
@@ -1610,18 +1665,20 @@ knote(struct klist *list, long hint)
  * remove all knotes from a specified knlist
  */
 void
-knote_remove(struct proc *p, struct knlist *list, int purge)
+knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list, int purge)
 {
        struct knote *kn;
-       int s;
+
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
        while ((kn = SLIST_FIRST(list)) != NULL) {
-               s = splhigh();
+               KASSERT(kn->kn_kq == kq);
                if (!knote_acquire(kn, NULL, 0)) {
-                       splx(s);
+                       /* knote_acquire() has released kq_lock. */
+                       mtx_enter(&kq->kq_lock);
                        continue;
                }
-               splx(s);
+               mtx_leave(&kq->kq_lock);
                filter_detach(kn);
 
                /*
@@ -1636,20 +1693,22 @@ knote_remove(struct proc *p, struct knli
                 */
                if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
                        KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
+                       mtx_enter(&kq->kq_lock);
                        knote_detach(kn);
+                       mtx_leave(&kq->kq_lock);
                        FRELE(kn->kn_fp, p);
                        kn->kn_fp = NULL;
 
                        kn->kn_fop = &badfd_filtops;
                        filter_event(kn, 0);
                        knote_activate(kn);
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                        knote_release(kn);
-                       splx(s);
                        continue;
                }
 
                knote_drop(kn, p);
+               mtx_enter(&kq->kq_lock);
        }
 }
 
@@ -1661,7 +1720,6 @@ knote_fdclose(struct proc *p, int fd)
 {
        struct filedesc *fdp = p->p_p->ps_fd;
        struct kqueue *kq;
-       struct knlist *list;
 
        /*
         * fdplock can be ignored if the file descriptor table is being freed
@@ -1670,18 +1728,12 @@ knote_fdclose(struct proc *p, int fd)
        if (fdp->fd_refcnt != 0)
                fdpassertlocked(fdp);
 
-       if (LIST_EMPTY(&fdp->fd_kqlist))
-               return;
-
-       KERNEL_LOCK();
        LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
-               if (fd >= kq->kq_knlistsize)
-                       continue;
-
-               list = &kq->kq_knlist[fd];
-               knote_remove(p, list, 0);
+               mtx_enter(&kq->kq_lock);
+               if (fd < kq->kq_knlistsize)
+                       knote_remove(p, kq, &kq->kq_knlist[fd], 0);
+               mtx_leave(&kq->kq_lock);
        }
-       KERNEL_UNLOCK();
 }
 
 /*
@@ -1693,6 +1745,7 @@ knote_processexit(struct proc *p)
 {
        struct process *pr = p->p_p;
 
+       KERNEL_ASSERT_LOCKED();
        KASSERT(p == curproc);
 
        KNOTE(&pr->ps_klist, NOTE_EXIT);
@@ -1706,15 +1759,12 @@ knote_attach(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
        struct knlist *list;
-       int s;
 
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_status & KN_PROCESSING);
        KASSERT((kn->kn_status & KN_ATTACHED) == 0);
 
-       s = splhigh();
        kn->kn_status |= KN_ATTACHED;
-       splx(s);
-
        if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                KASSERT(kq->kq_knlistsize > kn->kn_id);
                list = &kq->kq_knlist[kn->kn_id];
@@ -1730,8 +1780,8 @@ knote_detach(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
        struct knlist *list;
-       int s;
 
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_status & KN_PROCESSING);
 
        if ((kn->kn_status & KN_ATTACHED) == 0)
@@ -1742,10 +1792,7 @@ knote_detach(struct knote *kn)
        else
                list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
        SLIST_REMOVE(list, kn, knote, kn_link);
-
-       s = splhigh();
        kn->kn_status &= ~KN_ATTACHED;
-       splx(s);
 }
 
 /*
@@ -1755,20 +1802,20 @@ knote_detach(struct knote *kn)
 void
 knote_drop(struct knote *kn, struct proc *p)
 {
-       int s;
+       struct kqueue *kq = kn->kn_kq;
 
        KASSERT(kn->kn_filter != EVFILT_MARKER);
 
+       mtx_enter(&kq->kq_lock);
        knote_detach(kn);
-
-       s = splhigh();
        if (kn->kn_status & KN_QUEUED)
                knote_dequeue(kn);
        if (kn->kn_status & KN_WAITING) {
                kn->kn_status &= ~KN_WAITING;
                wakeup(kn);
        }
-       splx(s);
+       mtx_leave(&kq->kq_lock);
+
        if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
                FRELE(kn->kn_fp, p);
        pool_put(&knote_pool, kn);
@@ -1780,7 +1827,7 @@ knote_enqueue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
-       splassert(IPL_HIGH);
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_filter != EVFILT_MARKER);
        KASSERT((kn->kn_status & KN_QUEUED) == 0);
 
@@ -1797,7 +1844,7 @@ knote_dequeue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
 
-       splassert(IPL_HIGH);
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
        KASSERT(kn->kn_filter != EVFILT_MARKER);
        KASSERT(kn->kn_status & KN_QUEUED);
 
@@ -1905,36 +1952,38 @@ void
 klist_invalidate(struct klist *list)
 {
        struct knote *kn;
+       struct kqueue *kq;
        struct proc *p = curproc;
-       int ls, s;
+       int ls;
 
        NET_ASSERT_UNLOCKED();
 
-       s = splhigh();
        ls = klist_lock(list);
        while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
+               kq = kn->kn_kq;
+               mtx_enter(&kq->kq_lock);
                if (!knote_acquire(kn, list, ls)) {
-                       /* knote_acquire() has unlocked list. */
+                       /* knote_acquire() has released kq_lock
+                        * and klist lock. */
                        ls = klist_lock(list);
                        continue;
                }
+               mtx_leave(&kq->kq_lock);
                klist_unlock(list, ls);
-               splx(s);
                filter_detach(kn);
                if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
                        kn->kn_fop = &dead_filtops;
                        filter_event(kn, 0);
                        knote_activate(kn);
-                       s = splhigh();
+                       mtx_enter(&kq->kq_lock);
                        knote_release(kn);
+                       mtx_leave(&kq->kq_lock);
                } else {
                        knote_drop(kn, p);
-                       s = splhigh();
                }
                ls = klist_lock(list);
        }
        klist_unlock(list, ls);
-       splx(s);
 }
 
 static int
Index: sys/eventvar.h
===================================================================
RCS file: src/sys/sys/eventvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 eventvar.h
--- sys/eventvar.h      17 Jan 2021 05:56:32 -0000      1.11
+++ sys/eventvar.h      20 May 2021 13:45:32 -0000
@@ -31,6 +31,7 @@
 #ifndef _SYS_EVENTVAR_H_
 #define _SYS_EVENTVAR_H_
 
+#include <sys/mutex.h>
 #include <sys/task.h>
 
 #define KQ_NEVENTS     8               /* minimize copy{in,out} calls */
@@ -38,24 +39,29 @@
 
 /*
  * Locking:
+ *     I       immutable after creation
  *     a       atomic operations
+ *     q       kq_lock
  */
 struct kqueue {
-       TAILQ_HEAD(, knote) kq_head;            /* list of pending event */
-       int             kq_count;               /* number of pending events */
-       u_int           kq_refs;                /* [a] number of references */
+       struct          mutex kq_lock;          /* lock for queue access */
+       TAILQ_HEAD(, knote) kq_head;            /* [q] list of pending event */
+       int             kq_count;               /* [q] # of pending events */
+       u_int           kq_refs;                /* [a] # of references */
        struct          selinfo kq_sel;
-       struct          filedesc *kq_fdp;
+       struct          filedesc *kq_fdp;       /* [I] fd table of this kq */
 
        LIST_ENTRY(kqueue) kq_next;
 
-       int             kq_knlistsize;          /* size of kq_knlist */
-       struct          knlist *kq_knlist;      /* list of attached knotes */
-       u_long          kq_knhashmask;          /* size of kq_knhash */
-       struct          knlist *kq_knhash;      /* hash table for attached knotes */
+       int             kq_knlistsize;          /* [q] size of kq_knlist */
+       struct          knlist *kq_knlist;      /* [q] list of
+                                                *     attached knotes */
+       u_long          kq_knhashmask;          /* [q] size of kq_knhash */
+       struct          knlist *kq_knhash;      /* [q] hash table for
+                                                *     attached knotes */
        struct          task kq_task;           /* deferring of activation */
 
-       int             kq_state;
+       int             kq_state;               /* [q] */
 #define KQ_SEL         0x01
 #define KQ_SLEEP       0x02
 #define KQ_DYING       0x04

Reply via email to