This patch makes the kqread_filtops event filter MP-safe. This event filter is used when a kqueue monitors another kqueue, and now also when poll(2) or select(2) is used with a kqueue file descriptor.
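For reference, both cases can be exercised from userland like this (a
minimal sketch for illustration only, not part of the patch):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <poll.h>
#include <err.h>

int
main(void)
{
	struct kevent kev;
	struct pollfd pfd;
	int inner, outer;

	if ((inner = kqueue()) == -1 || (outer = kqueue()) == -1)
		err(1, "kqueue");

	/* Case 1: one kqueue monitors another via EVFILT_READ. */
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	/* Case 2: poll(2) on a kqueue descriptor uses the same filter. */
	pfd.fd = inner;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, 0) == -1)
		err(1, "poll");

	return (0);
}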
The code uses the knote "object lock" pattern with a twist. The object's
kq_lock is not held when calling KNOTE(&kq->kq_sel.si_note), so as to
avoid lock order issues: knote_activate() needs the kq_lock of the
monitoring kqueue. Consequently, filt_kqueue() has to acquire the
object's kq_lock itself.

As the object lock does not serialize the kqueue's klist, another lock
is required, and the patch adds a system-wide mutex for this. Struct
kqueue could have a separate klist mutex, but that seems somewhat
excessive at the moment because the heaviest part that needs the klist
lock, deferred activation, is delivered through a single thread. (A
hypothetical sketch of the klist pattern follows the diff below.)

OK?

Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.169
diff -u -p -r1.169 kern_event.c
--- kern/kern_event.c	24 Jul 2021 09:16:51 -0000	1.169
+++ kern/kern_event.c	31 Oct 2021 05:55:12 -0000
@@ -128,6 +128,9 @@ void	knote_remove(struct proc *p, struct
 
 void	filt_kqdetach(struct knote *kn);
 int	filt_kqueue(struct knote *kn, long hint);
+int	filt_kqueuemodify(struct kevent *kev, struct knote *kn);
+int	filt_kqueueprocess(struct knote *kn, struct kevent *kev);
+int	filt_kqueue_common(struct knote *kn, struct kqueue *kq);
 int	filt_procattach(struct knote *kn);
 void	filt_procdetach(struct knote *kn);
 int	filt_proc(struct knote *kn, long hint);
@@ -140,10 +143,12 @@ int	filt_timerprocess(struct knote *kn,
 void	filt_seltruedetach(struct knote *kn);
 
 const struct filterops kqread_filtops = {
-	.f_flags	= FILTEROP_ISFD,
+	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach	= NULL,
 	.f_detach	= filt_kqdetach,
 	.f_event	= filt_kqueue,
+	.f_modify	= filt_kqueuemodify,
+	.f_process	= filt_kqueueprocess,
 };
 
 const struct filterops proc_filtops = {
@@ -171,6 +176,7 @@ const struct filterops timer_filtops = {
 
 struct	pool knote_pool;
 struct	pool kqueue_pool;
+struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 
 int kq_ntimeouts = 0;
 int kq_timeoutmax = (4 * 1024);
@@ -219,6 +225,7 @@ KQRELE(struct kqueue *kq)
 	free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
 	    sizeof(struct knlist));
 	hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
+	klist_free(&kq->kq_sel.si_note);
 	pool_put(&kqueue_pool, kq);
 }
 
@@ -254,7 +261,7 @@ kqueue_kqfilter(struct file *fp, struct
 		return (EINVAL);
 
 	kn->kn_fop = &kqread_filtops;
-	klist_insert_locked(&kq->kq_sel.si_note, kn);
+	klist_insert(&kq->kq_sel.si_note, kn);
 
 	return (0);
 }
@@ -263,18 +270,62 @@ filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
-	klist_remove_locked(&kq->kq_sel.si_note, kn);
+	klist_remove(&kq->kq_sel.si_note, kn);
+}
+
+int
+filt_kqueue_common(struct knote *kn, struct kqueue *kq)
+{
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
+	kn->kn_data = kq->kq_count;
+
+	return (kn->kn_data > 0);
 }
 
 int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
 
 	mtx_enter(&kq->kq_lock);
-	kn->kn_data = kq->kq_count;
+	active = filt_kqueue_common(kn, kq);
 	mtx_leave(&kq->kq_lock);
-	return (kn->kn_data > 0);
+
+	return (active);
+}
+
+int
+filt_kqueuemodify(struct kevent *kev, struct knote *kn)
+{
+	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
+
+	mtx_enter(&kq->kq_lock);
+	knote_modify(kev, kn);
+	active = filt_kqueue_common(kn, kq);
+	mtx_leave(&kq->kq_lock);
+
+	return (active);
+}
+
+int
+filt_kqueueprocess(struct knote *kn, struct kevent *kev)
+{
+	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
+
+	mtx_enter(&kq->kq_lock);
+	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
+		active = 1;
+	else
+		active = filt_kqueue_common(kn, kq);
+	if (active)
+		knote_submit(kn, kev);
+	mtx_leave(&kq->kq_lock);
+
+	return (active);
 }
 
 int
@@ -821,6 +872,7 @@ kqueue_alloc(struct filedesc *fdp)
 	TAILQ_INIT(&kq->kq_head);
 	mtx_init(&kq->kq_lock, IPL_HIGH);
 	task_set(&kq->kq_task, kqueue_task, kq);
+	klist_init_mutex(&kq->kq_sel.si_note, &kqueue_klist_lock);
 
 	return (kq);
 }
@@ -1529,6 +1590,7 @@ kqueue_task(void *arg)
 	/* Kernel lock is needed inside selwakeup(). */
 	KERNEL_ASSERT_LOCKED();
 
+	mtx_enter(&kqueue_klist_lock);
 	mtx_enter(&kq->kq_lock);
 	if (kq->kq_state & KQ_SEL) {
 		kq->kq_state &= ~KQ_SEL;
@@ -1538,6 +1600,7 @@ kqueue_task(void *arg)
 		mtx_leave(&kq->kq_lock);
 		KNOTE(&kq->kq_sel.si_note, 0);
 	}
+	mtx_leave(&kqueue_klist_lock);
 	KQRELE(kq);
 }
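For illustration, the klist-with-mutex pattern used above in a
hypothetical subsystem (example_softc, example_klist_lock and the
functions below are made up for this sketch; klist_init_mutex(), KNOTE()
and the mutex calls are the real ones used in the diff):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/event.h>
#include <sys/mutex.h>

struct example_softc {
	struct klist	sc_note;	/* knotes, serialized by the mutex */
};

struct mutex example_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);

void
example_attach(struct example_softc *sc)
{
	/* Bind the klist to the mutex; insert/remove/KNOTE lock it. */
	klist_init_mutex(&sc->sc_note, &example_klist_lock);
}

void
example_wakeup(struct example_softc *sc)
{
	/*
	 * The klist mutex, not the object's own lock, covers the
	 * KNOTE() walk, so knote_activate() can take the monitoring
	 * kqueue's kq_lock without a lock order problem.
	 */
	mtx_enter(&example_klist_lock);
	KNOTE(&sc->sc_note, 0);
	mtx_leave(&example_klist_lock);
}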