This patch makes the kqread_filtops event filter MP-safe. The filter
is used when one kqueue monitors another kqueue, and now also when
poll(2) or select(2) is used with a kqueue file descriptor.
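
For context, a minimal userland sketch (mine, not part of the patch) of
the monitored-kqueue case; the inner kqueue fd is registered in the
outer one with EVFILT_READ, which is backed by kqread_filtops:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>

int
main(void)
{
	struct kevent kev;
	int inner, outer;

	if ((inner = kqueue()) == -1 || (outer = kqueue()) == -1)
		err(1, "kqueue");

	/* Watch the inner kqueue for readability from the outer one. */
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	/* poll(2) or select(2) on "inner" exercises the same filter. */
	return (0);
}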

The code uses the knote "object lock" pattern with a twist: the
object's kq_lock is not held when calling KNOTE(&kq->kq_sel.si_note),
so as to avoid lock order issues, since knote_activate() needs the
kq_lock of the monitoring kqueue. Consequently, filt_kqueue() has to
acquire the object's kq_lock itself.
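
Sketched as a call chain (my annotation, not from the patch):

/*
 * Activation path from the monitored (object) kqueue:
 *
 *	KNOTE(&kq->kq_sel.si_note, 0)	object's kq_lock not held
 *	 -> filt_kqueue()		acquires object's kq_lock
 *	 -> knote_activate()		acquires monitoring kq's kq_lock
 *
 * Holding the object's kq_lock across KNOTE() would nest one kq_lock
 * inside another of the same class, hence the lock order issue.
 */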

As the object lock does not serialize access to the kqueue's klist,
another lock is required. The patch adds a system-wide mutex,
kqueue_klist_lock, for this.
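
Condensed, the pattern the patch uses (excerpted from the diff below):

struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);

	/* In kqueue_alloc(), the klist is bound to the shared mutex, */
	klist_init_mutex(&kq->kq_sel.si_note, &kqueue_klist_lock);

	/*
	 * after which klist_insert()/klist_remove() take the mutex
	 * internally, and kqueue_task() holds it around KNOTE().
	 */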

Each struct kqueue could have its own klist mutex. However, that seems
somewhat excessive at the moment because the heaviest user of the
klist lock, deferred activation via kqueue_task(), is delivered
through a single thread.
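
For comparison, the per-kqueue alternative would look roughly like this
(hypothetical, not in the patch; kq_klist_lock is a made-up field name):

	/* Hypothetical: a klist mutex embedded in each struct kqueue. */
	mtx_init(&kq->kq_klist_lock, IPL_MPFLOOR);
	klist_init_mutex(&kq->kq_sel.si_note, &kq->kq_klist_lock);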

OK?

Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.169
diff -u -p -r1.169 kern_event.c
--- kern/kern_event.c   24 Jul 2021 09:16:51 -0000      1.169
+++ kern/kern_event.c   31 Oct 2021 05:55:12 -0000
@@ -128,6 +128,9 @@ void        knote_remove(struct proc *p, struct
 
 void   filt_kqdetach(struct knote *kn);
 int    filt_kqueue(struct knote *kn, long hint);
+int    filt_kqueuemodify(struct kevent *kev, struct knote *kn);
+int    filt_kqueueprocess(struct knote *kn, struct kevent *kev);
+int    filt_kqueue_common(struct knote *kn, struct kqueue *kq);
 int    filt_procattach(struct knote *kn);
 void   filt_procdetach(struct knote *kn);
 int    filt_proc(struct knote *kn, long hint);
@@ -140,10 +143,12 @@ int       filt_timerprocess(struct knote *kn, 
 void   filt_seltruedetach(struct knote *kn);
 
 const struct filterops kqread_filtops = {
-       .f_flags        = FILTEROP_ISFD,
+       .f_flags        = FILTEROP_ISFD | FILTEROP_MPSAFE,
        .f_attach       = NULL,
        .f_detach       = filt_kqdetach,
        .f_event        = filt_kqueue,
+       .f_modify       = filt_kqueuemodify,
+       .f_process      = filt_kqueueprocess,
 };
 
 const struct filterops proc_filtops = {
@@ -171,6 +176,7 @@ const struct filterops timer_filtops = {
 
 struct pool knote_pool;
 struct pool kqueue_pool;
+struct mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 int kq_ntimeouts = 0;
 int kq_timeoutmax = (4 * 1024);
 
@@ -219,6 +225,7 @@ KQRELE(struct kqueue *kq)
        free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
            sizeof(struct knlist));
        hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
+       klist_free(&kq->kq_sel.si_note);
        pool_put(&kqueue_pool, kq);
 }
 
@@ -254,7 +261,7 @@ kqueue_kqfilter(struct file *fp, struct 
                return (EINVAL);
 
        kn->kn_fop = &kqread_filtops;
-       klist_insert_locked(&kq->kq_sel.si_note, kn);
+       klist_insert(&kq->kq_sel.si_note, kn);
        return (0);
 }
 
@@ -263,18 +270,62 @@ filt_kqdetach(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_fp->f_data;
 
-       klist_remove_locked(&kq->kq_sel.si_note, kn);
+       klist_remove(&kq->kq_sel.si_note, kn);
+}
+
+int
+filt_kqueue_common(struct knote *kn, struct kqueue *kq)
+{
+       MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
+       kn->kn_data = kq->kq_count;
+
+       return (kn->kn_data > 0);
 }
 
 int
 filt_kqueue(struct knote *kn, long hint)
 {
        struct kqueue *kq = kn->kn_fp->f_data;
+       int active;
 
        mtx_enter(&kq->kq_lock);
-       kn->kn_data = kq->kq_count;
+       active = filt_kqueue_common(kn, kq);
        mtx_leave(&kq->kq_lock);
-       return (kn->kn_data > 0);
+
+       return (active);
+}
+
+int
+filt_kqueuemodify(struct kevent *kev, struct knote *kn)
+{
+       struct kqueue *kq = kn->kn_fp->f_data;
+       int active;
+
+       mtx_enter(&kq->kq_lock);
+       knote_modify(kev, kn);
+       active = filt_kqueue_common(kn, kq);
+       mtx_leave(&kq->kq_lock);
+
+       return (active);
+}
+
+int
+filt_kqueueprocess(struct knote *kn, struct kevent *kev)
+{
+       struct kqueue *kq = kn->kn_fp->f_data;
+       int active;
+
+       mtx_enter(&kq->kq_lock);
+       if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
+               active = 1;
+       else
+               active = filt_kqueue_common(kn, kq);
+       if (active)
+               knote_submit(kn, kev);
+       mtx_leave(&kq->kq_lock);
+
+       return (active);
 }
 
 int
@@ -821,6 +872,7 @@ kqueue_alloc(struct filedesc *fdp)
        TAILQ_INIT(&kq->kq_head);
        mtx_init(&kq->kq_lock, IPL_HIGH);
        task_set(&kq->kq_task, kqueue_task, kq);
+       klist_init_mutex(&kq->kq_sel.si_note, &kqueue_klist_lock);
 
        return (kq);
 }
@@ -1529,6 +1590,7 @@ kqueue_task(void *arg)
        /* Kernel lock is needed inside selwakeup(). */
        KERNEL_ASSERT_LOCKED();
 
+       mtx_enter(&kqueue_klist_lock);
        mtx_enter(&kq->kq_lock);
        if (kq->kq_state & KQ_SEL) {
                kq->kq_state &= ~KQ_SEL;
@@ -1538,6 +1600,7 @@ kqueue_task(void *arg)
                mtx_leave(&kq->kq_lock);
                KNOTE(&kq->kq_sel.si_note, 0);
        }
+       mtx_leave(&kqueue_klist_lock);
        KQRELE(kq);
 }
 
