This patch makes the kqread_filtops event filter MP-safe. The filter
is used when a kqueue monitors another kqueue, and now also when
poll(2) or select(2) is used with a kqueue file descriptor.
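
For reference, the kqueue-monitoring case that this filter serves can
be exercised from userland with a sketch like the following
(illustrative only, not part of the patch):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int inner, outer, pfd[2];

	if (pipe(pfd) == -1)
		err(1, "pipe");
	if ((inner = kqueue()) == -1 || (outer = kqueue()) == -1)
		err(1, "kqueue");

	/* Let the inner kqueue watch the pipe's read end. */
	EV_SET(&kev, pfd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(inner, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	/*
	 * Monitor the inner kqueue with the outer one.  This
	 * attaches a knote through kqread_filtops.
	 */
	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent");

	if (write(pfd[1], "x", 1) == -1)
		err(1, "write");

	/* The outer kqueue reports the inner one as readable. */
	if (kevent(outer, NULL, 0, &kev, 1, NULL) == 1)
		printf("inner kqueue has %lld pending event(s)\n",
		    (long long)kev.data);

	return (0);
}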
The code uses the knote "object lock" pattern with a twist: the
object's kq_lock is not held when calling KNOTE(&kq->kq_sel.si_note),
so as to avoid a lock order problem, since knote_activate() needs the
kq_lock of the monitoring kqueue. Consequently, filt_kqueue() has to
acquire the object's kq_lock itself.
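
To make the ordering concrete, the wakeup side of the monitored
kqueue ends up doing roughly this (condensed from the kqueue_task()
change in the diff below):

	mtx_enter(&kq->kq_lock);
	/* ... update the monitored kqueue's state ... */
	mtx_leave(&kq->kq_lock);

	/*
	 * KNOTE() may end up in knote_activate(), which takes the
	 * monitoring kqueue's kq_lock.  Calling it with this
	 * kqueue's kq_lock still held could invert the lock order
	 * when two kqueues monitor each other.
	 */
	KNOTE(&kq->kq_sel.si_note, 0);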
As the object lock does not serialize access to the kqueue's klist,
another lock is required. The patch adds a system-wide mutex,
kqueue_klist_lock, for this.

Struct kqueue could have a separate klist mutex of its own. However,
that seems somewhat excessive at the moment because the heaviest part
that needs the klist lock, deferred activation through kqueue_task(),
is delivered through a single thread.
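
For comparison, the per-kqueue variant set aside here would amount to
something like this in kqueue_alloc() (hypothetical sketch;
kq_klist_lock is not an existing field):

	/* Hypothetical alternative, not part of this patch. */
	mtx_init(&kq->kq_klist_lock, IPL_MPFLOOR);
	klist_init_mutex(&kq->kq_sel.si_note, &kq->kq_klist_lock);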
OK?

Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.169
diff -u -p -r1.169 kern_event.c
--- kern/kern_event.c	24 Jul 2021 09:16:51 -0000	1.169
+++ kern/kern_event.c	31 Oct 2021 05:55:12 -0000
@@ -128,6 +128,9 @@ void	knote_remove(struct proc *p, struct
 
 void	filt_kqdetach(struct knote *kn);
 int	filt_kqueue(struct knote *kn, long hint);
+int	filt_kqueuemodify(struct kevent *kev, struct knote *kn);
+int	filt_kqueueprocess(struct knote *kn, struct kevent *kev);
+int	filt_kqueue_common(struct knote *kn, struct kqueue *kq);
 int	filt_procattach(struct knote *kn);
 void	filt_procdetach(struct knote *kn);
 int	filt_proc(struct knote *kn, long hint);
@@ -140,10 +143,12 @@ int	filt_timerprocess(struct knote *kn,
 void	filt_seltruedetach(struct knote *kn);
 
 const struct filterops kqread_filtops = {
-	.f_flags	= FILTEROP_ISFD,
+	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach	= NULL,
 	.f_detach	= filt_kqdetach,
 	.f_event	= filt_kqueue,
+	.f_modify	= filt_kqueuemodify,
+	.f_process	= filt_kqueueprocess,
 };
 
 const struct filterops proc_filtops = {
@@ -171,6 +176,7 @@ const struct filterops timer_filtops = {
 
 struct	pool knote_pool;
 struct	pool kqueue_pool;
+struct	mutex kqueue_klist_lock = MUTEX_INITIALIZER(IPL_MPFLOOR);
 int kq_ntimeouts = 0;
 int kq_timeoutmax = (4 * 1024);
 
@@ -219,6 +225,7 @@ KQRELE(struct kqueue *kq)
 	free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
 	    sizeof(struct knlist));
 	hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
+	klist_free(&kq->kq_sel.si_note);
 	pool_put(&kqueue_pool, kq);
 }
 
@@ -254,7 +261,7 @@ kqueue_kqfilter(struct file *fp, struct
 		return (EINVAL);
 
 	kn->kn_fop = &kqread_filtops;
-	klist_insert_locked(&kq->kq_sel.si_note, kn);
+	klist_insert(&kq->kq_sel.si_note, kn);
 
 	return (0);
 }
@@ -263,18 +270,62 @@ filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
-	klist_remove_locked(&kq->kq_sel.si_note, kn);
+	klist_remove(&kq->kq_sel.si_note, kn);
+}
+
+int
+filt_kqueue_common(struct knote *kn, struct kqueue *kq)
+{
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
+	kn->kn_data = kq->kq_count;
+
+	return (kn->kn_data > 0);
 }
 
 int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
 
 	mtx_enter(&kq->kq_lock);
-	kn->kn_data = kq->kq_count;
+	active = filt_kqueue_common(kn, kq);
 	mtx_leave(&kq->kq_lock);
-	return (kn->kn_data > 0);
+
+	return (active);
+}
+
+int
+filt_kqueuemodify(struct kevent *kev, struct knote *kn)
+{
+	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
+
+	mtx_enter(&kq->kq_lock);
+	knote_modify(kev, kn);
+	active = filt_kqueue_common(kn, kq);
+	mtx_leave(&kq->kq_lock);
+
+	return (active);
+}
+
+int
+filt_kqueueprocess(struct knote *kn, struct kevent *kev)
+{
+	struct kqueue *kq = kn->kn_fp->f_data;
+	int active;
+
+	mtx_enter(&kq->kq_lock);
+	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
+		active = 1;
+	else
+		active = filt_kqueue_common(kn, kq);
+	if (active)
+		knote_submit(kn, kev);
+	mtx_leave(&kq->kq_lock);
+
+	return (active);
 }
 
 int
@@ -821,6 +872,7 @@ kqueue_alloc(struct filedesc *fdp)
 	TAILQ_INIT(&kq->kq_head);
 	mtx_init(&kq->kq_lock, IPL_HIGH);
 	task_set(&kq->kq_task, kqueue_task, kq);
+	klist_init_mutex(&kq->kq_sel.si_note, &kqueue_klist_lock);
 
 	return (kq);
 }
@@ -1529,6 +1590,7 @@ kqueue_task(void *arg)
 	/* Kernel lock is needed inside selwakeup(). */
 	KERNEL_ASSERT_LOCKED();
 
+	mtx_enter(&kqueue_klist_lock);
 	mtx_enter(&kq->kq_lock);
 	if (kq->kq_state & KQ_SEL) {
 		kq->kq_state &= ~KQ_SEL;
@@ -1538,6 +1600,7 @@
 		mtx_leave(&kq->kq_lock);
 		KNOTE(&kq->kq_sel.si_note, 0);
 	}
+	mtx_leave(&kqueue_klist_lock);
 
 	KQRELE(kq);
 }