Here is a revised fix for the panic. It prevents the panic by keeping
knotes attached to the fd-indexed knote list during and after the badfd
conversion. This should ensure that no knotes get enqueued after
kqpoll_exit() has finished kqueue_purge().
A badfd knote needs to be delivered only within the interval from the
registration of the original knote to completing the poll/select scan.
Once the polling thread has left the scan, the remaining knotes can
be dropped right away when the fds are closed.
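For context, that window can be exercised from userland with a plain
poll/close race. The sketch below is my illustration, not part of the
patch: one thread blocks in poll(2) while another closes the polled
descriptor. The badfd knote is what lets the kernel wake the sleeping
poller and report the dead fd; with the in-tree poll code this should
surface as POLLNVAL in revents, though POSIX leaves the behavior
unspecified, so treat that as an assumption.

#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static int fds[2];

static void *
closer(void *arg)
{
	usleep(100000);		/* let the main thread enter poll() */
	close(fds[0]);		/* close the descriptor being polled */
	return (NULL);
}

int
main(void)
{
	struct pollfd pfd;
	pthread_t t;

	if (pipe(fds) == -1)
		return (1);
	pthread_create(&t, NULL, closer, NULL);

	pfd.fd = fds[0];
	pfd.events = POLLIN;
	/* Nothing is ever written, so only the close() can wake this. */
	if (poll(&pfd, 1, 5000) > 0)
		printf("revents = %#x\n", (unsigned)pfd.revents);

	pthread_join(t, NULL);
	return (0);
}

Build with "cc -o pollrace pollrace.c -lpthread". This only exercises
the registration-to-scan interval described above; it is a test sketch,
not a regression test for the panic itself.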
To allow lazy removal of poll/select knotes, kqueue_register() drops
badfd knotes that it encounters. This, together with keeping badfd
knotes attached, should prevent badfd knotes from accumulating by
accident, which makes kqpoll_dequeue() obsolete.
knote_remove() uses knote serial numbers to determine when it is safe
to skip the badfd conversion. Currently, this limits the skipping to the
current thread; the conversion is always done when a thread closes
a file descriptor that another thread has polled.
The skipping can be extended to cover the multi-threaded case. However,
that code change is better done separately.
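To make the serial check concrete: if I read the poll/select side
right, each scan stamps the udata of the knotes it registers with a
value derived from p_kq_serial (roughly the serial plus the slot
index), and kqpoll_done() in the diff advances p_kq_serial past all of
those stamps. Hence "p_kq_serial > kn_udata" can only hold once the
scan that registered the knote has completed, which is exactly when
the badfd conversion is pointless; the additional p_kq == kq test
restricts the skip to the thread that owns the poll kqueue. A toy
userland model of that invariant (illustration only, not kernel code;
the names mirror the patch):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned long p_kq_serial = 0;	/* per-thread scan serial */
	unsigned long kn_udata;		/* stamp on a registered knote */
	unsigned int num = 4;		/* fds in this scan */

	/* Registration: stamp the knote for, say, slot 2. */
	kn_udata = p_kq_serial + 2;
	/* Scan still in progress: the badfd conversion is needed. */
	assert(!(p_kq_serial > kn_udata));

	/* kqpoll_done(num): the scan is over. */
	p_kq_serial += num;
	/* Stale knote: safe to drop without the conversion. */
	assert(p_kq_serial > kn_udata);

	printf("serial %lu > stamp %lu -> stale\n", p_kq_serial, kn_udata);
	return (0);
}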
The KN_ATTACHED flag was added to allow "detached" badfd knotes that
are only reachable through the queue of active knotes. Because this
misfeature is no longer necessary, the patch removes the flag.
Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.173
diff -u -p -r1.173 kern_event.c
--- kern/kern_event.c 15 Nov 2021 15:48:54 -0000 1.173
+++ kern/kern_event.c 24 Nov 2021 13:53:28 -0000
@@ -93,8 +93,6 @@ void kqueue_do_check(struct kqueue *kq,
#define kqueue_check(kq) do {} while (0)
#endif
-void kqpoll_dequeue(struct proc *p, int all);
-
static int filter_attach(struct knote *kn);
static void filter_detach(struct knote *kn);
static int filter_event(struct knote *kn, long hint);
@@ -790,14 +788,6 @@ kqpoll_init(unsigned int num)
kqueue_purge(p, p->p_kq);
p->p_kq_serial = 0;
}
-
- /*
- * Discard any detached knotes that have been enqueued after
- * previous scan.
- * This prevents them from accumulating in case
- * scan does not make progress for some reason.
- */
- kqpoll_dequeue(p, 0);
}
/*
@@ -813,9 +803,6 @@ kqpoll_done(unsigned int num)
KASSERT(p->p_kq_serial + num >= p->p_kq_serial);
p->p_kq_serial += num;
-
- /* XXX Work around a race condition. */
- kqueue_purge(p, p->p_kq);
}
void
@@ -827,70 +814,12 @@ kqpoll_exit(void)
return;
kqueue_purge(p, p->p_kq);
- /* Clear any detached knotes that remain in the queue. */
- kqpoll_dequeue(p, 1);
kqueue_terminate(p, p->p_kq);
KASSERT(p->p_kq->kq_refs == 1);
KQRELE(p->p_kq);
p->p_kq = NULL;
}
-void
-kqpoll_dequeue(struct proc *p, int all)
-{
- struct knote marker;
- struct knote *kn;
- struct kqueue *kq = p->p_kq;
-
- /*
- * Bail out early without locking if the queue appears empty.
- *
- * This thread might not see the latest value of kq_count yet.
- * However, if there is any sustained increase in the queue size,
- * this thread will eventually observe that kq_count has become
- * non-zero.
- */
- if (all == 0 && kq->kq_count == 0)
- return;
-
- memset(&marker, 0, sizeof(marker));
- marker.kn_filter = EVFILT_MARKER;
- marker.kn_status = KN_PROCESSING;
-
- mtx_enter(&kq->kq_lock);
- kn = TAILQ_FIRST(&kq->kq_head);
- while (kn != NULL) {
- /* This kqueue should not be scanned by other threads. */
- KASSERT(kn->kn_filter != EVFILT_MARKER);
-
- if (all == 0 && (kn->kn_status & KN_ATTACHED)) {
- kn = TAILQ_NEXT(kn, kn_tqe);
- continue;
- }
-
- TAILQ_INSERT_BEFORE(kn, &marker, kn_tqe);
-
- if (!knote_acquire(kn, NULL, 0)) {
- /* knote_acquire() has released kq_lock. */
- } else {
- kqueue_check(kq);
- TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
- kn->kn_status &= ~KN_QUEUED;
- kq->kq_count--;
- mtx_leave(&kq->kq_lock);
-
- filter_detach(kn);
- knote_drop(kn, p);
- }
-
- mtx_enter(&kq->kq_lock);
- kqueue_check(kq);
- kn = TAILQ_NEXT(&marker, kn_tqe);
- TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
- }
- mtx_leave(&kq->kq_lock);
-}
-
struct kqueue *
kqueue_alloc(struct filedesc *fdp)
{
@@ -1230,6 +1159,21 @@ again:
mtx_enter(&kq->kq_lock);
if (active)
knote_activate(kn);
+ } else if (kn->kn_fop == &badfd_filtops) {
+ /*
+ * Nothing expects this badfd knote any longer.
+ * Drop it to make room for the new knote and retry.
+ */
+ KASSERT(kq == p->p_kq);
+ mtx_leave(&kq->kq_lock);
+ filter_detach(kn);
+ knote_drop(kn, p);
+
+ KASSERT(fp != NULL);
+ FRELE(fp, p);
+ fp = NULL;
+
+ goto again;
} else {
/*
* The user may change some filter values after the
@@ -1445,7 +1389,6 @@ retry:
kn->kn_status |= KN_DISABLED;
if ((kn->kn_status & KN_QUEUED) == 0)
kn->kn_status &= ~KN_ACTIVE;
- KASSERT(kn->kn_status & KN_ATTACHED);
knote_release(kn);
} else {
mtx_enter(&kq->kq_lock);
@@ -1455,7 +1398,6 @@ retry:
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
}
- KASSERT(kn->kn_status & KN_ATTACHED);
knote_release(kn);
}
kqueue_check(kq);
@@ -1811,6 +1753,17 @@ knote_remove(struct proc *p, struct kque
while ((kn = SLIST_FIRST(list)) != NULL) {
KASSERT(kn->kn_kq == kq);
+
+ if (!purge) {
+ /* Skip pending badfd knotes. */
+ while (kn->kn_fop == &badfd_filtops) {
+ kn = SLIST_NEXT(kn, kn_link);
+ if (kn == NULL)
+ return;
+ KASSERT(kn->kn_kq == kq);
+ }
+ }
+
if (!knote_acquire(kn, NULL, 0)) {
/* knote_acquire() has released kq_lock. */
mtx_enter(&kq->kq_lock);
@@ -1825,15 +1778,12 @@ knote_remove(struct proc *p, struct kque
*
* This reuses the original knote for delivering the
* notification so as to avoid allocating memory.
- * The knote will be reachable only through the queue
- * of active knotes and is freed either by kqueue_scan()
- * or kqpoll_dequeue().
*/
- if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
+ if (!purge && (kn->kn_flags & __EV_POLL) &&
+ !(p->p_kq == kq &&
+ p->p_kq_serial > (unsigned long)kn->kn_udata) &&
+ kn->kn_fop != &badfd_filtops) {
KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
- mtx_enter(&kq->kq_lock);
- knote_detach(kn);
- mtx_leave(&kq->kq_lock);
FRELE(kn->kn_fp, p);
kn->kn_fp = NULL;
@@ -1900,9 +1850,7 @@ knote_attach(struct knote *kn)
MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
- KASSERT((kn->kn_status & KN_ATTACHED) == 0);
- kn->kn_status |= KN_ATTACHED;
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
KASSERT(kq->kq_knlistsize > kn->kn_id);
list = &kq->kq_knlist[kn->kn_id];
@@ -1922,15 +1870,11 @@ knote_detach(struct knote *kn)
MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
- if ((kn->kn_status & KN_ATTACHED) == 0)
- return;
-
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
list = &kq->kq_knlist[kn->kn_id];
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
- kn->kn_status &= ~KN_ATTACHED;
}
/*
Index: sys/event.h
===================================================================
RCS file: src/sys/sys/event.h,v
retrieving revision 1.58
diff -u -p -r1.58 event.h
--- sys/event.h 12 Nov 2021 04:34:23 -0000 1.58
+++ sys/event.h 24 Nov 2021 13:53:28 -0000
@@ -251,8 +251,6 @@ struct knote {
#define KN_DETACHED 0x0008 /* knote is detached */
#define KN_PROCESSING 0x0010 /* knote is being processed */
#define KN_WAITING 0x0020 /* waiting on processing */
-#define KN_ATTACHED 0x0040 /* knote is attached to
- * a knlist of the kqueue */
#define kn_id kn_kevent.ident /* [I] */
#define kn_filter kn_kevent.filter /* [I] */