This patch adds a mutex that serializes access to a kqueue. As a result,
most of kqueue's internals should become safe to run without the kernel
lock. In principle, the patch should allow unlocking kevent(2).
Some notes:
* The existing uses of splhigh() outline where the mutex should be held.
* The code is a dense tangle of lock operations. There are many
spots where the lock usage is far from optimal. The patch does not
attempt to fix them, so as to keep the changeset relatively small.
* As msleep() with PCATCH requires the kernel lock, kqueue_scan() takes
the kernel lock for the section that might sleep and releases it
before the actual scan of events. An opportunistic implementation
could precheck whether the scan can start right away and avoid the
kernel lock in that case, but this is not part of the diff.
* knote_acquire() has a gap where it might miss a wakeup. This is an
unlikely situation that can arise with klist_invalidate(). It should
not happen during normal operation, and the code should recover
thanks to the one-second timeout. The lost wakeup could be avoided
with a serial number, for example.
* The timeout in knote_acquire() makes the function try-lock-like,
which is essential in klist_invalidate(). The normal order of
operations is knote_acquire() before klist_lock(). klist_invalidate()
has to violate this order, and the timeout and retrying prevent the
system from deadlocking.
* At the moment, all event sources still require the kernel lock.
kqueue locks the kernel around the filterops callbacks if
FILTEROP_MPSAFE is not set; a sketch of this wrapping follows the
list.
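
As a rough illustration of that wrapping (not literal code from this
diff; the actual filter_*() wrappers live in kern_event.c), a
non-MPSAFE detach callback would be bracketed along these lines:

	/*
	 * Sketch only: take the kernel lock around the callback
	 * unless the filter declares itself MP safe.
	 */
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		KERNEL_LOCK();
		kn->kn_fop->f_detach(kn);
		KERNEL_UNLOCK();
	}
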
Please test!
Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.163
diff -u -p -r1.163 kern_event.c
--- kern/kern_event.c 22 Apr 2021 15:30:12 -0000 1.163
+++ kern/kern_event.c 20 May 2021 13:45:32 -0000
@@ -124,7 +124,8 @@ void knote_dequeue(struct knote *kn);
int knote_acquire(struct knote *kn, struct klist *, int);
void knote_release(struct knote *kn);
void knote_activate(struct knote *kn);
-void knote_remove(struct proc *p, struct knlist *list, int purge);
+void knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list,
+ int purge);
void filt_kqdetach(struct knote *kn);
int filt_kqueue(struct knote *kn, long hint);
@@ -265,7 +266,7 @@ filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_fp->f_data;
- kn->kn_data = kq->kq_count;
+ kn->kn_data = kq->kq_count; /* unlocked read */
return (kn->kn_data > 0);
}
@@ -739,28 +740,31 @@ kqpoll_dequeue(struct proc *p)
{
struct knote *kn;
struct kqueue *kq = p->p_kq;
- int s;
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
/* This kqueue should not be scanned by other threads. */
KASSERT(kn->kn_filter != EVFILT_MARKER);
- if (!knote_acquire(kn, NULL, 0))
+ if (!knote_acquire(kn, NULL, 0)) {
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
+ }
kqueue_check(kq);
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
+ mtx_leave(&kq->kq_lock);
- splx(s);
- kn->kn_fop->f_detach(kn);
+ filter_detach(kn);
knote_drop(kn, p);
- s = splhigh();
+
+ mtx_enter(&kq->kq_lock);
kqueue_check(kq);
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
}
struct kqueue *
@@ -772,6 +776,7 @@ kqueue_alloc(struct filedesc *fdp)
kq->kq_refs = 1;
kq->kq_fdp = fdp;
TAILQ_INIT(&kq->kq_head);
+ mtx_init(&kq->kq_lock, IPL_HIGH);
task_set(&kq->kq_task, kqueue_task, kq);
return (kq);
@@ -933,8 +938,7 @@ kqueue_do_check(struct kqueue *kq, const
struct knote *kn;
int count = 0, nmarker = 0;
- KERNEL_ASSERT_LOCKED();
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_filter == EVFILT_MARKER) {
@@ -973,7 +977,7 @@ kqueue_register(struct kqueue *kq, struc
struct file *fp = NULL;
struct knote *kn = NULL, *newkn = NULL;
struct knlist *list = NULL;
- int s, error = 0;
+ int error = 0;
if (kev->filter < 0) {
if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -1005,11 +1009,13 @@ again:
error = EBADF;
goto done;
}
+ mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ADD)
kqueue_expand_list(kq, kev->ident);
if (kev->ident < kq->kq_knlistsize)
list = &kq->kq_knlist[kev->ident];
} else {
+ mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ADD)
kqueue_expand_hash(kq);
if (kq->kq_knhashmask != 0) {
@@ -1021,16 +1027,15 @@ again:
SLIST_FOREACH(kn, list, kn_link) {
if (kev->filter == kn->kn_filter &&
kev->ident == kn->kn_id) {
- s = splhigh();
if (!knote_acquire(kn, NULL, 0)) {
- splx(s);
+ /* knote_acquire() has released
+ * kq_lock. */
if (fp != NULL) {
FRELE(fp, p);
fp = NULL;
}
goto again;
}
- splx(s);
break;
}
}
@@ -1038,14 +1043,13 @@ again:
KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);
if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+ mtx_leave(&kq->kq_lock);
error = ENOENT;
goto done;
}
/*
* kn now contains the matching knote, or NULL if no match.
- * If adding a new knote, sleeping is not allowed until the knote
- * has been inserted.
*/
if (kev->flags & EV_ADD) {
if (kn == NULL) {
@@ -1069,6 +1073,8 @@ again:
kn->kn_kevent = *kev;
knote_attach(kn);
+ mtx_leave(&kq->kq_lock);
+
error = filter_attach(kn);
if (error != 0) {
knote_drop(kn, p);
@@ -1097,45 +1103,47 @@ again:
/* Check if there is a pending event. */
if (filter_process(kn, NULL))
knote_activate(kn);
+
+ mtx_enter(&kq->kq_lock);
} else {
/*
* The user may change some filter values after the
* initial EV_ADD, but doing so will not reset any
* filters which have already been triggered.
*/
+ mtx_leave(&kq->kq_lock);
if (filter_modify(kev, kn))
knote_activate(kn);
+ mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ERROR) {
error = kev->data;
goto release;
}
}
} else if (kev->flags & EV_DELETE) {
+ mtx_leave(&kq->kq_lock);
filter_detach(kn);
knote_drop(kn, p);
goto done;
}
- if ((kev->flags & EV_DISABLE) &&
- ((kn->kn_status & KN_DISABLED) == 0)) {
- s = splhigh();
+ if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
kn->kn_status |= KN_DISABLED;
- splx(s);
- }
if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
- s = splhigh();
kn->kn_status &= ~KN_DISABLED;
- splx(s);
+ mtx_leave(&kq->kq_lock);
+
/* Check if there is a pending event. */
if (filter_process(kn, NULL))
knote_activate(kn);
+
+ mtx_enter(&kq->kq_lock);
}
release:
- s = splhigh();
knote_release(kn);
- splx(s);
+ mtx_leave(&kq->kq_lock);
done:
if (fp != NULL)
FRELE(fp, p);
@@ -1151,14 +1159,15 @@ kqueue_sleep(struct kqueue *kq, struct t
uint64_t nsecs;
int error;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (tsp != NULL) {
getnanouptime(&start);
nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
} else
nsecs = INFSLP;
- error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
+ error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
+ "kqread", nsecs);
if (tsp != NULL) {
getnanouptime(&stop);
timespecsub(&stop, &start, &elapsed);
@@ -1181,7 +1190,7 @@ kqueue_scan(struct kqueue_scan_state *sc
{
struct kqueue *kq = scan->kqs_kq;
struct knote *kn;
- int s, error = 0, nkev = 0;
+ int error = 0, nkev = 0;
if (maxevents == 0)
goto done;
@@ -1190,12 +1199,18 @@ retry:
error = 0;
+ /* msleep() with PCATCH requires kernel lock. */
+ KERNEL_LOCK();
+
+ mtx_enter(&kq->kq_lock);
+
if (kq->kq_state & KQ_DYING) {
+ mtx_leave(&kq->kq_lock);
+ KERNEL_UNLOCK();
error = EBADF;
goto done;
}
- s = splhigh();
if (kq->kq_count == 0) {
/*
* Successive loops are only necessary if there are more
@@ -1203,13 +1218,15 @@ retry:
*/
if ((tsp != NULL && !timespecisset(tsp)) ||
scan->kqs_nevent != 0) {
- splx(s);
+ mtx_leave(&kq->kq_lock);
+ KERNEL_UNLOCK();
error = 0;
goto done;
}
kq->kq_state |= KQ_SLEEP;
error = kqueue_sleep(kq, tsp);
- splx(s);
+ /* kqueue_sleep() has released kq_lock. */
+ KERNEL_UNLOCK();
if (error == 0 || error == EWOULDBLOCK)
goto retry;
/* don't restart after signals... */
@@ -1218,6 +1235,9 @@ retry:
goto done;
}
+ /* The actual scan does not sleep on kq, so unlock the kernel. */
+ KERNEL_UNLOCK();
+
/*
* Put the end marker in the queue to limit the scan to the events
* that are currently active. This prevents events from being
@@ -1249,8 +1269,11 @@ retry:
continue;
}
- if (!knote_acquire(kn, NULL, 0))
+ if (!knote_acquire(kn, NULL, 0)) {
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
+ }
kqueue_check(kq);
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
@@ -1263,11 +1286,11 @@ retry:
continue;
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
memset(kevp, 0, sizeof(*kevp));
if (filter_process(kn, kevp) == 0) {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if ((kn->kn_status & KN_QUEUED) == 0)
kn->kn_status &= ~KN_ACTIVE;
knote_release(kn);
@@ -1281,9 +1304,9 @@ retry:
if (kevp->flags & EV_ONESHOT) {
filter_detach(kn);
knote_drop(kn, p);
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if (kevp->flags & EV_DISPATCH)
kn->kn_status |= KN_DISABLED;
if ((kn->kn_status & KN_QUEUED) == 0)
@@ -1291,7 +1314,7 @@ retry:
KASSERT(kn->kn_status & KN_ATTACHED);
knote_release(kn);
} else {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if ((kn->kn_status & KN_QUEUED) == 0) {
kqueue_check(kq);
kq->kq_count++;
@@ -1308,7 +1331,7 @@ retry:
scan->kqs_nevent++;
}
TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_lock);
if (scan->kqs_nevent == 0)
goto retry;
done:
@@ -1333,7 +1356,6 @@ void
kqueue_scan_finish(struct kqueue_scan_state *scan)
{
struct kqueue *kq = scan->kqs_kq;
- int s;
KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
@@ -1342,9 +1364,9 @@ kqueue_scan_finish(struct kqueue_scan_st
if (scan->kqs_queued) {
scan->kqs_queued = 0;
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_lock);
}
KQRELE(kq);
}
@@ -1376,17 +1398,17 @@ kqueue_poll(struct file *fp, int events,
{
struct kqueue *kq = (struct kqueue *)fp->f_data;
int revents = 0;
- int s = splhigh();
if (events & (POLLIN | POLLRDNORM)) {
+ mtx_enter(&kq->kq_lock);
if (kq->kq_count) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(p, &kq->kq_sel);
kq->kq_state |= KQ_SEL;
}
+ mtx_leave(&kq->kq_lock);
}
- splx(s);
return (revents);
}
@@ -1396,7 +1418,7 @@ kqueue_stat(struct file *fp, struct stat
struct kqueue *kq = fp->f_data;
memset(st, 0, sizeof(*st));
- st->st_size = kq->kq_count;
+ st->st_size = kq->kq_count; /* unlocked read */
st->st_blksize = sizeof(struct kevent);
st->st_mode = S_IFIFO;
return (0);
@@ -1407,14 +1429,14 @@ kqueue_purge(struct proc *p, struct kque
{
int i;
- KERNEL_ASSERT_LOCKED();
-
+ mtx_enter(&kq->kq_lock);
for (i = 0; i < kq->kq_knlistsize; i++)
- knote_remove(p, &kq->kq_knlist[i], 1);
+ knote_remove(p, kq, &kq->kq_knlist[i], 1);
if (kq->kq_knhashmask != 0) {
for (i = 0; i < kq->kq_knhashmask + 1; i++)
- knote_remove(p, &kq->kq_knhash[i], 1);
+ knote_remove(p, kq, &kq->kq_knhash[i], 1);
}
+ mtx_leave(&kq->kq_lock);
}
void
@@ -1422,6 +1444,8 @@ kqueue_terminate(struct proc *p, struct
{
struct knote *kn;
+ mtx_enter(&kq->kq_lock);
+
/*
* Any remaining entries should be scan markers.
* They are removed when the ongoing scans finish.
@@ -1432,6 +1456,7 @@ kqueue_terminate(struct proc *p, struct
kq->kq_state |= KQ_DYING;
kqueue_wakeup(kq);
+ mtx_leave(&kq->kq_lock);
KASSERT(klist_empty(&kq->kq_sel.si_note));
task_del(systq, &kq->kq_task);
@@ -1443,15 +1468,13 @@ kqueue_close(struct file *fp, struct pro
{
struct kqueue *kq = fp->f_data;
- KERNEL_LOCK();
+ fp->f_data = NULL;
+
kqueue_purge(p, kq);
kqueue_terminate(p, kq);
- fp->f_data = NULL;
KQRELE(kq);
- KERNEL_UNLOCK();
-
return (0);
}
@@ -1460,10 +1483,16 @@ kqueue_task(void *arg)
{
struct kqueue *kq = arg;
+ /* Kernel lock is needed inside selwakeup(). */
+ KERNEL_ASSERT_LOCKED();
+
+ mtx_enter(&kq->kq_lock);
if (kq->kq_state & KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
+ mtx_leave(&kq->kq_lock);
selwakeup(&kq->kq_sel);
} else {
+ mtx_leave(&kq->kq_lock);
KNOTE(&kq->kq_sel.si_note, 0);
}
KQRELE(kq);
@@ -1472,6 +1501,7 @@ kqueue_task(void *arg)
void
kqueue_wakeup(struct kqueue *kq)
{
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (kq->kq_state & KQ_SLEEP) {
kq->kq_state &= ~KQ_SLEEP;
@@ -1491,14 +1521,20 @@ kqueue_expand_hash(struct kqueue *kq)
struct knlist *hash;
u_long hashmask;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
if (kq->kq_knhashmask == 0) {
+ mtx_leave(&kq->kq_lock);
hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
+ mtx_enter(&kq->kq_lock);
if (kq->kq_knhashmask == 0) {
kq->kq_knhash = hash;
kq->kq_knhashmask = hashmask;
} else {
/* Another thread has allocated the hash. */
+ mtx_leave(&kq->kq_lock);
hashfree(hash, KN_HASHSIZE, M_KEVENT);
+ mtx_enter(&kq->kq_lock);
}
}
}
@@ -1506,26 +1542,35 @@ kqueue_expand_hash(struct kqueue *kq)
static void
kqueue_expand_list(struct kqueue *kq, int fd)
{
- struct knlist *list;
- int size;
+ struct knlist *list, *olist;
+ int size, osize;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (kq->kq_knlistsize <= fd) {
size = kq->kq_knlistsize;
+ mtx_leave(&kq->kq_lock);
while (size <= fd)
size += KQEXTENT;
list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
+ mtx_enter(&kq->kq_lock);
if (kq->kq_knlistsize <= fd) {
memcpy(list, kq->kq_knlist,
kq->kq_knlistsize * sizeof(*list));
memset(&list[kq->kq_knlistsize], 0,
(size - kq->kq_knlistsize) * sizeof(*list));
- free(kq->kq_knlist, M_KEVENT,
- kq->kq_knlistsize * sizeof(*list));
+ olist = kq->kq_knlist;
+ osize = kq->kq_knlistsize;
kq->kq_knlist = list;
kq->kq_knlistsize = size;
+ mtx_leave(&kq->kq_lock);
+ free(olist, M_KEVENT, osize * sizeof(*list));
+ mtx_enter(&kq->kq_lock);
} else {
/* Another thread has expanded the list. */
+ mtx_leave(&kq->kq_lock);
free(list, M_KEVENT, size * sizeof(*list));
+ mtx_enter(&kq->kq_lock);
}
}
}
@@ -1543,14 +1588,22 @@ kqueue_expand_list(struct kqueue *kq, in
int
knote_acquire(struct knote *kn, struct klist *klist, int ls)
{
- splassert(IPL_HIGH);
+ struct kqueue *kq = kn->kn_kq;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
if (kn->kn_status & KN_PROCESSING) {
kn->kn_status |= KN_WAITING;
- if (klist != NULL)
+ if (klist != NULL) {
+ mtx_leave(&kq->kq_lock);
klist_unlock(klist, ls);
- tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+ /* XXX Timeout resolves potential loss of wakeup. */
+ tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+ } else {
+ msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
+ SEC_TO_NSEC(1));
+ }
/* knote may be stale now */
return (0);
}
@@ -1564,7 +1617,9 @@ knote_acquire(struct knote *kn, struct k
void
knote_release(struct knote *kn)
{
- splassert(IPL_HIGH);
+ struct kqueue *kq = kn->kn_kq;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_PROCESSING);
@@ -1582,13 +1637,13 @@ knote_release(struct knote *kn)
void
knote_activate(struct knote *kn)
{
- int s;
+ struct kqueue *kq = kn->kn_kq;
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
knote_enqueue(kn);
- splx(s);
+ mtx_leave(&kq->kq_lock);
}
/*
@@ -1610,18 +1665,20 @@ knote(struct klist *list, long hint)
* remove all knotes from a specified knlist
*/
void
-knote_remove(struct proc *p, struct knlist *list, int purge)
+knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list, int purge)
{
struct knote *kn;
- int s;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
while ((kn = SLIST_FIRST(list)) != NULL) {
- s = splhigh();
+ KASSERT(kn->kn_kq == kq);
if (!knote_acquire(kn, NULL, 0)) {
- splx(s);
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
filter_detach(kn);
/*
@@ -1636,20 +1693,22 @@ knote_remove(struct proc *p, struct knli
*/
if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
+ mtx_enter(&kq->kq_lock);
knote_detach(kn);
+ mtx_leave(&kq->kq_lock);
FRELE(kn->kn_fp, p);
kn->kn_fp = NULL;
kn->kn_fop = &badfd_filtops;
filter_event(kn, 0);
knote_activate(kn);
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
knote_release(kn);
- splx(s);
continue;
}
knote_drop(kn, p);
+ mtx_enter(&kq->kq_lock);
}
}
@@ -1661,7 +1720,6 @@ knote_fdclose(struct proc *p, int fd)
{
struct filedesc *fdp = p->p_p->ps_fd;
struct kqueue *kq;
- struct knlist *list;
/*
* fdplock can be ignored if the file descriptor table is being freed
@@ -1670,18 +1728,12 @@ knote_fdclose(struct proc *p, int fd)
if (fdp->fd_refcnt != 0)
fdpassertlocked(fdp);
- if (LIST_EMPTY(&fdp->fd_kqlist))
- return;
-
- KERNEL_LOCK();
LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
- if (fd >= kq->kq_knlistsize)
- continue;
-
- list = &kq->kq_knlist[fd];
- knote_remove(p, list, 0);
+ mtx_enter(&kq->kq_lock);
+ if (fd < kq->kq_knlistsize)
+ knote_remove(p, kq, &kq->kq_knlist[fd], 0);
+ mtx_leave(&kq->kq_lock);
}
- KERNEL_UNLOCK();
}
/*
@@ -1693,6 +1745,7 @@ knote_processexit(struct proc *p)
{
struct process *pr = p->p_p;
+ KERNEL_ASSERT_LOCKED();
KASSERT(p == curproc);
KNOTE(&pr->ps_klist, NOTE_EXIT);
@@ -1706,15 +1759,12 @@ knote_attach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct knlist *list;
- int s;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
KASSERT((kn->kn_status & KN_ATTACHED) == 0);
- s = splhigh();
kn->kn_status |= KN_ATTACHED;
- splx(s);
-
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
KASSERT(kq->kq_knlistsize > kn->kn_id);
list = &kq->kq_knlist[kn->kn_id];
@@ -1730,8 +1780,8 @@ knote_detach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct knlist *list;
- int s;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
if ((kn->kn_status & KN_ATTACHED) == 0)
@@ -1742,10 +1792,7 @@ knote_detach(struct knote *kn)
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
-
- s = splhigh();
kn->kn_status &= ~KN_ATTACHED;
- splx(s);
}
/*
@@ -1755,20 +1802,20 @@ knote_detach(struct knote *kn)
void
knote_drop(struct knote *kn, struct proc *p)
{
- int s;
+ struct kqueue *kq = kn->kn_kq;
KASSERT(kn->kn_filter != EVFILT_MARKER);
+ mtx_enter(&kq->kq_lock);
knote_detach(kn);
-
- s = splhigh();
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
if (kn->kn_status & KN_WAITING) {
kn->kn_status &= ~KN_WAITING;
wakeup(kn);
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
+
if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
FRELE(kn->kn_fp, p);
pool_put(&knote_pool, kn);
@@ -1780,7 +1827,7 @@ knote_enqueue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT((kn->kn_status & KN_QUEUED) == 0);
@@ -1797,7 +1844,7 @@ knote_dequeue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_QUEUED);
@@ -1905,36 +1952,38 @@ void
klist_invalidate(struct klist *list)
{
struct knote *kn;
+ struct kqueue *kq;
struct proc *p = curproc;
- int ls, s;
+ int ls;
NET_ASSERT_UNLOCKED();
- s = splhigh();
ls = klist_lock(list);
while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
+ kq = kn->kn_kq;
+ mtx_enter(&kq->kq_lock);
if (!knote_acquire(kn, list, ls)) {
- /* knote_acquire() has unlocked list. */
+ /* knote_acquire() has released kq_lock
+ * and klist lock. */
ls = klist_lock(list);
continue;
}
+ mtx_leave(&kq->kq_lock);
klist_unlock(list, ls);
- splx(s);
filter_detach(kn);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
kn->kn_fop = &dead_filtops;
filter_event(kn, 0);
knote_activate(kn);
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
knote_release(kn);
+ mtx_leave(&kq->kq_lock);
} else {
knote_drop(kn, p);
- s = splhigh();
}
ls = klist_lock(list);
}
klist_unlock(list, ls);
- splx(s);
}
static int
Index: sys/eventvar.h
===================================================================
RCS file: src/sys/sys/eventvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 eventvar.h
--- sys/eventvar.h 17 Jan 2021 05:56:32 -0000 1.11
+++ sys/eventvar.h 20 May 2021 13:45:32 -0000
@@ -31,6 +31,7 @@
#ifndef _SYS_EVENTVAR_H_
#define _SYS_EVENTVAR_H_
+#include <sys/mutex.h>
#include <sys/task.h>
#define KQ_NEVENTS 8 /* minimize copy{in,out} calls */
@@ -38,24 +39,29 @@
/*
* Locking:
+ * I immutable after creation
* a atomic operations
+ * q kq_lock
*/
struct kqueue {
- TAILQ_HEAD(, knote) kq_head; /* list of pending event */
- int kq_count; /* number of pending events */
- u_int kq_refs; /* [a] number of references */
+ struct mutex kq_lock; /* lock for queue access */
+ TAILQ_HEAD(, knote) kq_head; /* [q] list of pending event */
+ int kq_count; /* [q] # of pending events */
+ u_int kq_refs; /* [a] # of references */
struct selinfo kq_sel;
- struct filedesc *kq_fdp;
+ struct filedesc *kq_fdp; /* [I] fd table of this kq */
LIST_ENTRY(kqueue) kq_next;
- int kq_knlistsize; /* size of kq_knlist */
- struct knlist *kq_knlist; /* list of attached knotes */
- u_long kq_knhashmask; /* size of kq_knhash */
- struct knlist *kq_knhash; /* hash table for attached
- knotes */
+ int kq_knlistsize; /* [q] size of kq_knlist */
+ struct knlist *kq_knlist; /* [q] list of
+ * attached knotes */
+ u_long kq_knhashmask; /* [q] size of kq_knhash */
+ struct knlist *kq_knhash; /* [q] hash table for
+ * attached knotes */
struct task kq_task; /* deferring of activation */
- int kq_state;
+ int kq_state; /* [q] */
#define KQ_SEL 0x01
#define KQ_SLEEP 0x02
#define KQ_DYING 0x04