On Thu, May 20, 2021 at 01:56:24PM +0000, Visa Hankala wrote:
> This patch adds a mutex that serializes access to a kqueue. As a result,
> most of kqueue's internals should become safe to run without the kernel
> lock. In principle, the patch should allow unlocking kevent(2).
>
> Some notes:
>
> * The existing uses of splhigh() outline where the mutex should be held.
>
> * The code is a true entanglement of lock operations. There are many
> spots where lock usage is far from optimal. The patch does not attempt
> to fix them, so as to keep the changeset relatively small.
>
> * As msleep() with PCATCH requires the kernel lock, kqueue_scan() locks
> the kernel for the section that might sleep. The lock is released
> before the actual scan of events. An opportunistic implementation
> could do a precheck to determine if the scan could be started right
> away, but this is not part of the diff.
>
> * knote_acquire() has a gap where it might miss a wakeup. This is an
> unlikely situation that may arise with klist_invalidate(). It should
> not happen during normal operation, and the code should recover thanks
> to the one-second timeout. The loss of wakeup could be avoided with
> serial numbering for example.
>
> * The timeout in knote_acquire() makes the function try-lock-like, which
> is essential in klist_invalidate(). The normal order of operations is
> that knote_acquire() comes before klist_lock(). klist_invalidate() has
> to violate this order; the timeout and retrying prevent the system
> from deadlocking.
>
> * At the moment, all event sources still require the kernel lock.
> kqueue will lock the kernel when it invokes the filterops callbacks
> if FILTEROP_MPSAFE is not set.
Here is an updated patch. The changes are:
* Lock the kqueue mutex in filt_kqueue() when accessing kq_count, to avoid
an unlocked read.
There is still an unlocked read of kq_count in kqueue_stat(). However,
I think it is good enough as is because the value returned to
userspace is best-effort anyhow (FreeBSD omits it altogether).
* Make callers of knote_activate() lock the mutex. This reduces the
number of lock operations; a short sketch of the resulting calling
pattern follows this list.
* Adjust the mutex assertion in knote_release() to avoid an unused
variable warning when compiling without DIAGNOSTIC.
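For the knote_activate() change, the calling pattern now looks roughly
like this when the caller does not already hold the mutex (a simplified
excerpt of what the diff does, for example in filt_timerexpire() and
knote()):

	struct kqueue *kq = kn->kn_kq;

	mtx_enter(&kq->kq_lock);
	knote_activate(kn);	/* kq_lock is held by the caller */
	mtx_leave(&kq->kq_lock);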
OK?
Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.164
diff -u -p -r1.164 kern_event.c
--- kern/kern_event.c 2 Jun 2021 13:56:28 -0000 1.164
+++ kern/kern_event.c 3 Jun 2021 14:06:54 -0000
@@ -123,7 +123,8 @@ void knote_dequeue(struct knote *kn);
int knote_acquire(struct knote *kn, struct klist *, int);
void knote_release(struct knote *kn);
void knote_activate(struct knote *kn);
-void knote_remove(struct proc *p, struct knlist *list, int purge);
+void knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list,
+ int purge);
void filt_kqdetach(struct knote *kn);
int filt_kqueue(struct knote *kn, long hint);
@@ -270,7 +271,9 @@ filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_fp->f_data;
+ mtx_enter(&kq->kq_lock);
kn->kn_data = kq->kq_count;
+ mtx_leave(&kq->kq_lock);
return (kn->kn_data > 0);
}
@@ -416,9 +419,12 @@ void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
+ struct kqueue *kq = kn->kn_kq;
kn->kn_data++;
+ mtx_enter(&kq->kq_lock);
knote_activate(kn);
+ mtx_leave(&kq->kq_lock);
if ((kn->kn_flags & EV_ONESHOT) == 0)
filt_timer_timeout_add(kn);
@@ -744,28 +750,31 @@ kqpoll_dequeue(struct proc *p)
{
struct knote *kn;
struct kqueue *kq = p->p_kq;
- int s;
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
/* This kqueue should not be scanned by other threads. */
KASSERT(kn->kn_filter != EVFILT_MARKER);
- if (!knote_acquire(kn, NULL, 0))
+ if (!knote_acquire(kn, NULL, 0)) {
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
+ }
kqueue_check(kq);
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
+ mtx_leave(&kq->kq_lock);
- splx(s);
- kn->kn_fop->f_detach(kn);
+ filter_detach(kn);
knote_drop(kn, p);
- s = splhigh();
+
+ mtx_enter(&kq->kq_lock);
kqueue_check(kq);
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
}
struct kqueue *
@@ -777,6 +786,7 @@ kqueue_alloc(struct filedesc *fdp)
kq->kq_refs = 1;
kq->kq_fdp = fdp;
TAILQ_INIT(&kq->kq_head);
+ mtx_init(&kq->kq_lock, IPL_HIGH);
task_set(&kq->kq_task, kqueue_task, kq);
return (kq);
@@ -938,8 +948,7 @@ kqueue_do_check(struct kqueue *kq, const
struct knote *kn;
int count = 0, nmarker = 0;
- KERNEL_ASSERT_LOCKED();
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_filter == EVFILT_MARKER) {
@@ -978,7 +987,7 @@ kqueue_register(struct kqueue *kq, struc
struct file *fp = NULL;
struct knote *kn = NULL, *newkn = NULL;
struct knlist *list = NULL;
- int s, error = 0;
+ int active, error = 0;
if (kev->filter < 0) {
if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -1010,11 +1019,13 @@ again:
error = EBADF;
goto done;
}
+ mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ADD)
kqueue_expand_list(kq, kev->ident);
if (kev->ident < kq->kq_knlistsize)
list = &kq->kq_knlist[kev->ident];
} else {
+ mtx_enter(&kq->kq_lock);
if (kev->flags & EV_ADD)
kqueue_expand_hash(kq);
if (kq->kq_knhashmask != 0) {
@@ -1026,16 +1037,15 @@ again:
SLIST_FOREACH(kn, list, kn_link) {
if (kev->filter == kn->kn_filter &&
kev->ident == kn->kn_id) {
- s = splhigh();
if (!knote_acquire(kn, NULL, 0)) {
- splx(s);
+ /* knote_acquire() has released
+ * kq_lock. */
if (fp != NULL) {
FRELE(fp, p);
fp = NULL;
}
goto again;
}
- splx(s);
break;
}
}
@@ -1043,14 +1053,13 @@ again:
KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);
if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+ mtx_leave(&kq->kq_lock);
error = ENOENT;
goto done;
}
/*
* kn now contains the matching knote, or NULL if no match.
- * If adding a new knote, sleeping is not allowed until the knote
- * has been inserted.
*/
if (kev->flags & EV_ADD) {
if (kn == NULL) {
@@ -1074,6 +1083,8 @@ again:
kn->kn_kevent = *kev;
knote_attach(kn);
+ mtx_leave(&kq->kq_lock);
+
error = filter_attach(kn);
if (error != 0) {
knote_drop(kn, p);
@@ -1100,7 +1111,9 @@ again:
}
/* Check if there is a pending event. */
- if (filter_process(kn, NULL))
+ active = filter_process(kn, NULL);
+ mtx_enter(&kq->kq_lock);
+ if (active)
knote_activate(kn);
} else {
/*
@@ -1108,7 +1121,10 @@ again:
* initial EV_ADD, but doing so will not reset any
* filters which have already been triggered.
*/
- if (filter_modify(kev, kn))
+ mtx_leave(&kq->kq_lock);
+ active = filter_modify(kev, kn);
+ mtx_enter(&kq->kq_lock);
+ if (active)
knote_activate(kn);
if (kev->flags & EV_ERROR) {
error = kev->data;
@@ -1116,31 +1132,28 @@ again:
}
}
} else if (kev->flags & EV_DELETE) {
+ mtx_leave(&kq->kq_lock);
filter_detach(kn);
knote_drop(kn, p);
goto done;
}
- if ((kev->flags & EV_DISABLE) &&
- ((kn->kn_status & KN_DISABLED) == 0)) {
- s = splhigh();
+ if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
kn->kn_status |= KN_DISABLED;
- splx(s);
- }
if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
- s = splhigh();
kn->kn_status &= ~KN_DISABLED;
- splx(s);
+ mtx_leave(&kq->kq_lock);
/* Check if there is a pending event. */
- if (filter_process(kn, NULL))
+ active = filter_process(kn, NULL);
+ mtx_enter(&kq->kq_lock);
+ if (active)
knote_activate(kn);
}
release:
- s = splhigh();
knote_release(kn);
- splx(s);
+ mtx_leave(&kq->kq_lock);
done:
if (fp != NULL)
FRELE(fp, p);
@@ -1156,14 +1169,15 @@ kqueue_sleep(struct kqueue *kq, struct t
uint64_t nsecs;
int error;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (tsp != NULL) {
getnanouptime(&start);
nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
} else
nsecs = INFSLP;
- error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
+ error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
+ "kqread", nsecs);
if (tsp != NULL) {
getnanouptime(&stop);
timespecsub(&stop, &start, &elapsed);
@@ -1186,7 +1200,7 @@ kqueue_scan(struct kqueue_scan_state *sc
{
struct kqueue *kq = scan->kqs_kq;
struct knote *kn;
- int s, error = 0, nkev = 0;
+ int error = 0, nkev = 0;
if (maxevents == 0)
goto done;
@@ -1195,12 +1209,18 @@ retry:
error = 0;
+ /* msleep() with PCATCH requires kernel lock. */
+ KERNEL_LOCK();
+
+ mtx_enter(&kq->kq_lock);
+
if (kq->kq_state & KQ_DYING) {
+ mtx_leave(&kq->kq_lock);
+ KERNEL_UNLOCK();
error = EBADF;
goto done;
}
- s = splhigh();
if (kq->kq_count == 0) {
/*
* Successive loops are only necessary if there are more
@@ -1208,13 +1228,15 @@ retry:
*/
if ((tsp != NULL && !timespecisset(tsp)) ||
scan->kqs_nevent != 0) {
- splx(s);
+ mtx_leave(&kq->kq_lock);
+ KERNEL_UNLOCK();
error = 0;
goto done;
}
kq->kq_state |= KQ_SLEEP;
error = kqueue_sleep(kq, tsp);
- splx(s);
+ /* kqueue_sleep() has released kq_lock. */
+ KERNEL_UNLOCK();
if (error == 0 || error == EWOULDBLOCK)
goto retry;
/* don't restart after signals... */
@@ -1223,6 +1245,9 @@ retry:
goto done;
}
+ /* The actual scan does not sleep on kq, so unlock the kernel. */
+ KERNEL_UNLOCK();
+
/*
* Put the end marker in the queue to limit the scan to the events
* that are currently active. This prevents events from being
@@ -1254,8 +1279,11 @@ retry:
continue;
}
- if (!knote_acquire(kn, NULL, 0))
+ if (!knote_acquire(kn, NULL, 0)) {
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
+ }
kqueue_check(kq);
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
@@ -1268,11 +1296,11 @@ retry:
continue;
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
memset(kevp, 0, sizeof(*kevp));
if (filter_process(kn, kevp) == 0) {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if ((kn->kn_status & KN_QUEUED) == 0)
kn->kn_status &= ~KN_ACTIVE;
knote_release(kn);
@@ -1286,9 +1314,9 @@ retry:
if (kevp->flags & EV_ONESHOT) {
filter_detach(kn);
knote_drop(kn, p);
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if (kevp->flags & EV_DISPATCH)
kn->kn_status |= KN_DISABLED;
if ((kn->kn_status & KN_QUEUED) == 0)
@@ -1296,7 +1324,7 @@ retry:
KASSERT(kn->kn_status & KN_ATTACHED);
knote_release(kn);
} else {
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
if ((kn->kn_status & KN_QUEUED) == 0) {
kqueue_check(kq);
kq->kq_count++;
@@ -1313,7 +1341,7 @@ retry:
scan->kqs_nevent++;
}
TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_lock);
if (scan->kqs_nevent == 0)
goto retry;
done:
@@ -1338,7 +1366,6 @@ void
kqueue_scan_finish(struct kqueue_scan_state *scan)
{
struct kqueue *kq = scan->kqs_kq;
- int s;
KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
@@ -1347,9 +1374,9 @@ kqueue_scan_finish(struct kqueue_scan_st
if (scan->kqs_queued) {
scan->kqs_queued = 0;
- s = splhigh();
+ mtx_enter(&kq->kq_lock);
TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
- splx(s);
+ mtx_leave(&kq->kq_lock);
}
KQRELE(kq);
}
@@ -1381,17 +1408,17 @@ kqueue_poll(struct file *fp, int events,
{
struct kqueue *kq = (struct kqueue *)fp->f_data;
int revents = 0;
- int s = splhigh();
if (events & (POLLIN | POLLRDNORM)) {
+ mtx_enter(&kq->kq_lock);
if (kq->kq_count) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(p, &kq->kq_sel);
kq->kq_state |= KQ_SEL;
}
+ mtx_leave(&kq->kq_lock);
}
- splx(s);
return (revents);
}
@@ -1401,7 +1428,7 @@ kqueue_stat(struct file *fp, struct stat
struct kqueue *kq = fp->f_data;
memset(st, 0, sizeof(*st));
- st->st_size = kq->kq_count;
+ st->st_size = kq->kq_count; /* unlocked read */
st->st_blksize = sizeof(struct kevent);
st->st_mode = S_IFIFO;
return (0);
@@ -1412,14 +1439,14 @@ kqueue_purge(struct proc *p, struct kque
{
int i;
- KERNEL_ASSERT_LOCKED();
-
+ mtx_enter(&kq->kq_lock);
for (i = 0; i < kq->kq_knlistsize; i++)
- knote_remove(p, &kq->kq_knlist[i], 1);
+ knote_remove(p, kq, &kq->kq_knlist[i], 1);
if (kq->kq_knhashmask != 0) {
for (i = 0; i < kq->kq_knhashmask + 1; i++)
- knote_remove(p, &kq->kq_knhash[i], 1);
+ knote_remove(p, kq, &kq->kq_knhash[i], 1);
}
+ mtx_leave(&kq->kq_lock);
}
void
@@ -1427,6 +1454,8 @@ kqueue_terminate(struct proc *p, struct
{
struct knote *kn;
+ mtx_enter(&kq->kq_lock);
+
/*
* Any remaining entries should be scan markers.
* They are removed when the ongoing scans finish.
@@ -1437,6 +1466,7 @@ kqueue_terminate(struct proc *p, struct
kq->kq_state |= KQ_DYING;
kqueue_wakeup(kq);
+ mtx_leave(&kq->kq_lock);
KASSERT(klist_empty(&kq->kq_sel.si_note));
task_del(systq, &kq->kq_task);
@@ -1448,15 +1478,13 @@ kqueue_close(struct file *fp, struct pro
{
struct kqueue *kq = fp->f_data;
- KERNEL_LOCK();
+ fp->f_data = NULL;
+
kqueue_purge(p, kq);
kqueue_terminate(p, kq);
- fp->f_data = NULL;
KQRELE(kq);
- KERNEL_UNLOCK();
-
return (0);
}
@@ -1465,10 +1493,16 @@ kqueue_task(void *arg)
{
struct kqueue *kq = arg;
+ /* Kernel lock is needed inside selwakeup(). */
+ KERNEL_ASSERT_LOCKED();
+
+ mtx_enter(&kq->kq_lock);
if (kq->kq_state & KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
+ mtx_leave(&kq->kq_lock);
selwakeup(&kq->kq_sel);
} else {
+ mtx_leave(&kq->kq_lock);
KNOTE(&kq->kq_sel.si_note, 0);
}
KQRELE(kq);
@@ -1477,6 +1511,7 @@ kqueue_task(void *arg)
void
kqueue_wakeup(struct kqueue *kq)
{
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (kq->kq_state & KQ_SLEEP) {
kq->kq_state &= ~KQ_SLEEP;
@@ -1496,14 +1531,20 @@ kqueue_expand_hash(struct kqueue *kq)
struct knlist *hash;
u_long hashmask;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
if (kq->kq_knhashmask == 0) {
+ mtx_leave(&kq->kq_lock);
hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
+ mtx_enter(&kq->kq_lock);
if (kq->kq_knhashmask == 0) {
kq->kq_knhash = hash;
kq->kq_knhashmask = hashmask;
} else {
/* Another thread has allocated the hash. */
+ mtx_leave(&kq->kq_lock);
hashfree(hash, KN_HASHSIZE, M_KEVENT);
+ mtx_enter(&kq->kq_lock);
}
}
}
@@ -1511,26 +1552,35 @@ kqueue_expand_hash(struct kqueue *kq)
static void
kqueue_expand_list(struct kqueue *kq, int fd)
{
- struct knlist *list;
- int size;
+ struct knlist *list, *olist;
+ int size, osize;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
if (kq->kq_knlistsize <= fd) {
size = kq->kq_knlistsize;
+ mtx_leave(&kq->kq_lock);
while (size <= fd)
size += KQEXTENT;
list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
+ mtx_enter(&kq->kq_lock);
if (kq->kq_knlistsize <= fd) {
memcpy(list, kq->kq_knlist,
kq->kq_knlistsize * sizeof(*list));
memset(&list[kq->kq_knlistsize], 0,
(size - kq->kq_knlistsize) * sizeof(*list));
- free(kq->kq_knlist, M_KEVENT,
- kq->kq_knlistsize * sizeof(*list));
+ olist = kq->kq_knlist;
+ osize = kq->kq_knlistsize;
kq->kq_knlist = list;
kq->kq_knlistsize = size;
+ mtx_leave(&kq->kq_lock);
+ free(olist, M_KEVENT, osize * sizeof(*list));
+ mtx_enter(&kq->kq_lock);
} else {
/* Another thread has expanded the list. */
+ mtx_leave(&kq->kq_lock);
free(list, M_KEVENT, size * sizeof(*list));
+ mtx_enter(&kq->kq_lock);
}
}
}
@@ -1548,14 +1598,22 @@ kqueue_expand_list(struct kqueue *kq, in
int
knote_acquire(struct knote *kn, struct klist *klist, int ls)
{
- splassert(IPL_HIGH);
+ struct kqueue *kq = kn->kn_kq;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
if (kn->kn_status & KN_PROCESSING) {
kn->kn_status |= KN_WAITING;
- if (klist != NULL)
+ if (klist != NULL) {
+ mtx_leave(&kq->kq_lock);
klist_unlock(klist, ls);
- tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+ /* XXX Timeout resolves potential loss of wakeup. */
+ tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+ } else {
+ msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
+ SEC_TO_NSEC(1));
+ }
/* knote may be stale now */
return (0);
}
@@ -1569,7 +1627,7 @@ knote_acquire(struct knote *kn, struct k
void
knote_release(struct knote *kn)
{
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_PROCESSING);
@@ -1587,13 +1645,11 @@ knote_release(struct knote *kn)
void
knote_activate(struct knote *kn)
{
- int s;
+ MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
- s = splhigh();
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
knote_enqueue(kn);
- splx(s);
}
/*
@@ -1603,30 +1659,38 @@ void
knote(struct klist *list, long hint)
{
struct knote *kn, *kn0;
+ struct kqueue *kq;
KLIST_ASSERT_LOCKED(list);
- SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
- if (filter_event(kn, hint))
+ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) {
+ if (filter_event(kn, hint)) {
+ kq = kn->kn_kq;
+ mtx_enter(&kq->kq_lock);
knote_activate(kn);
+ mtx_leave(&kq->kq_lock);
+ }
+ }
}
/*
* remove all knotes from a specified knlist
*/
void
-knote_remove(struct proc *p, struct knlist *list, int purge)
+knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list, int purge)
{
struct knote *kn;
- int s;
+
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
while ((kn = SLIST_FIRST(list)) != NULL) {
- s = splhigh();
+ KASSERT(kn->kn_kq == kq);
if (!knote_acquire(kn, NULL, 0)) {
- splx(s);
+ /* knote_acquire() has released kq_lock. */
+ mtx_enter(&kq->kq_lock);
continue;
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
filter_detach(kn);
/*
@@ -1641,20 +1705,22 @@ knote_remove(struct proc *p, struct knli
*/
if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
+ mtx_enter(&kq->kq_lock);
knote_detach(kn);
+ mtx_leave(&kq->kq_lock);
FRELE(kn->kn_fp, p);
kn->kn_fp = NULL;
kn->kn_fop = &badfd_filtops;
filter_event(kn, 0);
+ mtx_enter(&kq->kq_lock);
knote_activate(kn);
- s = splhigh();
knote_release(kn);
- splx(s);
continue;
}
knote_drop(kn, p);
+ mtx_enter(&kq->kq_lock);
}
}
@@ -1666,7 +1732,6 @@ knote_fdclose(struct proc *p, int fd)
{
struct filedesc *fdp = p->p_p->ps_fd;
struct kqueue *kq;
- struct knlist *list;
/*
* fdplock can be ignored if the file descriptor table is being freed
@@ -1675,18 +1740,12 @@ knote_fdclose(struct proc *p, int fd)
if (fdp->fd_refcnt != 0)
fdpassertlocked(fdp);
- if (LIST_EMPTY(&fdp->fd_kqlist))
- return;
-
- KERNEL_LOCK();
LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
- if (fd >= kq->kq_knlistsize)
- continue;
-
- list = &kq->kq_knlist[fd];
- knote_remove(p, list, 0);
+ mtx_enter(&kq->kq_lock);
+ if (fd < kq->kq_knlistsize)
+ knote_remove(p, kq, &kq->kq_knlist[fd], 0);
+ mtx_leave(&kq->kq_lock);
}
- KERNEL_UNLOCK();
}
/*
@@ -1698,6 +1757,7 @@ knote_processexit(struct proc *p)
{
struct process *pr = p->p_p;
+ KERNEL_ASSERT_LOCKED();
KASSERT(p == curproc);
KNOTE(&pr->ps_klist, NOTE_EXIT);
@@ -1711,15 +1771,12 @@ knote_attach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct knlist *list;
- int s;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
KASSERT((kn->kn_status & KN_ATTACHED) == 0);
- s = splhigh();
kn->kn_status |= KN_ATTACHED;
- splx(s);
-
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
KASSERT(kq->kq_knlistsize > kn->kn_id);
list = &kq->kq_knlist[kn->kn_id];
@@ -1735,8 +1792,8 @@ knote_detach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct knlist *list;
- int s;
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_status & KN_PROCESSING);
if ((kn->kn_status & KN_ATTACHED) == 0)
@@ -1747,10 +1804,7 @@ knote_detach(struct knote *kn)
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
-
- s = splhigh();
kn->kn_status &= ~KN_ATTACHED;
- splx(s);
}
/*
@@ -1760,20 +1814,20 @@ knote_detach(struct knote *kn)
void
knote_drop(struct knote *kn, struct proc *p)
{
- int s;
+ struct kqueue *kq = kn->kn_kq;
KASSERT(kn->kn_filter != EVFILT_MARKER);
+ mtx_enter(&kq->kq_lock);
knote_detach(kn);
-
- s = splhigh();
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
if (kn->kn_status & KN_WAITING) {
kn->kn_status &= ~KN_WAITING;
wakeup(kn);
}
- splx(s);
+ mtx_leave(&kq->kq_lock);
+
if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
FRELE(kn->kn_fp, p);
pool_put(&knote_pool, kn);
@@ -1785,7 +1839,7 @@ knote_enqueue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT((kn->kn_status & KN_QUEUED) == 0);
@@ -1802,7 +1856,7 @@ knote_dequeue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- splassert(IPL_HIGH);
+ MUTEX_ASSERT_LOCKED(&kq->kq_lock);
KASSERT(kn->kn_filter != EVFILT_MARKER);
KASSERT(kn->kn_status & KN_QUEUED);
@@ -1910,36 +1964,38 @@ void
klist_invalidate(struct klist *list)
{
struct knote *kn;
+ struct kqueue *kq;
struct proc *p = curproc;
- int ls, s;
+ int ls;
NET_ASSERT_UNLOCKED();
- s = splhigh();
ls = klist_lock(list);
while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
+ kq = kn->kn_kq;
+ mtx_enter(&kq->kq_lock);
if (!knote_acquire(kn, list, ls)) {
- /* knote_acquire() has unlocked list. */
+ /* knote_acquire() has released kq_lock
+ * and klist lock. */
ls = klist_lock(list);
continue;
}
+ mtx_leave(&kq->kq_lock);
klist_unlock(list, ls);
- splx(s);
filter_detach(kn);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
kn->kn_fop = &dead_filtops;
filter_event(kn, 0);
+ mtx_enter(&kq->kq_lock);
knote_activate(kn);
- s = splhigh();
knote_release(kn);
+ mtx_leave(&kq->kq_lock);
} else {
knote_drop(kn, p);
- s = splhigh();
}
ls = klist_lock(list);
}
klist_unlock(list, ls);
- splx(s);
}
static int
Index: sys/eventvar.h
===================================================================
RCS file: src/sys/sys/eventvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 eventvar.h
--- sys/eventvar.h 17 Jan 2021 05:56:32 -0000 1.11
+++ sys/eventvar.h 3 Jun 2021 14:06:54 -0000
@@ -31,6 +31,7 @@
#ifndef _SYS_EVENTVAR_H_
#define _SYS_EVENTVAR_H_
+#include <sys/mutex.h>
#include <sys/task.h>
#define KQ_NEVENTS 8 /* minimize copy{in,out} calls */
@@ -38,24 +39,29 @@
/*
* Locking:
+ * I immutable after creation
* a atomic operations
+ * q kq_lock
*/
struct kqueue {
- TAILQ_HEAD(, knote) kq_head; /* list of pending event */
- int kq_count; /* number of pending events */
- u_int kq_refs; /* [a] number of references */
+ struct mutex kq_lock; /* lock for queue access */
+ TAILQ_HEAD(, knote) kq_head; /* [q] list of pending event */
+ int kq_count; /* [q] # of pending events */
+ u_int kq_refs; /* [a] # of references */
struct selinfo kq_sel;
- struct filedesc *kq_fdp;
+ struct filedesc *kq_fdp; /* [I] fd table of this kq */
LIST_ENTRY(kqueue) kq_next;
- int kq_knlistsize; /* size of kq_knlist */
- struct knlist *kq_knlist; /* list of attached knotes */
- u_long kq_knhashmask; /* size of kq_knhash */
- struct knlist *kq_knhash; /* hash table for attached knotes */
+ int kq_knlistsize; /* [q] size of kq_knlist */
+ struct knlist *kq_knlist; /* [q] list of
+ * attached knotes */
+ u_long kq_knhashmask; /* [q] size of kq_knhash */
+ struct knlist *kq_knhash; /* [q] hash table for
+ * attached knotes */
struct task kq_task; /* deferring of activation */
- int kq_state;
+ int kq_state; /* [q] */
#define KQ_SEL 0x01
#define KQ_SLEEP 0x02
#define KQ_DYING 0x04