This patch extends struct klist with a callback descriptor and
an argument. The main purpose of this is to let the kqueue subsystem
assert that a klist is locked when it should be, and to operate the
klist lock in klist_invalidate().
Access to a knote list of a kqueue-monitored object has to be
serialized somehow. Because the object often has a lock for protecting
its state, and because the object often acquires this lock at the latest
in its f_event callback functions, I would like to use the same lock
also for the knote lists. Existing uses of NOTE_SUBMIT already show
this pattern emerging.
There could be an embedded lock in klist. However, such a lock would be
redundant in many cases. The code could not rely on a single lock type
(mutex, rwlock, something else) because the needs of monitored objects
vary. In addition, an embedded lock would introduce new lock order
constraints. Note that this patch does not rule out use of dedicated
klist locks.
The patch introduces a way to associate lock operations with a klist.
The caller can provide a custom implementation, or use a ready-made
interface with a mutex or rwlock.
For compatibility with old code, the new code falls back to using the
kernel lock if no specific klist initialization has been done. The
existing code already relies on implicit initialization of klist.
Unfortunately, the size of struct klist will grow threefold.
As the patch gives the code the ability to operate the klist lock,
the klist API could provide variants of the insert and remove actions
that handle locking internally, for convenience. However, I would leave
that for another patch, because I would first prefer to rename the
current klist_insert() to klist_insert_locked(), and klist_remove() to
klist_remove_locked().
The patch additionally provides three examples of usage: audio, pipes,
and sockets. Each of these examples is logically a separate changeset.
Please test and review.
Index: dev/audio.c
===================================================================
RCS file: src/sys/dev/audio.c,v
retrieving revision 1.191
diff -u -p -r1.191 audio.c
--- dev/audio.c 19 May 2020 06:32:24 -0000 1.191
+++ dev/audio.c 11 Dec 2020 17:05:09 -0000
@@ -305,11 +305,12 @@ audio_buf_wakeup(void *addr)
int
audio_buf_init(struct audio_softc *sc, struct audio_buf *buf, int dir)
{
+ klist_init_mutex(&buf->sel.si_note, &audio_lock);
buf->softintr = softintr_establish(IPL_SOFTAUDIO,
audio_buf_wakeup, buf);
if (buf->softintr == NULL) {
printf("%s: can't establish softintr\n", DEVNAME(sc));
- return ENOMEM;
+ goto bad;
}
if (sc->ops->round_buffersize) {
buf->datalen = sc->ops->round_buffersize(sc->arg,
@@ -323,9 +324,12 @@ audio_buf_init(struct audio_softc *sc, s
buf->data = malloc(buf->datalen, M_DEVBUF, M_WAITOK);
if (buf->data == NULL) {
softintr_disestablish(buf->softintr);
- return ENOMEM;
+ goto bad;
}
return 0;
+bad:
+ klist_free(&buf->sel.si_note);
+ return ENOMEM;
}
void
@@ -336,6 +340,7 @@ audio_buf_done(struct audio_softc *sc, s
else
free(buf->data, M_DEVBUF, buf->datalen);
softintr_disestablish(buf->softintr);
+ klist_free(&buf->sel.si_note);
}
/*
@@ -1256,6 +1261,7 @@ audio_attach(struct device *parent, stru
return;
}
+ klist_init_mutex(&sc->mix_sel.si_note, &audio_lock);
sc->mix_softintr = softintr_establish(IPL_SOFTAUDIO,
audio_mixer_wakeup, sc);
if (sc->mix_softintr == NULL) {
@@ -1451,6 +1457,7 @@ audio_detach(struct device *self, int fl
/* free resources */
softintr_disestablish(sc->mix_softintr);
+ klist_free(&sc->mix_sel.si_note);
free(sc->mix_evbuf, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ev));
free(sc->mix_ents, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ctrl));
audio_buf_done(sc, &sc->play);
Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.147
diff -u -p -r1.147 kern_event.c
--- kern/kern_event.c 9 Dec 2020 18:58:19 -0000 1.147
+++ kern/kern_event.c 11 Dec 2020 17:05:09 -0000
@@ -57,6 +57,17 @@
#include <sys/timeout.h>
#include <sys/wait.h>
+#ifdef DIAGNOSTIC
+#define KLIST_ASSERT_LOCKED(kl) do { \
+ if ((kl)->kl_ops != NULL) \
+ (kl)->kl_ops->klo_assertlk((kl)->kl_arg); \
+ else \
+ KERNEL_ASSERT_LOCKED(); \
+} while (0)
+#else
+#define KLIST_ASSERT_LOCKED(kl) ((void)(kl))
+#endif
+
struct kqueue *kqueue_alloc(struct filedesc *);
void kqueue_terminate(struct proc *p, struct kqueue *);
void kqueue_free(struct kqueue *);
@@ -79,6 +90,8 @@ void kqueue_wakeup(struct kqueue *kq);
static void kqueue_expand_hash(struct kqueue *kq);
static void kqueue_expand_list(struct kqueue *kq, int fd);
static void kqueue_task(void *);
+static int klist_lock(struct klist *);
+static void klist_unlock(struct klist *, int);
const struct fileops kqueueops = {
.fo_read = kqueue_read,
@@ -94,7 +107,7 @@ void knote_attach(struct knote *kn);
void knote_drop(struct knote *kn, struct proc *p);
void knote_enqueue(struct knote *kn);
void knote_dequeue(struct knote *kn);
-int knote_acquire(struct knote *kn);
+int knote_acquire(struct knote *kn, struct klist *, int);
void knote_release(struct knote *kn);
void filt_kqdetach(struct knote *kn);
@@ -792,7 +805,7 @@ again:
if (kev->filter == kn->kn_filter &&
kev->ident == kn->kn_id) {
s = splhigh();
- if (!knote_acquire(kn)) {
+ if (!knote_acquire(kn, NULL, 0)) {
splx(s);
if (fp != NULL) {
FRELE(fp, p);
@@ -1018,7 +1031,7 @@ retry:
continue;
}
- if (!knote_acquire(kn))
+ if (!knote_acquire(kn, NULL, 0))
continue;
kqueue_check(kq);
@@ -1293,15 +1306,20 @@ kqueue_expand_list(struct kqueue *kq, in
* If we cannot acquire the knote we sleep and return 0. The knote
* may be stale on return in this case and the caller must restart
* whatever loop they are in.
+ *
+ * If we are about to sleep and klist is non-NULL, the list is unlocked
+ * before sleep and remains unlocked on return.
*/
int
-knote_acquire(struct knote *kn)
+knote_acquire(struct knote *kn, struct klist *klist, int ls)
{
splassert(IPL_HIGH);
KASSERT(kn->kn_filter != EVFILT_MARKER);
if (kn->kn_status & KN_PROCESSING) {
kn->kn_status |= KN_WAITING;
+ if (klist != NULL)
+ klist_unlock(klist, ls);
tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
/* knote may be stale now */
return (0);
@@ -1351,6 +1369,8 @@ knote(struct klist *list, long hint)
{
struct knote *kn, *kn0;
+ KLIST_ASSERT_LOCKED(list);
+
SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
if (kn->kn_fop->f_event(kn, hint))
knote_activate(kn);
@@ -1367,7 +1387,7 @@ knote_remove(struct proc *p, struct knli
while ((kn = SLIST_FIRST(list)) != NULL) {
s = splhigh();
- if (!knote_acquire(kn)) {
+ if (!knote_acquire(kn, NULL, 0)) {
splx(s);
continue;
}
@@ -1508,14 +1528,31 @@ knote_dequeue(struct knote *kn)
}
void
+klist_init(struct klist *klist, const struct klistops *ops, void *arg)
+{
+ klist->kl_ops = ops;
+ klist->kl_arg = arg;
+}
+
+void
+klist_free(struct klist *klist)
+{
+ KASSERT(SLIST_EMPTY(&klist->kl_list));
+}
+
+void
klist_insert(struct klist *klist, struct knote *kn)
{
+ KLIST_ASSERT_LOCKED(klist);
+
SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
}
void
klist_remove(struct klist *klist, struct knote *kn)
{
+ KLIST_ASSERT_LOCKED(klist);
+
SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
}
@@ -1530,7 +1567,7 @@ klist_invalidate(struct klist *list)
{
struct knote *kn;
struct proc *p = curproc;
- int s;
+ int ls, s;
/*
* NET_LOCK() must not be held because it can block another thread
@@ -1539,9 +1576,14 @@ klist_invalidate(struct klist *list)
NET_ASSERT_UNLOCKED();
s = splhigh();
+ ls = klist_lock(list);
while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
- if (!knote_acquire(kn))
+ if (!knote_acquire(kn, list, ls)) {
+ /* knote_acquire() has unlocked list. */
+ ls = klist_lock(list);
continue;
+ }
+ klist_unlock(list, ls);
splx(s);
kn->kn_fop->f_detach(kn);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
@@ -1553,6 +1595,111 @@ klist_invalidate(struct klist *list)
knote_drop(kn, p);
s = splhigh();
}
+ ls = klist_lock(list);
}
+ klist_unlock(list, ls);
splx(s);
}
+
+static int
+klist_lock(struct klist *list)
+{
+ int ls = 0;
+
+ if (list->kl_ops != NULL) {
+ ls = list->kl_ops->klo_lock(list->kl_arg);
+ } else {
+ ls = splhigh();
+ KERNEL_LOCK();
+ }
+ return ls;
+}
+
+static void
+klist_unlock(struct klist *list, int ls)
+{
+ if (list->kl_ops != NULL) {
+ list->kl_ops->klo_unlock(list->kl_arg, ls);
+ } else {
+ KERNEL_UNLOCK();
+ splx(ls);
+ }
+}
+
+static void
+klist_mutex_assertlk(void *arg)
+{
+ struct mutex *mtx = arg;
+
+ (void)mtx;
+
+ MUTEX_ASSERT_LOCKED(mtx);
+}
+
+static int
+klist_mutex_lock(void *arg)
+{
+ struct mutex *mtx = arg;
+
+ mtx_enter(mtx);
+ return 0;
+}
+
+static void
+klist_mutex_unlock(void *arg, int s)
+{
+ struct mutex *mtx = arg;
+
+ mtx_leave(mtx);
+}
+
+static const struct klistops mutex_klistops = {
+ .klo_assertlk = klist_mutex_assertlk,
+ .klo_lock = klist_mutex_lock,
+ .klo_unlock = klist_mutex_unlock,
+};
+
+void
+klist_init_mutex(struct klist *klist, struct mutex *mtx)
+{
+ klist_init(klist, &mutex_klistops, mtx);
+}
+
+static void
+klist_rwlock_assertlk(void *arg)
+{
+ struct rwlock *rwl = arg;
+
+ (void)rwl;
+
+ rw_assert_wrlock(rwl);
+}
+
+static int
+klist_rwlock_lock(void *arg)
+{
+ struct rwlock *rwl = arg;
+
+ rw_enter_write(rwl);
+ return 0;
+}
+
+static void
+klist_rwlock_unlock(void *arg, int s)
+{
+ struct rwlock *rwl = arg;
+
+ rw_exit_write(rwl);
+}
+
+static const struct klistops rwlock_klistops = {
+ .klo_assertlk = klist_rwlock_assertlk,
+ .klo_lock = klist_rwlock_lock,
+ .klo_unlock = klist_rwlock_unlock,
+};
+
+void
+klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
+{
+ klist_init(klist, &rwlock_klistops, rwl);
+}
Index: kern/sys_pipe.c
===================================================================
RCS file: src/sys/kern/sys_pipe.c,v
retrieving revision 1.124
diff -u -p -r1.124 sys_pipe.c
--- kern/sys_pipe.c 11 Dec 2020 14:17:35 -0000 1.124
+++ kern/sys_pipe.c 11 Dec 2020 17:05:09 -0000
@@ -126,6 +126,7 @@ void pipe_iounlock(struct pipe *);
int pipe_iosleep(struct pipe *, const char *);
struct pipe_pair *pipe_pair_create(void);
+void pipe_pair_destroy(struct pipe_pair *);
/*
* The pipe system call for the DTYPE_PIPE type of pipes
@@ -853,7 +854,7 @@ pipe_destroy(struct pipe *cpipe)
rw_exit_write(cpipe->pipe_lock);
if (ppipe == NULL)
- pool_put(&pipe_pair_pool, cpipe->pipe_pair);
+ pipe_pair_destroy(cpipe->pipe_pair);
}
/*
@@ -997,6 +998,9 @@ pipe_pair_create(void)
pp->pp_wpipe.pipe_lock = &pp->pp_lock;
pp->pp_rpipe.pipe_lock = &pp->pp_lock;
+ klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
+ klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);
+
if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
goto err;
return (pp);
@@ -1005,3 +1009,11 @@ err:
pipe_destroy(&pp->pp_rpipe);
return (NULL);
}
+
+void
+pipe_pair_destroy(struct pipe_pair *pp)
+{
+ klist_free(&pp->pp_wpipe.pipe_sel.si_note);
+ klist_free(&pp->pp_rpipe.pipe_sel.si_note);
+ pool_put(&pipe_pair_pool, pp);
+}
Index: kern/uipc_socket.c
===================================================================
RCS file: src/sys/kern/uipc_socket.c,v
retrieving revision 1.250
diff -u -p -r1.250 uipc_socket.c
--- kern/uipc_socket.c 17 Nov 2020 14:45:42 -0000 1.250
+++ kern/uipc_socket.c 11 Dec 2020 17:05:09 -0000
@@ -164,6 +164,8 @@ socreate(int dom, struct socket **aso, i
so->so_proto = prp;
so->so_snd.sb_timeo_nsecs = INFSLP;
so->so_rcv.sb_timeo_nsecs = INFSLP;
+ klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
+ klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
s = solock(so);
error = (*prp->pr_attach)(so, proto);
@@ -239,6 +241,8 @@ sofree(struct socket *so, int s)
}
}
sigio_free(&so->so_sigio);
+ klist_free(&so->so_rcv.sb_sel.si_note);
+ klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
if (so->so_sp) {
if (issplicedback(so)) {
@@ -2017,6 +2021,7 @@ soo_kqfilter(struct file *fp, struct kno
{
struct socket *so = kn->kn_fp->f_data;
struct sockbuf *sb;
+ int s;
KERNEL_ASSERT_LOCKED();
@@ -2040,8 +2045,10 @@ soo_kqfilter(struct file *fp, struct kno
return (EINVAL);
}
+ s = solock(so);
klist_insert(&sb->sb_sel.si_note, kn);
sb->sb_flagsintr |= SB_KNOTE;
+ sounlock(so, s);
return (0);
}
@@ -2050,12 +2057,15 @@ void
filt_sordetach(struct knote *kn)
{
struct socket *so = kn->kn_fp->f_data;
+ int s;
KERNEL_ASSERT_LOCKED();
+ s = solock(so);
klist_remove(&so->so_rcv.sb_sel.si_note, kn);
if (klist_empty(&so->so_rcv.sb_sel.si_note))
so->so_rcv.sb_flagsintr &= ~SB_KNOTE;
+ sounlock(so, s);
}
int
@@ -2103,12 +2113,15 @@ void
filt_sowdetach(struct knote *kn)
{
struct socket *so = kn->kn_fp->f_data;
+ int s;
KERNEL_ASSERT_LOCKED();
+ s = solock(so);
klist_remove(&so->so_snd.sb_sel.si_note, kn);
if (klist_empty(&so->so_snd.sb_sel.si_note))
so->so_snd.sb_flagsintr &= ~SB_KNOTE;
+ sounlock(so, s);
}
int
@@ -2159,6 +2172,36 @@ filt_solisten(struct knote *kn, long hin
return (kn->kn_data != 0);
}
+static void
+klist_soassertlk(void *arg)
+{
+ struct socket *so = arg;
+
+ soassertlocked(so);
+}
+
+static int
+klist_solock(void *arg)
+{
+ struct socket *so = arg;
+
+ return (solock(so));
+}
+
+static void
+klist_sounlock(void *arg, int ls)
+{
+ struct socket *so = arg;
+
+ sounlock(so, ls);
+}
+
+const struct klistops socket_klistops = {
+ .klo_assertlk = klist_soassertlk,
+ .klo_lock = klist_solock,
+ .klo_unlock = klist_sounlock,
+};
+
#ifdef DDB
void
sobuf_print(struct sockbuf *,
Index: kern/uipc_socket2.c
===================================================================
RCS file: src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104
+++ kern/uipc_socket2.c 11 Dec 2020 17:05:09 -0000
@@ -186,6 +186,8 @@ sonewconn(struct socket *head, int conns
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
+ klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
+ klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
sigio_init(&so->so_sigio);
sigio_copy(&so->so_sigio, &head->so_sigio);
@@ -193,6 +195,8 @@ sonewconn(struct socket *head, int conns
if ((*so->so_proto->pr_attach)(so, 0)) {
(void) soqremque(so, soqueue);
sigio_free(&so->so_sigio);
+ klist_free(&so->so_rcv.sb_sel.si_note);
+ klist_free(&so->so_snd.sb_sel.si_note);
pool_put(&socket_pool, so);
return (NULL);
}
Index: sys/event.h
===================================================================
RCS file: src/sys/sys/event.h,v
retrieving revision 1.49
diff -u -p -r1.49 event.h
--- sys/event.h 9 Dec 2020 18:58:19 -0000 1.49
+++ sys/event.h 11 Dec 2020 17:05:09 -0000
@@ -127,11 +127,15 @@ struct kevent {
* programs which pull in <sys/proc.h> or <sys/selinfo.h>.
*/
#include <sys/queue.h>
+
+struct klistops;
struct knote;
SLIST_HEAD(knlist, knote);
struct klist {
struct knlist kl_list;
+ const struct klistops *kl_ops;
+ void *kl_arg;
};
#ifdef _KERNEL
@@ -200,6 +204,12 @@ struct knote {
#define kn_fp kn_ptr.p_fp
};
+struct klistops {
+ void (*klo_assertlk)(void *);
+ int (*klo_lock)(void *);
+ void (*klo_unlock)(void *, int);
+};
+
struct kqueue_scan_state {
struct kqueue *kqs_kq; /* kqueue of this scan */
struct knote kqs_start; /* start marker */
@@ -209,11 +219,14 @@ struct kqueue_scan_state {
* in queue */
};
+struct mutex;
struct proc;
+struct rwlock;
struct timespec;
extern const struct filterops sig_filtops;
extern const struct filterops dead_filtops;
+extern const struct klistops socket_klistops;
extern void kqpoll_init(void);
extern void kqpoll_exit(void);
@@ -231,6 +244,10 @@ extern void kqueue_scan_finish(struct kq
extern void kqueue_purge(struct proc *, struct kqueue *);
extern int filt_seltrue(struct knote *kn, long hint);
extern int seltrue_kqfilter(dev_t, struct knote *);
+extern void klist_init(struct klist *, const struct klistops *, void *);
+extern void klist_init_mutex(struct klist *, struct mutex *);
+extern void klist_init_rwlock(struct klist *, struct rwlock *);
+extern void klist_free(struct klist *);
extern void klist_insert(struct klist *, struct knote *);
extern void klist_remove(struct klist *, struct knote *);
extern int klist_empty(struct klist *);