On Fri, Dec 11, 2020 at 05:37:57PM +0000, Visa Hankala wrote:
> This patch extends struct klist with a callback descriptor and
> an argument. The main purpose of this is to let the kqueue subsystem
> assert when a klist should be locked, and operate the klist lock
> in klist_invalidate().
I've always felt that the klist has an obvious need for
locking, but it's excellent that you've actually gone and done more than
feel things about it.
> Access to a knote list of a kqueue-monitored object has to be
> serialized somehow. Because the object often has a lock for protecting
> its state, and because the object often acquires this lock at the latest
> in its f_event callback functions, I would like to use the same lock
> also for the knote lists. Uses of NOTE_SUBMIT already show a pattern
> arising.
That makes sense. If it helps, even if it just helps justify it,
this is the same approach used in kstat, for much the same reason.
> There could be an embedded lock in klist. However, such a lock would be
> redundant in many cases. The code could not rely on a single lock type
> (mutex, rwlock, something else) because the needs of monitored objects
> vary. In addition, an embedded lock would introduce new lock order
> constraints. Note that this patch does not rule out use of dedicated
> klist locks.
>
> The patch introduces a way to associate lock operations with a klist.
> The caller can provide a custom implementation, or use a ready-made
> interface with a mutex or rwlock.
>
> For compatibility with old code, the new code falls back to using the
> kernel lock if no specific klist initialization has been done. The
> existing code already relies on implicit initialization of klist.
I was going to ask if you could provide a struct klistops around
KERNEL_LOCK as the default, but that would involve a lot more churn to
explicitly init all the klist structs.
> Unfortunately, the size of struct klist will grow threefold.
I think we'll live.
> As the patch gives the code the ability to operate the klist lock,
> the klist API could provide variants of insert and remove actions that
> handle locking internally, for convenience. However, that I would leave
> for another patch because I would prefer to rename the current
> klist_insert() to klist_insert_locked(), and klist_remove() to
> klist_remove_locked().
>
> The patch additionally provides three examples of usage: audio, pipes,
> and sockets. Each of these examples is logically a separate changeset.
>
>
> Please test and review.
I'll try to have a closer look soon.
> Index: dev/audio.c
> ===================================================================
> RCS file: src/sys/dev/audio.c,v
> retrieving revision 1.191
> diff -u -p -r1.191 audio.c
> --- dev/audio.c 19 May 2020 06:32:24 -0000 1.191
> +++ dev/audio.c 11 Dec 2020 17:05:09 -0000
> @@ -305,11 +305,12 @@ audio_buf_wakeup(void *addr)
> int
> audio_buf_init(struct audio_softc *sc, struct audio_buf *buf, int dir)
> {
> + klist_init_mutex(&buf->sel.si_note, &audio_lock);
> buf->softintr = softintr_establish(IPL_SOFTAUDIO,
> audio_buf_wakeup, buf);
> if (buf->softintr == NULL) {
> printf("%s: can't establish softintr\n", DEVNAME(sc));
> - return ENOMEM;
> + goto bad;
> }
> if (sc->ops->round_buffersize) {
> buf->datalen = sc->ops->round_buffersize(sc->arg,
> @@ -323,9 +324,12 @@ audio_buf_init(struct audio_softc *sc, s
> buf->data = malloc(buf->datalen, M_DEVBUF, M_WAITOK);
> if (buf->data == NULL) {
> softintr_disestablish(buf->softintr);
> - return ENOMEM;
> + goto bad;
> }
> return 0;
> +bad:
> + klist_free(&buf->sel.si_note);
> + return ENOMEM;
> }
>
> void
> @@ -336,6 +340,7 @@ audio_buf_done(struct audio_softc *sc, s
> else
> free(buf->data, M_DEVBUF, buf->datalen);
> softintr_disestablish(buf->softintr);
> + klist_free(&buf->sel.si_note);
> }
>
> /*
> @@ -1256,6 +1261,7 @@ audio_attach(struct device *parent, stru
> return;
> }
>
> + klist_init_mutex(&sc->mix_sel.si_note, &audio_lock);
> sc->mix_softintr = softintr_establish(IPL_SOFTAUDIO,
> audio_mixer_wakeup, sc);
> if (sc->mix_softintr == NULL) {
> @@ -1451,6 +1457,7 @@ audio_detach(struct device *self, int fl
>
> /* free resources */
> softintr_disestablish(sc->mix_softintr);
> + klist_free(&sc->mix_sel.si_note);
> free(sc->mix_evbuf, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ev));
> free(sc->mix_ents, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ctrl));
> audio_buf_done(sc, &sc->play);
> Index: kern/kern_event.c
> ===================================================================
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 kern_event.c
> --- kern/kern_event.c 9 Dec 2020 18:58:19 -0000 1.147
> +++ kern/kern_event.c 11 Dec 2020 17:05:09 -0000
> @@ -57,6 +57,17 @@
> #include <sys/timeout.h>
> #include <sys/wait.h>
>
> +#ifdef DIAGNOSTIC
> +#define KLIST_ASSERT_LOCKED(kl) do { \
> + if ((kl)->kl_ops != NULL) \
> + (kl)->kl_ops->klo_assertlk((kl)->kl_arg); \
> + else \
> + KERNEL_ASSERT_LOCKED(); \
> +} while (0)
> +#else
> +#define KLIST_ASSERT_LOCKED(kl) ((void)(kl))
> +#endif
> +
> struct kqueue *kqueue_alloc(struct filedesc *);
> void kqueue_terminate(struct proc *p, struct kqueue *);
> void kqueue_free(struct kqueue *);
> @@ -79,6 +90,8 @@ void kqueue_wakeup(struct kqueue *kq);
> static void kqueue_expand_hash(struct kqueue *kq);
> static void kqueue_expand_list(struct kqueue *kq, int fd);
> static void kqueue_task(void *);
> +static int klist_lock(struct klist *);
> +static void klist_unlock(struct klist *, int);
>
> const struct fileops kqueueops = {
> .fo_read = kqueue_read,
> @@ -94,7 +107,7 @@ void knote_attach(struct knote *kn);
> void knote_drop(struct knote *kn, struct proc *p);
> void knote_enqueue(struct knote *kn);
> void knote_dequeue(struct knote *kn);
> -int knote_acquire(struct knote *kn);
> +int knote_acquire(struct knote *kn, struct klist *, int);
> void knote_release(struct knote *kn);
>
> void filt_kqdetach(struct knote *kn);
> @@ -792,7 +805,7 @@ again:
> if (kev->filter == kn->kn_filter &&
> kev->ident == kn->kn_id) {
> s = splhigh();
> - if (!knote_acquire(kn)) {
> + if (!knote_acquire(kn, NULL, 0)) {
> splx(s);
> if (fp != NULL) {
> FRELE(fp, p);
> @@ -1018,7 +1031,7 @@ retry:
> continue;
> }
>
> - if (!knote_acquire(kn))
> + if (!knote_acquire(kn, NULL, 0))
> continue;
>
> kqueue_check(kq);
> @@ -1293,15 +1306,20 @@ kqueue_expand_list(struct kqueue *kq, in
> * If we cannot acquire the knote we sleep and return 0. The knote
> * may be stale on return in this case and the caller must restart
> * whatever loop they are in.
> + *
> + * If we are about to sleep and klist is non-NULL, the list is unlocked
> + * before sleep and remains unlocked on return.
> */
> int
> -knote_acquire(struct knote *kn)
> +knote_acquire(struct knote *kn, struct klist *klist, int ls)
> {
> splassert(IPL_HIGH);
> KASSERT(kn->kn_filter != EVFILT_MARKER);
>
> if (kn->kn_status & KN_PROCESSING) {
> kn->kn_status |= KN_WAITING;
> + if (klist != NULL)
> + klist_unlock(klist, ls);
> tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
> /* knote may be stale now */
> return (0);
> @@ -1351,6 +1369,8 @@ knote(struct klist *list, long hint)
> {
> struct knote *kn, *kn0;
>
> + KLIST_ASSERT_LOCKED(list);
> +
> SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
> if (kn->kn_fop->f_event(kn, hint))
> knote_activate(kn);
> @@ -1367,7 +1387,7 @@ knote_remove(struct proc *p, struct knli
>
> while ((kn = SLIST_FIRST(list)) != NULL) {
> s = splhigh();
> - if (!knote_acquire(kn)) {
> + if (!knote_acquire(kn, NULL, 0)) {
> splx(s);
> continue;
> }
> @@ -1508,14 +1528,31 @@ knote_dequeue(struct knote *kn)
> }
>
> void
> +klist_init(struct klist *klist, const struct klistops *ops, void *arg)
> +{
> + klist->kl_ops = ops;
> + klist->kl_arg = arg;
> +}
> +
> +void
> +klist_free(struct klist *klist)
> +{
> + KASSERT(SLIST_EMPTY(&klist->kl_list));
> +}
> +
> +void
> klist_insert(struct klist *klist, struct knote *kn)
> {
> + KLIST_ASSERT_LOCKED(klist);
> +
> SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
> }
>
> void
> klist_remove(struct klist *klist, struct knote *kn)
> {
> + KLIST_ASSERT_LOCKED(klist);
> +
> SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
> }
>
> @@ -1530,7 +1567,7 @@ klist_invalidate(struct klist *list)
> {
> struct knote *kn;
> struct proc *p = curproc;
> - int s;
> + int ls, s;
>
> /*
> * NET_LOCK() must not be held because it can block another thread
> @@ -1539,9 +1576,14 @@ klist_invalidate(struct klist *list)
> NET_ASSERT_UNLOCKED();
>
> s = splhigh();
> + ls = klist_lock(list);
> while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
> - if (!knote_acquire(kn))
> + if (!knote_acquire(kn, list, ls)) {
> + /* knote_acquire() has unlocked list. */
> + ls = klist_lock(list);
> continue;
> + }
> + klist_unlock(list, ls);
> splx(s);
> kn->kn_fop->f_detach(kn);
> if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
> @@ -1553,6 +1595,111 @@ klist_invalidate(struct klist *list)
> knote_drop(kn, p);
> s = splhigh();
> }
> + ls = klist_lock(list);
> }
> + klist_unlock(list, ls);
> splx(s);
> }
> +
> +static int
> +klist_lock(struct klist *list)
> +{
> + int ls = 0;
> +
> + if (list->kl_ops != NULL) {
> + ls = list->kl_ops->klo_lock(list->kl_arg);
> + } else {
> + ls = splhigh();
> + KERNEL_LOCK();
> + }
> + return ls;
> +}
> +
> +static void
> +klist_unlock(struct klist *list, int ls)
> +{
> + if (list->kl_ops != NULL) {
> + list->kl_ops->klo_unlock(list->kl_arg, ls);
> + } else {
> + KERNEL_UNLOCK();
> + splx(ls);
> + }
> +}
> +
> +static void
> +klist_mutex_assertlk(void *arg)
> +{
> + struct mutex *mtx = arg;
> +
> + (void)mtx;
> +
> + MUTEX_ASSERT_LOCKED(mtx);
> +}
> +
> +static int
> +klist_mutex_lock(void *arg)
> +{
> + struct mutex *mtx = arg;
> +
> + mtx_enter(mtx);
> + return 0;
> +}
> +
> +static void
> +klist_mutex_unlock(void *arg, int s)
> +{
> + struct mutex *mtx = arg;
> +
> + mtx_leave(mtx);
> +}
> +
> +static const struct klistops mutex_klistops = {
> + .klo_assertlk = klist_mutex_assertlk,
> + .klo_lock = klist_mutex_lock,
> + .klo_unlock = klist_mutex_unlock,
> +};
> +
> +void
> +klist_init_mutex(struct klist *klist, struct mutex *mtx)
> +{
> + klist_init(klist, &mutex_klistops, mtx);
> +}
> +
> +static void
> +klist_rwlock_assertlk(void *arg)
> +{
> + struct rwlock *rwl = arg;
> +
> + (void)rwl;
> +
> + rw_assert_wrlock(rwl);
> +}
> +
> +static int
> +klist_rwlock_lock(void *arg)
> +{
> + struct rwlock *rwl = arg;
> +
> + rw_enter_write(rwl);
> + return 0;
> +}
> +
> +static void
> +klist_rwlock_unlock(void *arg, int s)
> +{
> + struct rwlock *rwl = arg;
> +
> + rw_exit_write(rwl);
> +}
> +
> +static const struct klistops rwlock_klistops = {
> + .klo_assertlk = klist_rwlock_assertlk,
> + .klo_lock = klist_rwlock_lock,
> + .klo_unlock = klist_rwlock_unlock,
> +};
> +
> +void
> +klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
> +{
> + klist_init(klist, &rwlock_klistops, rwl);
> +}
> Index: kern/sys_pipe.c
> ===================================================================
> RCS file: src/sys/kern/sys_pipe.c,v
> retrieving revision 1.124
> diff -u -p -r1.124 sys_pipe.c
> --- kern/sys_pipe.c 11 Dec 2020 14:17:35 -0000 1.124
> +++ kern/sys_pipe.c 11 Dec 2020 17:05:09 -0000
> @@ -126,6 +126,7 @@ void pipe_iounlock(struct pipe *);
> int pipe_iosleep(struct pipe *, const char *);
>
> struct pipe_pair *pipe_pair_create(void);
> +void pipe_pair_destroy(struct pipe_pair *);
>
> /*
> * The pipe system call for the DTYPE_PIPE type of pipes
> @@ -853,7 +854,7 @@ pipe_destroy(struct pipe *cpipe)
> rw_exit_write(cpipe->pipe_lock);
>
> if (ppipe == NULL)
> - pool_put(&pipe_pair_pool, cpipe->pipe_pair);
> + pipe_pair_destroy(cpipe->pipe_pair);
> }
>
> /*
> @@ -997,6 +998,9 @@ pipe_pair_create(void)
> pp->pp_wpipe.pipe_lock = &pp->pp_lock;
> pp->pp_rpipe.pipe_lock = &pp->pp_lock;
>
> + klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
> + klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);
> +
> if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
> goto err;
> return (pp);
> @@ -1005,3 +1009,11 @@ err:
> pipe_destroy(&pp->pp_rpipe);
> return (NULL);
> }
> +
> +void
> +pipe_pair_destroy(struct pipe_pair *pp)
> +{
> + klist_free(&pp->pp_wpipe.pipe_sel.si_note);
> + klist_free(&pp->pp_rpipe.pipe_sel.si_note);
> + pool_put(&pipe_pair_pool, pp);
> +}
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.250
> diff -u -p -r1.250 uipc_socket.c
> --- kern/uipc_socket.c 17 Nov 2020 14:45:42 -0000 1.250
> +++ kern/uipc_socket.c 11 Dec 2020 17:05:09 -0000
> @@ -164,6 +164,8 @@ socreate(int dom, struct socket **aso, i
> so->so_proto = prp;
> so->so_snd.sb_timeo_nsecs = INFSLP;
> so->so_rcv.sb_timeo_nsecs = INFSLP;
> + klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
> + klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
>
> s = solock(so);
> error = (*prp->pr_attach)(so, proto);
> @@ -239,6 +241,8 @@ sofree(struct socket *so, int s)
> }
> }
> sigio_free(&so->so_sigio);
> + klist_free(&so->so_rcv.sb_sel.si_note);
> + klist_free(&so->so_snd.sb_sel.si_note);
> #ifdef SOCKET_SPLICE
> if (so->so_sp) {
> if (issplicedback(so)) {
> @@ -2017,6 +2021,7 @@ soo_kqfilter(struct file *fp, struct kno
> {
> struct socket *so = kn->kn_fp->f_data;
> struct sockbuf *sb;
> + int s;
>
> KERNEL_ASSERT_LOCKED();
>
> @@ -2040,8 +2045,10 @@ soo_kqfilter(struct file *fp, struct kno
> return (EINVAL);
> }
>
> + s = solock(so);
> klist_insert(&sb->sb_sel.si_note, kn);
> sb->sb_flagsintr |= SB_KNOTE;
> + sounlock(so, s);
>
> return (0);
> }
> @@ -2050,12 +2057,15 @@ void
> filt_sordetach(struct knote *kn)
> {
> struct socket *so = kn->kn_fp->f_data;
> + int s;
>
> KERNEL_ASSERT_LOCKED();
>
> + s = solock(so);
> klist_remove(&so->so_rcv.sb_sel.si_note, kn);
> if (klist_empty(&so->so_rcv.sb_sel.si_note))
> so->so_rcv.sb_flagsintr &= ~SB_KNOTE;
> + sounlock(so, s);
> }
>
> int
> @@ -2103,12 +2113,15 @@ void
> filt_sowdetach(struct knote *kn)
> {
> struct socket *so = kn->kn_fp->f_data;
> + int s;
>
> KERNEL_ASSERT_LOCKED();
>
> + s = solock(so);
> klist_remove(&so->so_snd.sb_sel.si_note, kn);
> if (klist_empty(&so->so_snd.sb_sel.si_note))
> so->so_snd.sb_flagsintr &= ~SB_KNOTE;
> + sounlock(so, s);
> }
>
> int
> @@ -2159,6 +2172,36 @@ filt_solisten(struct knote *kn, long hin
> return (kn->kn_data != 0);
> }
>
> +static void
> +klist_soassertlk(void *arg)
> +{
> + struct socket *so = arg;
> +
> + soassertlocked(so);
> +}
> +
> +static int
> +klist_solock(void *arg)
> +{
> + struct socket *so = arg;
> +
> + return (solock(so));
> +}
> +
> +static void
> +klist_sounlock(void *arg, int ls)
> +{
> + struct socket *so = arg;
> +
> + sounlock(so, ls);
> +}
> +
> +const struct klistops socket_klistops = {
> + .klo_assertlk = klist_soassertlk,
> + .klo_lock = klist_solock,
> + .klo_unlock = klist_sounlock,
> +};
> +
> #ifdef DDB
> void
> sobuf_print(struct sockbuf *,
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.104
> diff -u -p -r1.104 uipc_socket2.c
> --- kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104
> +++ kern/uipc_socket2.c 11 Dec 2020 17:05:09 -0000
> @@ -186,6 +186,8 @@ sonewconn(struct socket *head, int conns
> so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
> so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
>
> + klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
> + klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
> sigio_init(&so->so_sigio);
> sigio_copy(&so->so_sigio, &head->so_sigio);
>
> @@ -193,6 +195,8 @@ sonewconn(struct socket *head, int conns
> if ((*so->so_proto->pr_attach)(so, 0)) {
> (void) soqremque(so, soqueue);
> sigio_free(&so->so_sigio);
> + klist_free(&so->so_rcv.sb_sel.si_note);
> + klist_free(&so->so_snd.sb_sel.si_note);
> pool_put(&socket_pool, so);
> return (NULL);
> }
> Index: sys/event.h
> ===================================================================
> RCS file: src/sys/sys/event.h,v
> retrieving revision 1.49
> diff -u -p -r1.49 event.h
> --- sys/event.h 9 Dec 2020 18:58:19 -0000 1.49
> +++ sys/event.h 11 Dec 2020 17:05:09 -0000
> @@ -127,11 +127,15 @@ struct kevent {
> * programs which pull in <sys/proc.h> or <sys/selinfo.h>.
> */
> #include <sys/queue.h>
> +
> +struct klistops;
> struct knote;
> SLIST_HEAD(knlist, knote);
>
> struct klist {
> struct knlist kl_list;
> + const struct klistops *kl_ops;
> + void *kl_arg;
> };
>
> #ifdef _KERNEL
> @@ -200,6 +204,12 @@ struct knote {
> #define kn_fp kn_ptr.p_fp
> };
>
> +struct klistops {
> + void (*klo_assertlk)(void *);
> + int (*klo_lock)(void *);
> + void (*klo_unlock)(void *, int);
> +};
> +
> struct kqueue_scan_state {
> struct kqueue *kqs_kq; /* kqueue of this scan */
> struct knote kqs_start; /* start marker */
> @@ -209,11 +219,14 @@ struct kqueue_scan_state {
> * in queue */
> };
>
> +struct mutex;
> struct proc;
> +struct rwlock;
> struct timespec;
>
> extern const struct filterops sig_filtops;
> extern const struct filterops dead_filtops;
> +extern const struct klistops socket_klistops;
>
> extern void kqpoll_init(void);
> extern void kqpoll_exit(void);
> @@ -231,6 +244,10 @@ extern void kqueue_scan_finish(struct kq
> extern void kqueue_purge(struct proc *, struct kqueue *);
> extern int filt_seltrue(struct knote *kn, long hint);
> extern int seltrue_kqfilter(dev_t, struct knote *);
> +extern void klist_init(struct klist *, const struct klistops *, void *);
> +extern void klist_init_mutex(struct klist *, struct mutex *);
> +extern void klist_init_rwlock(struct klist *, struct rwlock *);
> +extern void klist_free(struct klist *);
> extern void klist_insert(struct klist *, struct knote *);
> extern void klist_remove(struct klist *, struct knote *);
> extern int klist_empty(struct klist *);
>