On Fri, Dec 11, 2020 at 05:37:57PM +0000, Visa Hankala wrote:
> This patch extends struct klist with a callback descriptor and
> an argument. The main purpose of this is to let the kqueue subsystem
> assert when a klist should be locked, and operate the klist lock
> in klist_invalidate().

I've always felt that the klist has had an obvious need for
locking, but it's excellent that you've actually gone and done more than
feel things about it.

> Access to a knote list of a kqueue-monitored object has to be
> serialized somehow. Because the object often has a lock for protecting
> its state, and because the object often acquires this lock at the latest
> in its f_event callback functions, I would like to use the same lock
> also for the knote lists. Uses of NOTE_SUBMIT already show a pattern
> arising.

That makes sense. If it helps, even if it just helps justify it,
this is the same approach used in kstat, for much the same reason.

> There could be an embedded lock in klist. However, such a lock would be
> redundant in many cases. The code could not rely on a single lock type
> (mutex, rwlock, something else) because the needs of monitored objects
> vary. In addition, an embedded lock would introduce new lock order
> constraints. Note that this patch does not rule out use of dedicated
> klist locks.
> 
> The patch introduces a way to associate lock operations with a klist.
> The caller can provide a custom implementation, or use a ready-made
> interface with a mutex or rwlock.
> 
> For compatibility with old code, the new code falls back to using the
> kernel lock if no specific klist initialization has been done. The
> existing code already relies on implicit initialization of klist.

I was going to ask if you could provide a struct klistops around
KERNEL_LOCK as the default, but that would involve a lot more churn to
explicitly init all the klist structs.

> Unfortunately, the size of struct klist will grow threefold.

I think we'll live.

> As the patch gives the code the ability to operate the klist lock,
> the klist API could provide variants of insert and remove actions that
> handle locking internally, for convenience. However, that I would leave
> for another patch because I would prefer to rename the current
> klist_insert() to klist_insert_locked(), and klist_remove() to
> klist_remove_locked().
> 
> The patch additionally provides three examples of usage: audio, pipes,
> and sockets. Each of these examples is logically a separate changeset.
> 
> 
> Please test and review.

I'll try to have a closer look soon.

> Index: dev/audio.c
> ===================================================================
> RCS file: src/sys/dev/audio.c,v
> retrieving revision 1.191
> diff -u -p -r1.191 audio.c
> --- dev/audio.c       19 May 2020 06:32:24 -0000      1.191
> +++ dev/audio.c       11 Dec 2020 17:05:09 -0000
> @@ -305,11 +305,12 @@ audio_buf_wakeup(void *addr)
>  int
>  audio_buf_init(struct audio_softc *sc, struct audio_buf *buf, int dir)
>  {
> +     klist_init_mutex(&buf->sel.si_note, &audio_lock);
>       buf->softintr = softintr_establish(IPL_SOFTAUDIO,
>           audio_buf_wakeup, buf);
>       if (buf->softintr == NULL) {
>               printf("%s: can't establish softintr\n", DEVNAME(sc));
> -             return ENOMEM;
> +             goto bad;
>       }
>       if (sc->ops->round_buffersize) {
>               buf->datalen = sc->ops->round_buffersize(sc->arg,
> @@ -323,9 +324,12 @@ audio_buf_init(struct audio_softc *sc, s
>               buf->data = malloc(buf->datalen, M_DEVBUF, M_WAITOK);
>       if (buf->data == NULL) {
>               softintr_disestablish(buf->softintr);
> -             return ENOMEM;
> +             goto bad;
>       }
>       return 0;
> +bad:
> +     klist_free(&buf->sel.si_note);
> +     return ENOMEM;
>  }
>  
>  void
> @@ -336,6 +340,7 @@ audio_buf_done(struct audio_softc *sc, s
>       else
>               free(buf->data, M_DEVBUF, buf->datalen);
>       softintr_disestablish(buf->softintr);
> +     klist_free(&buf->sel.si_note);
>  }
>  
>  /*
> @@ -1256,6 +1261,7 @@ audio_attach(struct device *parent, stru
>               return;
>       }
>  
> +     klist_init_mutex(&sc->mix_sel.si_note, &audio_lock);
>       sc->mix_softintr = softintr_establish(IPL_SOFTAUDIO,
>           audio_mixer_wakeup, sc);
>       if (sc->mix_softintr == NULL) {
> @@ -1451,6 +1457,7 @@ audio_detach(struct device *self, int fl
>  
>       /* free resources */
>       softintr_disestablish(sc->mix_softintr);
> +     klist_free(&sc->mix_sel.si_note);
>       free(sc->mix_evbuf, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ev));
>       free(sc->mix_ents, M_DEVBUF, sc->mix_nent * sizeof(struct mixer_ctrl));
>       audio_buf_done(sc, &sc->play);
> Index: kern/kern_event.c
> ===================================================================
> RCS file: src/sys/kern/kern_event.c,v
> retrieving revision 1.147
> diff -u -p -r1.147 kern_event.c
> --- kern/kern_event.c 9 Dec 2020 18:58:19 -0000       1.147
> +++ kern/kern_event.c 11 Dec 2020 17:05:09 -0000
> @@ -57,6 +57,17 @@
>  #include <sys/timeout.h>
>  #include <sys/wait.h>
>  
> +#ifdef DIAGNOSTIC
> +#define KLIST_ASSERT_LOCKED(kl) do {                                 \
> +     if ((kl)->kl_ops != NULL)                                       \
> +             (kl)->kl_ops->klo_assertlk((kl)->kl_arg);               \
> +     else                                                            \
> +             KERNEL_ASSERT_LOCKED();                                 \
> +} while (0)
> +#else
> +#define KLIST_ASSERT_LOCKED(kl)      ((void)(kl))
> +#endif
> +
>  struct       kqueue *kqueue_alloc(struct filedesc *);
>  void kqueue_terminate(struct proc *p, struct kqueue *);
>  void kqueue_free(struct kqueue *);
> @@ -79,6 +90,8 @@ void        kqueue_wakeup(struct kqueue *kq);
>  static void  kqueue_expand_hash(struct kqueue *kq);
>  static void  kqueue_expand_list(struct kqueue *kq, int fd);
>  static void  kqueue_task(void *);
> +static int   klist_lock(struct klist *);
> +static void  klist_unlock(struct klist *, int);
>  
>  const struct fileops kqueueops = {
>       .fo_read        = kqueue_read,
> @@ -94,7 +107,7 @@ void       knote_attach(struct knote *kn);
>  void knote_drop(struct knote *kn, struct proc *p);
>  void knote_enqueue(struct knote *kn);
>  void knote_dequeue(struct knote *kn);
> -int  knote_acquire(struct knote *kn);
> +int  knote_acquire(struct knote *kn, struct klist *, int);
>  void knote_release(struct knote *kn);
>  
>  void filt_kqdetach(struct knote *kn);
> @@ -792,7 +805,7 @@ again:
>                       if (kev->filter == kn->kn_filter &&
>                           kev->ident == kn->kn_id) {
>                               s = splhigh();
> -                             if (!knote_acquire(kn)) {
> +                             if (!knote_acquire(kn, NULL, 0)) {
>                                       splx(s);
>                                       if (fp != NULL) {
>                                               FRELE(fp, p);
> @@ -1018,7 +1031,7 @@ retry:
>                       continue;
>               }
>  
> -             if (!knote_acquire(kn))
> +             if (!knote_acquire(kn, NULL, 0))
>                       continue;
>  
>               kqueue_check(kq);
> @@ -1293,15 +1306,20 @@ kqueue_expand_list(struct kqueue *kq, in
>   * If we cannot acquire the knote we sleep and return 0.  The knote
>   * may be stale on return in this case and the caller must restart
>   * whatever loop they are in.
> + *
> + * If we are about to sleep and klist is non-NULL, the list is unlocked
> + * before sleep and remains unlocked on return.
>   */
>  int
> -knote_acquire(struct knote *kn)
> +knote_acquire(struct knote *kn, struct klist *klist, int ls)
>  {
>       splassert(IPL_HIGH);
>       KASSERT(kn->kn_filter != EVFILT_MARKER);
>  
>       if (kn->kn_status & KN_PROCESSING) {
>               kn->kn_status |= KN_WAITING;
> +             if (klist != NULL)
> +                     klist_unlock(klist, ls);
>               tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
>               /* knote may be stale now */
>               return (0);
> @@ -1351,6 +1369,8 @@ knote(struct klist *list, long hint)
>  {
>       struct knote *kn, *kn0;
>  
> +     KLIST_ASSERT_LOCKED(list);
> +
>       SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
>               if (kn->kn_fop->f_event(kn, hint))
>                       knote_activate(kn);
> @@ -1367,7 +1387,7 @@ knote_remove(struct proc *p, struct knli
>  
>       while ((kn = SLIST_FIRST(list)) != NULL) {
>               s = splhigh();
> -             if (!knote_acquire(kn)) {
> +             if (!knote_acquire(kn, NULL, 0)) {
>                       splx(s);
>                       continue;
>               }
> @@ -1508,14 +1528,31 @@ knote_dequeue(struct knote *kn)
>  }
>  
>  void
> +klist_init(struct klist *klist, const struct klistops *ops, void *arg)
> +{
> +     klist->kl_ops = ops;
> +     klist->kl_arg = arg;
> +}
> +
> +void
> +klist_free(struct klist *klist)
> +{
> +     KASSERT(SLIST_EMPTY(&klist->kl_list));
> +}
> +
> +void
>  klist_insert(struct klist *klist, struct knote *kn)
>  {
> +     KLIST_ASSERT_LOCKED(klist);
> +
>       SLIST_INSERT_HEAD(&klist->kl_list, kn, kn_selnext);
>  }
>  
>  void
>  klist_remove(struct klist *klist, struct knote *kn)
>  {
> +     KLIST_ASSERT_LOCKED(klist);
> +
>       SLIST_REMOVE(&klist->kl_list, kn, knote, kn_selnext);
>  }
>  
> @@ -1530,7 +1567,7 @@ klist_invalidate(struct klist *list)
>  {
>       struct knote *kn;
>       struct proc *p = curproc;
> -     int s;
> +     int ls, s;
>  
>       /*
>        * NET_LOCK() must not be held because it can block another thread
> @@ -1539,9 +1576,14 @@ klist_invalidate(struct klist *list)
>       NET_ASSERT_UNLOCKED();
>  
>       s = splhigh();
> +     ls = klist_lock(list);
>       while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
> -             if (!knote_acquire(kn))
> +             if (!knote_acquire(kn, list, ls)) {
> +                     /* knote_acquire() has unlocked list. */
> +                     ls = klist_lock(list);
>                       continue;
> +             }
> +             klist_unlock(list, ls);
>               splx(s);
>               kn->kn_fop->f_detach(kn);
>               if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
> @@ -1553,6 +1595,111 @@ klist_invalidate(struct klist *list)
>                       knote_drop(kn, p);
>                       s = splhigh();
>               }
> +             ls = klist_lock(list);
>       }
> +     klist_unlock(list, ls);
>       splx(s);
>  }
> +
> +static int
> +klist_lock(struct klist *list)
> +{
> +     int ls = 0;
> +
> +     if (list->kl_ops != NULL) {
> +             ls = list->kl_ops->klo_lock(list->kl_arg);
> +     } else {
> +             ls = splhigh();
> +             KERNEL_LOCK();
> +     }
> +     return ls;
> +}
> +
> +static void
> +klist_unlock(struct klist *list, int ls)
> +{
> +     if (list->kl_ops != NULL) {
> +             list->kl_ops->klo_unlock(list->kl_arg, ls);
> +     } else {
> +             KERNEL_UNLOCK();
> +             splx(ls);
> +     }
> +}
> +
> +static void
> +klist_mutex_assertlk(void *arg)
> +{
> +     struct mutex *mtx = arg;
> +
> +     (void)mtx;
> +
> +     MUTEX_ASSERT_LOCKED(mtx);
> +}
> +
> +static int
> +klist_mutex_lock(void *arg)
> +{
> +     struct mutex *mtx = arg;
> +
> +     mtx_enter(mtx);
> +     return 0;
> +}
> +
> +static void
> +klist_mutex_unlock(void *arg, int s)
> +{
> +     struct mutex *mtx = arg;
> +
> +     mtx_leave(mtx);
> +}
> +
> +static const struct klistops mutex_klistops = {
> +     .klo_assertlk   = klist_mutex_assertlk,
> +     .klo_lock       = klist_mutex_lock,
> +     .klo_unlock     = klist_mutex_unlock,
> +};
> +
> +void
> +klist_init_mutex(struct klist *klist, struct mutex *mtx)
> +{
> +     klist_init(klist, &mutex_klistops, mtx);
> +}
> +
> +static void
> +klist_rwlock_assertlk(void *arg)
> +{
> +     struct rwlock *rwl = arg;
> +
> +     (void)rwl;
> +
> +     rw_assert_wrlock(rwl);
> +}
> +
> +static int
> +klist_rwlock_lock(void *arg)
> +{
> +     struct rwlock *rwl = arg;
> +
> +     rw_enter_write(rwl);
> +     return 0;
> +}
> +
> +static void
> +klist_rwlock_unlock(void *arg, int s)
> +{
> +     struct rwlock *rwl = arg;
> +
> +     rw_exit_write(rwl);
> +}
> +
> +static const struct klistops rwlock_klistops = {
> +     .klo_assertlk   = klist_rwlock_assertlk,
> +     .klo_lock       = klist_rwlock_lock,
> +     .klo_unlock     = klist_rwlock_unlock,
> +};
> +
> +void
> +klist_init_rwlock(struct klist *klist, struct rwlock *rwl)
> +{
> +     klist_init(klist, &rwlock_klistops, rwl);
> +}
> Index: kern/sys_pipe.c
> ===================================================================
> RCS file: src/sys/kern/sys_pipe.c,v
> retrieving revision 1.124
> diff -u -p -r1.124 sys_pipe.c
> --- kern/sys_pipe.c   11 Dec 2020 14:17:35 -0000      1.124
> +++ kern/sys_pipe.c   11 Dec 2020 17:05:09 -0000
> @@ -126,6 +126,7 @@ void      pipe_iounlock(struct pipe *);
>  int  pipe_iosleep(struct pipe *, const char *);
>  
>  struct pipe_pair *pipe_pair_create(void);
> +void pipe_pair_destroy(struct pipe_pair *);
>  
>  /*
>   * The pipe system call for the DTYPE_PIPE type of pipes
> @@ -853,7 +854,7 @@ pipe_destroy(struct pipe *cpipe)
>       rw_exit_write(cpipe->pipe_lock);
>  
>       if (ppipe == NULL)
> -             pool_put(&pipe_pair_pool, cpipe->pipe_pair);
> +             pipe_pair_destroy(cpipe->pipe_pair);
>  }
>  
>  /*
> @@ -997,6 +998,9 @@ pipe_pair_create(void)
>       pp->pp_wpipe.pipe_lock = &pp->pp_lock;
>       pp->pp_rpipe.pipe_lock = &pp->pp_lock;
>  
> +     klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
> +     klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);
> +
>       if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
>               goto err;
>       return (pp);
> @@ -1005,3 +1009,11 @@ err:
>       pipe_destroy(&pp->pp_rpipe);
>       return (NULL);
>  }
> +
> +void
> +pipe_pair_destroy(struct pipe_pair *pp)
> +{
> +     klist_free(&pp->pp_wpipe.pipe_sel.si_note);
> +     klist_free(&pp->pp_rpipe.pipe_sel.si_note);
> +     pool_put(&pipe_pair_pool, pp);
> +}
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: src/sys/kern/uipc_socket.c,v
> retrieving revision 1.250
> diff -u -p -r1.250 uipc_socket.c
> --- kern/uipc_socket.c        17 Nov 2020 14:45:42 -0000      1.250
> +++ kern/uipc_socket.c        11 Dec 2020 17:05:09 -0000
> @@ -164,6 +164,8 @@ socreate(int dom, struct socket **aso, i
>       so->so_proto = prp;
>       so->so_snd.sb_timeo_nsecs = INFSLP;
>       so->so_rcv.sb_timeo_nsecs = INFSLP;
> +     klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
> +     klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
>  
>       s = solock(so);
>       error = (*prp->pr_attach)(so, proto);
> @@ -239,6 +241,8 @@ sofree(struct socket *so, int s)
>               }
>       }
>       sigio_free(&so->so_sigio);
> +     klist_free(&so->so_rcv.sb_sel.si_note);
> +     klist_free(&so->so_snd.sb_sel.si_note);
>  #ifdef SOCKET_SPLICE
>       if (so->so_sp) {
>               if (issplicedback(so)) {
> @@ -2017,6 +2021,7 @@ soo_kqfilter(struct file *fp, struct kno
>  {
>       struct socket *so = kn->kn_fp->f_data;
>       struct sockbuf *sb;
> +     int s;
>  
>       KERNEL_ASSERT_LOCKED();
>  
> @@ -2040,8 +2045,10 @@ soo_kqfilter(struct file *fp, struct kno
>               return (EINVAL);
>       }
>  
> +     s = solock(so);
>       klist_insert(&sb->sb_sel.si_note, kn);
>       sb->sb_flagsintr |= SB_KNOTE;
> +     sounlock(so, s);
>  
>       return (0);
>  }
> @@ -2050,12 +2057,15 @@ void
>  filt_sordetach(struct knote *kn)
>  {
>       struct socket *so = kn->kn_fp->f_data;
> +     int s;
>  
>       KERNEL_ASSERT_LOCKED();
>  
> +     s = solock(so);
>       klist_remove(&so->so_rcv.sb_sel.si_note, kn);
>       if (klist_empty(&so->so_rcv.sb_sel.si_note))
>               so->so_rcv.sb_flagsintr &= ~SB_KNOTE;
> +     sounlock(so, s);
>  }
>  
>  int
> @@ -2103,12 +2113,15 @@ void
>  filt_sowdetach(struct knote *kn)
>  {
>       struct socket *so = kn->kn_fp->f_data;
> +     int s;
>  
>       KERNEL_ASSERT_LOCKED();
>  
> +     s = solock(so);
>       klist_remove(&so->so_snd.sb_sel.si_note, kn);
>       if (klist_empty(&so->so_snd.sb_sel.si_note))
>               so->so_snd.sb_flagsintr &= ~SB_KNOTE;
> +     sounlock(so, s);
>  }
>  
>  int
> @@ -2159,6 +2172,36 @@ filt_solisten(struct knote *kn, long hin
>       return (kn->kn_data != 0);
>  }
>  
> +static void
> +klist_soassertlk(void *arg)
> +{
> +     struct socket *so = arg;
> +
> +     soassertlocked(so);
> +}
> +
> +static int
> +klist_solock(void *arg)
> +{
> +     struct socket *so = arg;
> +
> +     return (solock(so));
> +}
> +
> +static void
> +klist_sounlock(void *arg, int ls)
> +{
> +     struct socket *so = arg;
> +
> +     sounlock(so, ls);
> +}
> +
> +const struct klistops socket_klistops = {
> +     .klo_assertlk   = klist_soassertlk,
> +     .klo_lock       = klist_solock,
> +     .klo_unlock     = klist_sounlock,
> +};
> +
>  #ifdef DDB
>  void
>  sobuf_print(struct sockbuf *,
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.104
> diff -u -p -r1.104 uipc_socket2.c
> --- kern/uipc_socket2.c       11 Apr 2020 14:07:06 -0000      1.104
> +++ kern/uipc_socket2.c       11 Dec 2020 17:05:09 -0000
> @@ -186,6 +186,8 @@ sonewconn(struct socket *head, int conns
>       so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
>       so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
>  
> +     klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
> +     klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
>       sigio_init(&so->so_sigio);
>       sigio_copy(&so->so_sigio, &head->so_sigio);
>  
> @@ -193,6 +195,8 @@ sonewconn(struct socket *head, int conns
>       if ((*so->so_proto->pr_attach)(so, 0)) {
>               (void) soqremque(so, soqueue);
>               sigio_free(&so->so_sigio);
> +             klist_free(&so->so_rcv.sb_sel.si_note);
> +             klist_free(&so->so_snd.sb_sel.si_note);
>               pool_put(&socket_pool, so);
>               return (NULL);
>       }
> Index: sys/event.h
> ===================================================================
> RCS file: src/sys/sys/event.h,v
> retrieving revision 1.49
> diff -u -p -r1.49 event.h
> --- sys/event.h       9 Dec 2020 18:58:19 -0000       1.49
> +++ sys/event.h       11 Dec 2020 17:05:09 -0000
> @@ -127,11 +127,15 @@ struct kevent {
>   * programs which pull in <sys/proc.h> or <sys/selinfo.h>.
>   */
>  #include <sys/queue.h>
> +
> +struct klistops;
>  struct knote;
>  SLIST_HEAD(knlist, knote);
>  
>  struct klist {
>       struct knlist            kl_list;
> +     const struct klistops   *kl_ops;
> +     void                    *kl_arg;
>  };
>  
>  #ifdef _KERNEL
> @@ -200,6 +204,12 @@ struct knote {
>  #define kn_fp                kn_ptr.p_fp
>  };
>  
> +struct klistops {
> +     void    (*klo_assertlk)(void *);
> +     int     (*klo_lock)(void *);
> +     void    (*klo_unlock)(void *, int);
> +};
> +
>  struct kqueue_scan_state {
>       struct kqueue   *kqs_kq;                /* kqueue of this scan */
>       struct knote     kqs_start;             /* start marker */
> @@ -209,11 +219,14 @@ struct kqueue_scan_state {
>                                                * in queue */
>  };
>  
> +struct mutex;
>  struct proc;
> +struct rwlock;
>  struct timespec;
>  
>  extern const struct filterops sig_filtops;
>  extern const struct filterops dead_filtops;
> +extern const struct klistops socket_klistops;
>  
>  extern void  kqpoll_init(void);
>  extern void  kqpoll_exit(void);
> @@ -231,6 +244,10 @@ extern void      kqueue_scan_finish(struct kq
>  extern void  kqueue_purge(struct proc *, struct kqueue *);
>  extern int   filt_seltrue(struct knote *kn, long hint);
>  extern int   seltrue_kqfilter(dev_t, struct knote *);
> +extern void  klist_init(struct klist *, const struct klistops *, void *);
> +extern void  klist_init_mutex(struct klist *, struct mutex *);
> +extern void  klist_init_rwlock(struct klist *, struct rwlock *);
> +extern void  klist_free(struct klist *);
>  extern void  klist_insert(struct klist *, struct knote *);
>  extern void  klist_remove(struct klist *, struct knote *);
>  extern int   klist_empty(struct klist *);
> 

Reply via email to