On Thu, Sep 28, 2017 at 02:07:39PM +0000, Martin Pieuchot wrote:
> On 12/09/17(Tue) 11:09, Martin Pieuchot wrote:
> > My previous attempt to grab the NET_LOCK(), thus potentially sleeping,
> > inside kqueue_scan() resulting in NULL dereferences:
> >     https://marc.info/?l=openbsd-bugs&m=149935139022501&w=2
> > 
> > The problem is that the loop isn't ready to be consulted by multiple
> > threads at the same time.  By "at the same time", I mean that when a
> > thread sleeps it can be consulted by another one.
> > 
> > The diff below addresses that by correcting kqueue's refcount and by
> > using a per-threada marker.  I took this idea from Dragonfly because
> > I believe that we can extend it to make kevent(2) MPSAFE later.
> > 
> > Diff below also includes socket filter modifications grabbing the
> > NET_LOCK() as well.  juanfra@ told me he couldn't reproduce the
> > previous crash with this diff, however I'm looking for more testers.
> 
> Here's an updated version that fixes a hang triggered by lldpd(8)
> reported by sthen@.
> 
> Juan-Fra, Aaron does this version still work for you?

No issues so far!

> 
> Comments?
> 
> Index: kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.79
> diff -u -p -r1.79 kern_event.c
> --- kern/kern_event.c 31 May 2017 14:52:05 -0000      1.79
> +++ kern/kern_event.c 28 Sep 2017 14:01:46 -0000
> @@ -476,6 +476,7 @@ sys_kevent(struct proc *p, void *v, regi
>       struct file *fp;
>       struct timespec ts;
>       int i, n, nerrors, error;
> +     struct kevent kev[KQ_NEVENTS];
>  
>       if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL ||
>           (fp->f_type != DTYPE_KQUEUE))
> @@ -500,16 +501,16 @@ sys_kevent(struct proc *p, void *v, regi
>       while (SCARG(uap, nchanges) > 0) {
>               n = SCARG(uap, nchanges) > KQ_NEVENTS ?
>                   KQ_NEVENTS : SCARG(uap, nchanges);
> -             error = copyin(SCARG(uap, changelist), kq->kq_kev,
> +             error = copyin(SCARG(uap, changelist), kev,
>                   n * sizeof(struct kevent));
>               if (error)
>                       goto done;
>  #ifdef KTRACE
>               if (KTRPOINT(p, KTR_STRUCT))
> -                     ktrevent(p, kq->kq_kev, n);
> +                     ktrevent(p, kev, n);
>  #endif
>               for (i = 0; i < n; i++) {
> -                     kevp = &kq->kq_kev[i];
> +                     kevp = &kev[i];
>                       kevp->flags &= ~EV_SYSFLAGS;
>                       error = kqueue_register(kq, kevp, p);
>                       if (error || (kevp->flags & EV_RECEIPT)) {
> @@ -691,6 +692,7 @@ kqueue_scan(struct kqueue *kq, int maxev
>       struct timeval atv, rtv, ttv;
>       struct knote *kn, marker;
>       int s, count, timeout, nkev = 0, error = 0;
> +     struct kevent kev[KQ_NEVENTS];
>  
>       count = maxevents;
>       if (count == 0)
> @@ -737,7 +739,7 @@ start:
>               goto done;
>       }
>  
> -     kevp = kq->kq_kev;
> +     kevp = &kev[0];
>       s = splhigh();
>       if (kq->kq_count == 0) {
>               if (timeout < 0) {
> @@ -757,25 +759,35 @@ start:
>               goto done;
>       }
>  
> +     marker.kn_filter = EVFILT_MARKER;
>       TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
>       while (count) {
>               kn = TAILQ_FIRST(&kq->kq_head);
> -             TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
>               if (kn == &marker) {
> +                     TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
>                       splx(s);
>                       if (count == maxevents)
>                               goto retry;
>                       goto done;
>               }
> +             if (kn->kn_filter == EVFILT_MARKER) {
> +                     struct knote *other_marker = kn;
> +
> +                     /* Move some other threads marker past this kn */
> +                     kn = TAILQ_NEXT(other_marker, kn_tqe);
> +                     TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
> +                     TAILQ_INSERT_BEFORE(other_marker, kn, kn_tqe);
> +                     continue;
> +             }
> +             TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
> +             kq->kq_count--;
>               if (kn->kn_status & KN_DISABLED) {
>                       kn->kn_status &= ~KN_QUEUED;
> -                     kq->kq_count--;
>                       continue;
>               }
>               if ((kn->kn_flags & EV_ONESHOT) == 0 &&
>                   kn->kn_fop->f_event(kn, 0) == 0) {
>                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
> -                     kq->kq_count--;
>                       continue;
>               }
>               *kevp = kn->kn_kevent;
> @@ -783,7 +795,6 @@ start:
>               nkev++;
>               if (kn->kn_flags & EV_ONESHOT) {
>                       kn->kn_status &= ~KN_QUEUED;
> -                     kq->kq_count--;
>                       splx(s);
>                       kn->kn_fop->f_detach(kn);
>                       knote_drop(kn, p, p->p_fd);
> @@ -796,8 +807,8 @@ start:
>                       if (kn->kn_flags & EV_DISPATCH)
>                               kn->kn_status |= KN_DISABLED;
>                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
> -                     kq->kq_count--;
>               } else {
> +                     kq->kq_count++;
>                       TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
>               }
>               count--;
> @@ -805,13 +816,13 @@ start:
>                       splx(s);
>  #ifdef KTRACE
>                       if (KTRPOINT(p, KTR_STRUCT))
> -                             ktrevent(p, kq->kq_kev, nkev);
> +                             ktrevent(p, kev, nkev);
>  #endif
> -                     error = copyout(kq->kq_kev, ulistp,
> +                     error = copyout(kev, ulistp,
>                           sizeof(struct kevent) * nkev);
>                       ulistp += nkev;
>                       nkev = 0;
> -                     kevp = kq->kq_kev;
> +                     kevp = &kev[0];
>                       s = splhigh();
>                       if (error)
>                               break;
> @@ -823,9 +834,9 @@ done:
>       if (nkev != 0) {
>  #ifdef KTRACE
>               if (KTRPOINT(p, KTR_STRUCT))
> -                     ktrevent(p, kq->kq_kev, nkev);
> +                     ktrevent(p, kev, nkev);
>  #endif
> -             error = copyout(kq->kq_kev, ulistp,
> +             error = copyout(kev, ulistp,
>                   sizeof(struct kevent) * nkev);
>       }
>       *retval = maxevents - count;
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.205
> diff -u -p -r1.205 uipc_socket.c
> --- kern/uipc_socket.c        15 Sep 2017 19:29:28 -0000      1.205
> +++ kern/uipc_socket.c        28 Sep 2017 14:01:46 -0000
> @@ -1960,8 +1960,10 @@ int
>  filt_sowrite(struct knote *kn, long hint)
>  {
>       struct socket *so = kn->kn_fp->f_data;
> -     int rv;
> +     int s, rv;
>  
> +     if (!(hint & NOTE_SUBMIT))
> +             s = solock(so);
>       kn->kn_data = sbspace(so, &so->so_snd);
>       if (so->so_state & SS_CANTSENDMORE) {
>               kn->kn_flags |= EV_EOF;
> @@ -1977,6 +1979,8 @@ filt_sowrite(struct knote *kn, long hint
>       } else {
>               rv = (kn->kn_data >= so->so_snd.sb_lowat);
>       }
> +     if (!(hint & NOTE_SUBMIT))
> +             sounlock(s);
>  
>       return (rv);
>  }
> @@ -1985,8 +1989,13 @@ int
>  filt_solisten(struct knote *kn, long hint)
>  {
>       struct socket *so = kn->kn_fp->f_data;
> +     int s;
>  
> +     if (!(hint & NOTE_SUBMIT))
> +             s = solock(so);
>       kn->kn_data = so->so_qlen;
> +     if (!(hint & NOTE_SUBMIT))
> +             sounlock(s);
>  
>       return (kn->kn_data != 0);
>  }
> Index: miscfs/fifofs/fifo_vnops.c
> ===================================================================
> RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
> retrieving revision 1.58
> diff -u -p -r1.58 fifo_vnops.c
> --- miscfs/fifofs/fifo_vnops.c        24 Jul 2017 15:07:39 -0000      1.58
> +++ miscfs/fifofs/fifo_vnops.c        28 Sep 2017 14:01:46 -0000
> @@ -545,8 +545,10 @@ int
>  filt_fiforead(struct knote *kn, long hint)
>  {
>       struct socket *so = (struct socket *)kn->kn_hook;
> -     int rv;
> +     int s, rv;
>  
> +     if (!(hint & NOTE_SUBMIT))
> +             s = solock(so);
>       kn->kn_data = so->so_rcv.sb_cc;
>       if (so->so_state & SS_CANTRCVMORE) {
>               kn->kn_flags |= EV_EOF;
> @@ -555,6 +557,8 @@ filt_fiforead(struct knote *kn, long hin
>               kn->kn_flags &= ~EV_EOF;
>               rv = (kn->kn_data > 0);
>       }
> +     if (!(hint & NOTE_SUBMIT))
> +             sounlock(s);
>  
>       return (rv);
>  }
> @@ -573,8 +577,10 @@ int
>  filt_fifowrite(struct knote *kn, long hint)
>  {
>       struct socket *so = (struct socket *)kn->kn_hook;
> -     int rv;
> +     int s, rv;
>  
> +     if (!(hint & NOTE_SUBMIT))
> +             s = solock(so);
>       kn->kn_data = sbspace(so, &so->so_snd);
>       if (so->so_state & SS_CANTSENDMORE) {
>               kn->kn_flags |= EV_EOF;
> @@ -583,6 +589,8 @@ filt_fifowrite(struct knote *kn, long hi
>               kn->kn_flags &= ~EV_EOF;
>               rv = (kn->kn_data >= so->so_snd.sb_lowat);
>       }
> +     if (!(hint & NOTE_SUBMIT))
> +             sounlock(s);
>  
>       return (rv);
>  }
> Index: sys/event.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/event.h,v
> retrieving revision 1.26
> diff -u -p -r1.26 event.h
> --- sys/event.h       26 Jun 2017 09:32:32 -0000      1.26
> +++ sys/event.h       28 Sep 2017 14:01:46 -0000
> @@ -80,13 +80,6 @@ struct kevent {
>  #define EV_ERROR     0x4000          /* error, data contains errno */
>  
>  /*
> - * hint flag for in-kernel use - must not equal any existing note
> - */
> -#ifdef _KERNEL
> -#define NOTE_SUBMIT  0x01000000              /* initial knote submission */
> -#endif
> -
> -/*
>   * data/hint flags for EVFILT_{READ|WRITE}, shared with userspace
>   */
>  #define NOTE_LOWAT   0x0001                  /* low water mark */
> @@ -127,6 +120,13 @@ struct knote;
>  SLIST_HEAD(klist, knote);
>  
>  #ifdef _KERNEL
> +
> +#define EVFILT_MARKER        0xF                     /* placemarker for 
> tailq */
> +
> +/*
> + * hint flag for in-kernel use - must not equal any existing note
> + */
> +#define NOTE_SUBMIT  0x01000000              /* initial knote submission */
>  
>  #define KNOTE(list_, hint)   do { \
>                                       struct klist *list = (list_); \
> Index: sys/eventvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/eventvar.h,v
> retrieving revision 1.3
> diff -u -p -r1.3 eventvar.h
> --- sys/eventvar.h    25 Mar 2012 20:33:52 -0000      1.3
> +++ sys/eventvar.h    28 Sep 2017 14:01:46 -0000
> @@ -44,7 +44,6 @@ struct kqueue {
>  #define KQ_SEL               0x01
>  #define KQ_SLEEP     0x02
>  #define KQ_DYING     0x04
> -     struct          kevent kq_kev[KQ_NEVENTS];
>  };
>  
>  #endif /* !_SYS_EVENTVAR_H_ */
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.76
> diff -u -p -r1.76 socketvar.h
> --- sys/socketvar.h   1 Sep 2017 15:05:31 -0000       1.76
> +++ sys/socketvar.h   28 Sep 2017 14:01:46 -0000
> @@ -186,10 +186,7 @@ static inline long
>  sbspace(struct socket *so, struct sockbuf *sb)
>  {
>       KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
> -#if 0
> -     /* XXXSMP kqueue_scan() calling filt_sowrite() cannot sleep. */
>       soassertlocked(so);
> -#endif
>       return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
>  }
>  

-- 
PGP: 0x1F81112D62A9ADCE / 3586 3350 BFEA C101 DB1A  4AF0 1F81 112D 62A9 ADCE

Reply via email to