On 12/09/17(Tue) 11:09, Martin Pieuchot wrote:
> My previous attempt to grab the NET_LOCK(), thus potentially sleeping,
> inside kqueue_scan() resulting in NULL dereferences:
>       https://marc.info/?l=openbsd-bugs&m=149935139022501&w=2
> 
> The problem is that the loop isn't ready to be consulted by multiple
> threads at the same time.  By "at the same time", I mean that when a
> thread sleeps it can be consulted by another one.
> 
> The diff below addresses that by correcting kqueue's refcount and by
> using a per-threada marker.  I took this idea from Dragonfly because
> I believe that we can extend it to make kevent(2) MPSAFE later.
> 
> Diff below also includes socket filter modifications grabbing the
> NET_LOCK() as well.  juanfra@ told me he couldn't reproduce the
> previous crash with this diff, however I'm looking for more testers.
> 
> Test reports?  Comments?

Here's an updated diff with moar KASSERT()s.  Testers welcome!

Index: kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.79
diff -u -p -r1.79 kern_event.c
--- kern/kern_event.c   31 May 2017 14:52:05 -0000      1.79
+++ kern/kern_event.c   2 Oct 2017 10:06:04 -0000
@@ -476,6 +476,7 @@ sys_kevent(struct proc *p, void *v, regi
        struct file *fp;
        struct timespec ts;
        int i, n, nerrors, error;
+       struct kevent kev[KQ_NEVENTS];
 
        if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL ||
            (fp->f_type != DTYPE_KQUEUE))
@@ -500,16 +501,16 @@ sys_kevent(struct proc *p, void *v, regi
        while (SCARG(uap, nchanges) > 0) {
                n = SCARG(uap, nchanges) > KQ_NEVENTS ?
                    KQ_NEVENTS : SCARG(uap, nchanges);
-               error = copyin(SCARG(uap, changelist), kq->kq_kev,
+               error = copyin(SCARG(uap, changelist), kev,
                    n * sizeof(struct kevent));
                if (error)
                        goto done;
 #ifdef KTRACE
                if (KTRPOINT(p, KTR_STRUCT))
-                       ktrevent(p, kq->kq_kev, n);
+                       ktrevent(p, kev, n);
 #endif
                for (i = 0; i < n; i++) {
-                       kevp = &kq->kq_kev[i];
+                       kevp = &kev[i];
                        kevp->flags &= ~EV_SYSFLAGS;
                        error = kqueue_register(kq, kevp, p);
                        if (error || (kevp->flags & EV_RECEIPT)) {
@@ -691,6 +692,7 @@ kqueue_scan(struct kqueue *kq, int maxev
        struct timeval atv, rtv, ttv;
        struct knote *kn, marker;
        int s, count, timeout, nkev = 0, error = 0;
+       struct kevent kev[KQ_NEVENTS];
 
        count = maxevents;
        if (count == 0)
@@ -737,7 +739,7 @@ start:
                goto done;
        }
 
-       kevp = kq->kq_kev;
+       kevp = &kev[0];
        s = splhigh();
        if (kq->kq_count == 0) {
                if (timeout < 0) {
@@ -757,33 +759,46 @@ start:
                goto done;
        }
 
+       marker.kn_filter = EVFILT_MARKER;
+       marker.kn_status = KN_PROCESSING;
        TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
        while (count) {
                kn = TAILQ_FIRST(&kq->kq_head);
-               TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
                if (kn == &marker) {
+                       TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
                        splx(s);
                        if (count == maxevents)
                                goto retry;
                        goto done;
                }
+               if (kn->kn_filter == EVFILT_MARKER) {
+                       struct knote *other_marker = kn;
+
+                       /* Move some other threads marker past this kn */
+                       kn = TAILQ_NEXT(other_marker, kn_tqe);
+                       TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+                       TAILQ_INSERT_BEFORE(other_marker, kn, kn_tqe);
+                       continue;
+               }
+
+               TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+               kq->kq_count--;
+               kn->kn_status |= KN_PROCESSING;
+
                if (kn->kn_status & KN_DISABLED) {
-                       kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
+                       kn->kn_status &= ~(KN_QUEUED|KN_PROCESSING);
                        continue;
                }
                if ((kn->kn_flags & EV_ONESHOT) == 0 &&
                    kn->kn_fop->f_event(kn, 0) == 0) {
-                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
+                       kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_PROCESSING);
                        continue;
                }
                *kevp = kn->kn_kevent;
                kevp++;
                nkev++;
                if (kn->kn_flags & EV_ONESHOT) {
-                       kn->kn_status &= ~KN_QUEUED;
-                       kq->kq_count--;
+                       kn->kn_status &= ~(KN_QUEUED|KN_PROCESSING);
                        splx(s);
                        kn->kn_fop->f_detach(kn);
                        knote_drop(kn, p, p->p_fd);
@@ -795,9 +810,10 @@ start:
                        }
                        if (kn->kn_flags & EV_DISPATCH)
                                kn->kn_status |= KN_DISABLED;
-                       kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-                       kq->kq_count--;
+                       kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_PROCESSING);
                } else {
+                       kn->kn_status &= ~KN_PROCESSING;
+                       kq->kq_count++;
                        TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
                }
                count--;
@@ -805,13 +821,13 @@ start:
                        splx(s);
 #ifdef KTRACE
                        if (KTRPOINT(p, KTR_STRUCT))
-                               ktrevent(p, kq->kq_kev, nkev);
+                               ktrevent(p, kev, nkev);
 #endif
-                       error = copyout(kq->kq_kev, ulistp,
+                       error = copyout(kev, ulistp,
                            sizeof(struct kevent) * nkev);
                        ulistp += nkev;
                        nkev = 0;
-                       kevp = kq->kq_kev;
+                       kevp = &kev[0];
                        s = splhigh();
                        if (error)
                                break;
@@ -823,9 +839,9 @@ done:
        if (nkev != 0) {
 #ifdef KTRACE
                if (KTRPOINT(p, KTR_STRUCT))
-                       ktrevent(p, kq->kq_kev, nkev);
+                       ktrevent(p, kev, nkev);
 #endif
-               error = copyout(kq->kq_kev, ulistp,
+               error = copyout(kev, ulistp,
                    sizeof(struct kevent) * nkev);
        }
        *retval = maxevents - count;
@@ -983,6 +999,7 @@ knote_remove(struct proc *p, struct klis
        struct knote *kn;
 
        while ((kn = SLIST_FIRST(list)) != NULL) {
+               KASSERT((kn->kn_status & KN_PROCESSING) == 0);
                kn->kn_fop->f_detach(kn);
                knote_drop(kn, p, p->p_fd);
        }
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.205
diff -u -p -r1.205 uipc_socket.c
--- kern/uipc_socket.c  15 Sep 2017 19:29:28 -0000      1.205
+++ kern/uipc_socket.c  2 Oct 2017 09:58:44 -0000
@@ -1921,8 +1921,10 @@ int
 filt_soread(struct knote *kn, long hint)
 {
        struct socket *so = kn->kn_fp->f_data;
-       int rv;
+       int s, rv;
 
+       if (!(hint & NOTE_SUBMIT))
+               s = solock(so);
        kn->kn_data = so->so_rcv.sb_cc;
 #ifdef SOCKET_SPLICE
        if (isspliced(so)) {
@@ -1940,6 +1942,8 @@ filt_soread(struct knote *kn, long hint)
        } else {
                rv = (kn->kn_data >= so->so_rcv.sb_lowat);
        }
+       if (!(hint & NOTE_SUBMIT))
+               sounlock(s);
 
        return rv;
 }
@@ -1960,8 +1964,10 @@ int
 filt_sowrite(struct knote *kn, long hint)
 {
        struct socket *so = kn->kn_fp->f_data;
-       int rv;
+       int s, rv;
 
+       if (!(hint & NOTE_SUBMIT))
+               s = solock(so);
        kn->kn_data = sbspace(so, &so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
@@ -1977,6 +1983,8 @@ filt_sowrite(struct knote *kn, long hint
        } else {
                rv = (kn->kn_data >= so->so_snd.sb_lowat);
        }
+       if (!(hint & NOTE_SUBMIT))
+               sounlock(s);
 
        return (rv);
 }
@@ -1985,8 +1993,13 @@ int
 filt_solisten(struct knote *kn, long hint)
 {
        struct socket *so = kn->kn_fp->f_data;
+       int s;
 
+       if (!(hint & NOTE_SUBMIT))
+               s = solock(so);
        kn->kn_data = so->so_qlen;
+       if (!(hint & NOTE_SUBMIT))
+               sounlock(s);
 
        return (kn->kn_data != 0);
 }
Index: miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.58
diff -u -p -r1.58 fifo_vnops.c
--- miscfs/fifofs/fifo_vnops.c  24 Jul 2017 15:07:39 -0000      1.58
+++ miscfs/fifofs/fifo_vnops.c  2 Oct 2017 08:46:42 -0000
@@ -545,8 +545,10 @@ int
 filt_fiforead(struct knote *kn, long hint)
 {
        struct socket *so = (struct socket *)kn->kn_hook;
-       int rv;
+       int s, rv;
 
+       if (!(hint & NOTE_SUBMIT))
+               s = solock(so);
        kn->kn_data = so->so_rcv.sb_cc;
        if (so->so_state & SS_CANTRCVMORE) {
                kn->kn_flags |= EV_EOF;
@@ -555,6 +557,8 @@ filt_fiforead(struct knote *kn, long hin
                kn->kn_flags &= ~EV_EOF;
                rv = (kn->kn_data > 0);
        }
+       if (!(hint & NOTE_SUBMIT))
+               sounlock(s);
 
        return (rv);
 }
@@ -573,8 +577,10 @@ int
 filt_fifowrite(struct knote *kn, long hint)
 {
        struct socket *so = (struct socket *)kn->kn_hook;
-       int rv;
+       int s, rv;
 
+       if (!(hint & NOTE_SUBMIT))
+               s = solock(so);
        kn->kn_data = sbspace(so, &so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
@@ -583,6 +589,8 @@ filt_fifowrite(struct knote *kn, long hi
                kn->kn_flags &= ~EV_EOF;
                rv = (kn->kn_data >= so->so_snd.sb_lowat);
        }
+       if (!(hint & NOTE_SUBMIT))
+               sounlock(s);
 
        return (rv);
 }
Index: sys/event.h
===================================================================
RCS file: /cvs/src/sys/sys/event.h,v
retrieving revision 1.26
diff -u -p -r1.26 event.h
--- sys/event.h 26 Jun 2017 09:32:32 -0000      1.26
+++ sys/event.h 2 Oct 2017 09:55:20 -0000
@@ -80,13 +80,6 @@ struct kevent {
 #define EV_ERROR       0x4000          /* error, data contains errno */
 
 /*
- * hint flag for in-kernel use - must not equal any existing note
- */
-#ifdef _KERNEL
-#define NOTE_SUBMIT    0x01000000              /* initial knote submission */
-#endif
-
-/*
  * data/hint flags for EVFILT_{READ|WRITE}, shared with userspace
  */
 #define NOTE_LOWAT     0x0001                  /* low water mark */
@@ -128,6 +121,13 @@ SLIST_HEAD(klist, knote);
 
 #ifdef _KERNEL
 
+#define EVFILT_MARKER  0xF                     /* placemarker for tailq */
+
+/*
+ * hint flag for in-kernel use - must not equal any existing note
+ */
+#define NOTE_SUBMIT    0x01000000              /* initial knote submission */
+
 #define KNOTE(list_, hint)     do { \
                                        struct klist *list = (list_); \
                                        if ((list) != NULL) \
@@ -164,10 +164,11 @@ struct knote {
        } kn_ptr;
        const struct            filterops *kn_fop;
        void                    *kn_hook;
-#define KN_ACTIVE      0x01                    /* event has been triggered */
-#define KN_QUEUED      0x02                    /* event is on queue */
-#define KN_DISABLED    0x04                    /* event is disabled */
-#define KN_DETACHED    0x08                    /* knote is detached */
+#define KN_ACTIVE      0x0001                  /* event has been triggered */
+#define KN_QUEUED      0x0002                  /* event is on queue */
+#define KN_DISABLED    0x0004                  /* event is disabled */
+#define KN_DETACHED    0x0008                  /* knote is detached */
+#define KN_PROCESSING  0x0010                  /* event processing in prog */
 
 #define kn_id          kn_kevent.ident
 #define kn_filter      kn_kevent.filter
Index: sys/eventvar.h
===================================================================
RCS file: /cvs/src/sys/sys/eventvar.h,v
retrieving revision 1.3
diff -u -p -r1.3 eventvar.h
--- sys/eventvar.h      25 Mar 2012 20:33:52 -0000      1.3
+++ sys/eventvar.h      2 Oct 2017 08:46:42 -0000
@@ -44,7 +44,6 @@ struct kqueue {
 #define KQ_SEL         0x01
 #define KQ_SLEEP       0x02
 #define KQ_DYING       0x04
-       struct          kevent kq_kev[KQ_NEVENTS];
 };
 
 #endif /* !_SYS_EVENTVAR_H_ */
Index: sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.76
diff -u -p -r1.76 socketvar.h
--- sys/socketvar.h     1 Sep 2017 15:05:31 -0000       1.76
+++ sys/socketvar.h     2 Oct 2017 08:46:42 -0000
@@ -186,10 +186,7 @@ static inline long
 sbspace(struct socket *so, struct sockbuf *sb)
 {
        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
-#if 0
-       /* XXXSMP kqueue_scan() calling filt_sowrite() cannot sleep. */
        soassertlocked(so);
-#endif
        return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
 }
 

Reply via email to