i originally came at this from the other side: i wanted to run
knote_enqueue and knote_dequeue without the KERNEL_LOCK, but that
implied making kqueue_scan take the same mutex, which in turn let the
kevent syscall run with a lot less locking.
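
for context, the core of the change is swapping the splhigh() protection
of the kq_head list for a per-kqueue mutex, eg in knote_enqueue
(simplified from the diff below):

        /* before: block interrupts while touching kq_head */
        s = splhigh();
        TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        kq->kq_count++;
        splx(s);

        /* after: take the per-kqueue mutex, initialised at IPL_HIGH */
        mtx_enter(&kq->kq_mtx);
        TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        kq->kq_count++;
        mtx_leave(&kq->kq_mtx);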

it assumes the existing locking in kqueue_scan is in the right place
and just turns it into a per-kqueue mutex instead of KERNEL_LOCK plus
splhigh. the kqueue_register path stays under KERNEL_LOCK, but if you
call kevent without submitting changes this should still be a win.
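
heavily condensed, sys_kevent ends up shaped like this (see the full
diff below): the register loop keeps the kernel lock, the scan doesn't.

        if (SCARG(uap, nchanges) > 0) {
                KERNEL_LOCK();
                do {
                        /* copyin() the changelist and run
                           kqueue_register() on each kevent, as before */
                } while (SCARG(uap, nchanges) > 0);
                KERNEL_UNLOCK();
        }

        /* the scan runs without the kernel lock; kq_mtx protects kq_head */
        error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
            SCARG(uap, timeout), p, &n);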

there's an extra rwlock around the kqueue_scan call. it protects the
kq_head list from ending up with more than one marker knote on it at a
time. that's an extremely rare situation, ie, two threads would have to
run kevent against the same kq fd concurrently, but that never
happens. right?
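
concretely the guard looks like this (from the diff below). because of
RW_NOSLEEP, a second thread calling kevent on the same kq while a scan
is in progress gets EBUSY back instead of sleeping on the rwlock.

        /* only one thread may have its marker knote on kq_head at a time */
        error = rw_enter(&kq->kq_kevent, RW_WRITE|RW_NOSLEEP);
        if (error != 0)
                goto done;

        error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
            SCARG(uap, timeout), p, &n);

        rw_exit(&kq->kq_kevent);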

it seems to work ok, but i haven't tested it extensively.

thoughts?

Index: sys/eventvar.h
===================================================================
RCS file: /cvs/src/sys/sys/eventvar.h,v
retrieving revision 1.5
diff -u -p -r1.5 eventvar.h
--- sys/eventvar.h      17 Jun 2018 08:22:02 -0000      1.5
+++ sys/eventvar.h      1 May 2019 06:29:43 -0000
@@ -35,6 +35,8 @@
 #define KQEXTENT       256             /* linear growth by this amount */
 
 struct kqueue {
+       struct rwlock   kq_kevent;              /* serialise kevent syscall */
+       struct mutex    kq_mtx;
        TAILQ_HEAD(kqlist, knote) kq_head;      /* list of pending event */
        int             kq_count;               /* number of pending events */
        int             kq_refs;                /* number of references */
Index: kern/kern_event.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.102
diff -u -p -r1.102 kern_event.c
--- kern/kern_event.c   1 May 2019 06:22:39 -0000       1.102
+++ kern/kern_event.c   1 May 2019 06:29:43 -0000
@@ -455,6 +455,8 @@ sys_kqueue(struct proc *p, void *v, regi
        fp->f_type = DTYPE_KQUEUE;
        fp->f_ops = &kqueueops;
        kq = pool_get(&kqueue_pool, PR_WAITOK|PR_ZERO);
+       rw_init(&kq->kq_kevent, "kevent");
+       mtx_init(&kq->kq_mtx, IPL_HIGH);
        TAILQ_INIT(&kq->kq_head);
        fp->f_data = kq;
        KQREF(kq);
@@ -509,37 +511,42 @@ sys_kevent(struct proc *p, void *v, regi
        kq = fp->f_data;
        nerrors = 0;
 
-       while (SCARG(uap, nchanges) > 0) {
-               n = SCARG(uap, nchanges) > KQ_NEVENTS ?
-                   KQ_NEVENTS : SCARG(uap, nchanges);
-               error = copyin(SCARG(uap, changelist), kev,
-                   n * sizeof(struct kevent));
-               if (error)
-                       goto done;
+       if (SCARG(uap, nchanges) > 0) {
+               KERNEL_LOCK();
+               do {
+                       n = SCARG(uap, nchanges) > KQ_NEVENTS ?
+                           KQ_NEVENTS : SCARG(uap, nchanges);
+                       error = copyin(SCARG(uap, changelist), kev,
+                           n * sizeof(struct kevent));
+                       if (error)
+                               goto done;
 #ifdef KTRACE
-               if (KTRPOINT(p, KTR_STRUCT))
-                       ktrevent(p, kev, n);
+                       if (KTRPOINT(p, KTR_STRUCT))
+                               ktrevent(p, kev, n);
 #endif
-               for (i = 0; i < n; i++) {
-                       kevp = &kev[i];
-                       kevp->flags &= ~EV_SYSFLAGS;
-                       error = kqueue_register(kq, kevp, p);
-                       if (error || (kevp->flags & EV_RECEIPT)) {
-                               if (SCARG(uap, nevents) != 0) {
-                                       kevp->flags = EV_ERROR;
-                                       kevp->data = error;
-                                       copyout(kevp, SCARG(uap, eventlist),
-                                           sizeof(*kevp));
-                                       SCARG(uap, eventlist)++;
-                                       SCARG(uap, nevents)--;
-                                       nerrors++;
-                               } else {
-                                       goto done;
+                       for (i = 0; i < n; i++) {
+                               kevp = &kev[i];
+                               kevp->flags &= ~EV_SYSFLAGS;
+                               error = kqueue_register(kq, kevp, p);
+                               if (error || (kevp->flags & EV_RECEIPT)) {
+                                       if (SCARG(uap, nevents) != 0) {
+                                               kevp->flags = EV_ERROR;
+                                               kevp->data = error;
+                                               copyout(kevp,
+                                                   SCARG(uap, eventlist),
+                                                   sizeof(*kevp));
+                                               SCARG(uap, eventlist)++;
+                                               SCARG(uap, nevents)--;
+                                               nerrors++;
+                                       } else {
+                                               goto done;
+                                       }
                                }
                        }
-               }
-               SCARG(uap, nchanges) -= n;
-               SCARG(uap, changelist) += n;
+                       SCARG(uap, nchanges) -= n;
+                       SCARG(uap, changelist) += n;
+               } while (SCARG(uap, nchanges) > 0);
+               KERNEL_UNLOCK();
        }
        if (nerrors) {
                *retval = nerrors;
@@ -547,12 +554,18 @@ sys_kevent(struct proc *p, void *v, regi
                goto done;
        }
 
+       error = rw_enter(&kq->kq_kevent, RW_WRITE|RW_NOSLEEP);
+       if (error != 0)
+               goto done;
+
        KQREF(kq);
        FRELE(fp, p);
        error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
            SCARG(uap, timeout), p, &n);
        KQRELE(kq);
+       rw_exit(&kq->kq_kevent);
        *retval = n;
+
        return (error);
 
  done:
@@ -701,7 +714,7 @@ kqueue_scan(struct kqueue *kq, int maxev
        struct kevent *kevp;
        struct timespec ats, rts, tts;
        struct knote *kn, marker;
-       int s, count, timeout, nkev = 0, error = 0;
+       int count, timeout, nkev = 0, error = 0;
        struct kevent kev[KQ_NEVENTS];
 
        count = maxevents;
@@ -749,15 +762,16 @@ start:
        }
 
        kevp = &kev[0];
-       s = splhigh();
+       mtx_enter(&kq->kq_mtx);
        if (kq->kq_count == 0) {
                if (timeout < 0) {
                        error = EWOULDBLOCK;
                } else {
                        kq->kq_state |= KQ_SLEEP;
-                       error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
+                       error = msleep(kq, &kq->kq_mtx, PSOCK | PCATCH,
+                           "kqread", timeout);
                }
-               splx(s);
+               mtx_leave(&kq->kq_mtx);
                if (error == 0)
                        goto retry;
                /* don't restart after signals... */
@@ -773,7 +787,7 @@ start:
                kn = TAILQ_FIRST(&kq->kq_head);
                if (kn == &marker) {
                        TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
-                       splx(s);
+                       mtx_leave(&kq->kq_mtx);
                        if (count == maxevents)
                                goto retry;
                        goto done;
@@ -796,10 +810,12 @@ start:
                nkev++;
                if (kn->kn_flags & EV_ONESHOT) {
                        kn->kn_status &= ~KN_QUEUED;
-                       splx(s);
+                       mtx_leave(&kq->kq_mtx);
+                       KERNEL_LOCK();
                        kn->kn_fop->f_detach(kn);
+                       KERNEL_UNLOCK();
                        knote_drop(kn, p);
-                       s = splhigh();
+                       mtx_enter(&kq->kq_mtx);
                } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
                        if (kn->kn_flags & EV_CLEAR) {
                                kn->kn_data = 0;
@@ -814,7 +830,7 @@ start:
                }
                count--;
                if (nkev == KQ_NEVENTS) {
-                       splx(s);
+                       mtx_leave(&kq->kq_mtx);
 #ifdef KTRACE
                        if (KTRPOINT(p, KTR_STRUCT))
                                ktrevent(p, kev, nkev);
@@ -824,13 +840,13 @@ start:
                        ulistp += nkev;
                        nkev = 0;
                        kevp = &kev[0];
-                       s = splhigh();
+                       mtx_enter(&kq->kq_mtx);
                        if (error)
                                break;
                }
        }
        TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
-       splx(s);
+       mtx_leave(&kq->kq_mtx);
 done:
        if (nkev != 0) {
 #ifdef KTRACE
@@ -1070,15 +1086,14 @@ void
 knote_enqueue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
-       int s = splhigh();
 
+       mtx_enter(&kq->kq_mtx);
        KASSERT((kn->kn_status & KN_QUEUED) == 0);
-       KERNEL_ASSERT_LOCKED();
 
        TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
        kn->kn_status |= KN_QUEUED;
        kq->kq_count++;
-       splx(s);
+       mtx_leave(&kq->kq_mtx);
        kqueue_wakeup(kq);
 }
 
@@ -1086,15 +1101,14 @@ void
 knote_dequeue(struct knote *kn)
 {
        struct kqueue *kq = kn->kn_kq;
-       int s = splhigh();
 
+       mtx_enter(&kq->kq_mtx);
        KASSERT(kn->kn_status & KN_QUEUED);
-       KERNEL_ASSERT_LOCKED();
 
        TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
        kn->kn_status &= ~KN_QUEUED;
        kq->kq_count--;
-       splx(s);
+       mtx_leave(&kq->kq_mtx);
 }
 
 void
Index: kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.189
diff -u -p -r1.189 syscalls.master
--- kern/syscalls.master        11 Jan 2019 18:46:30 -0000      1.189
+++ kern/syscalls.master        1 May 2019 06:29:44 -0000
@@ -166,7 +166,7 @@
                            struct itimerval *itv); }
 71     STD             { int sys_select(int nd, fd_set *in, fd_set *ou, \
                            fd_set *ex, struct timeval *tv); }
-72     STD             { int sys_kevent(int fd, \
+72     STD NOLOCK      { int sys_kevent(int fd, \
                            const struct kevent *changelist, int nchanges, \
                            struct kevent *eventlist, int nevents, \
                            const struct timespec *timeout); }
