I originally came at this from the other side: I wanted to run kqueue_enqueue and kqueue_dequeue without the KERNEL_LOCK, but that implied making kqueue_scan use the mutex too, which in turn allowed the kevent syscall to become less locked.
it assumes that the existing locking in kqueue_scan is in the right place, it just turns it into a mutex instead of KERNEL_LOCK with splhigh. it leaves the kqueue_register code under KERNEL_LOCK, but if you're not making changes with kevent then this should be a win. there's an extra rwlock around the kqueue_scan call. this protects the kq_head list from having multiple marker structs attached to it. that is an extremely rare situation, ie, you'd have to have two threads execute kevent on the same kq fd concurrently, but that never happens. right? it seems to work ok, but i havent tested it extensively. thoughts? Index: sys/eventvar.h =================================================================== RCS file: /cvs/src/sys/sys/eventvar.h,v retrieving revision 1.5 diff -u -p -r1.5 eventvar.h --- sys/eventvar.h 17 Jun 2018 08:22:02 -0000 1.5 +++ sys/eventvar.h 1 May 2019 06:29:43 -0000 @@ -35,6 +35,8 @@ #define KQEXTENT 256 /* linear growth by this amount */ struct kqueue { + struct rwlock kq_kevent; /* serialise kevent syscall */ + struct mutex kq_mtx; TAILQ_HEAD(kqlist, knote) kq_head; /* list of pending event */ int kq_count; /* number of pending events */ int kq_refs; /* number of references */ Index: kern/kern_event.c =================================================================== RCS file: /cvs/src/sys/kern/kern_event.c,v retrieving revision 1.102 diff -u -p -r1.102 kern_event.c --- kern/kern_event.c 1 May 2019 06:22:39 -0000 1.102 +++ kern/kern_event.c 1 May 2019 06:29:43 -0000 @@ -455,6 +455,8 @@ sys_kqueue(struct proc *p, void *v, regi fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; kq = pool_get(&kqueue_pool, PR_WAITOK|PR_ZERO); + rw_init(&kq->kq_kevent, "kevent"); + mtx_init(&kq->kq_mtx, IPL_HIGH); TAILQ_INIT(&kq->kq_head); fp->f_data = kq; KQREF(kq); @@ -509,37 +511,42 @@ sys_kevent(struct proc *p, void *v, regi kq = fp->f_data; nerrors = 0; - while (SCARG(uap, nchanges) > 0) { - n = SCARG(uap, nchanges) > KQ_NEVENTS ? 
- KQ_NEVENTS : SCARG(uap, nchanges); - error = copyin(SCARG(uap, changelist), kev, - n * sizeof(struct kevent)); - if (error) - goto done; + if (SCARG(uap, nchanges) > 0) { + KERNEL_LOCK(); + do { + n = SCARG(uap, nchanges) > KQ_NEVENTS ? + KQ_NEVENTS : SCARG(uap, nchanges); + error = copyin(SCARG(uap, changelist), kev, + n * sizeof(struct kevent)); + if (error) + goto done; #ifdef KTRACE - if (KTRPOINT(p, KTR_STRUCT)) - ktrevent(p, kev, n); + if (KTRPOINT(p, KTR_STRUCT)) + ktrevent(p, kev, n); #endif - for (i = 0; i < n; i++) { - kevp = &kev[i]; - kevp->flags &= ~EV_SYSFLAGS; - error = kqueue_register(kq, kevp, p); - if (error || (kevp->flags & EV_RECEIPT)) { - if (SCARG(uap, nevents) != 0) { - kevp->flags = EV_ERROR; - kevp->data = error; - copyout(kevp, SCARG(uap, eventlist), - sizeof(*kevp)); - SCARG(uap, eventlist)++; - SCARG(uap, nevents)--; - nerrors++; - } else { - goto done; + for (i = 0; i < n; i++) { + kevp = &kev[i]; + kevp->flags &= ~EV_SYSFLAGS; + error = kqueue_register(kq, kevp, p); + if (error || (kevp->flags & EV_RECEIPT)) { + if (SCARG(uap, nevents) != 0) { + kevp->flags = EV_ERROR; + kevp->data = error; + copyout(kevp, + SCARG(uap, eventlist), + sizeof(*kevp)); + SCARG(uap, eventlist)++; + SCARG(uap, nevents)--; + nerrors++; + } else { + goto done; + } } } - } - SCARG(uap, nchanges) -= n; - SCARG(uap, changelist) += n; + SCARG(uap, nchanges) -= n; + SCARG(uap, changelist) += n; + } while (SCARG(uap, nchanges) > 0); + KERNEL_UNLOCK(); } if (nerrors) { *retval = nerrors; @@ -547,12 +554,18 @@ sys_kevent(struct proc *p, void *v, regi goto done; } + error = rw_enter(&kq->kq_kevent, RW_WRITE|RW_NOSLEEP); + if (error != 0) + goto done; + KQREF(kq); FRELE(fp, p); error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist), SCARG(uap, timeout), p, &n); KQRELE(kq); + rw_exit(&kq->kq_kevent); *retval = n; + return (error); done: @@ -701,7 +714,7 @@ kqueue_scan(struct kqueue *kq, int maxev struct kevent *kevp; struct timespec ats, rts, tts; struct 
knote *kn, marker; - int s, count, timeout, nkev = 0, error = 0; + int count, timeout, nkev = 0, error = 0; struct kevent kev[KQ_NEVENTS]; count = maxevents; @@ -749,15 +762,16 @@ start: } kevp = &kev[0]; - s = splhigh(); + mtx_enter(&kq->kq_mtx); if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; - error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); + error = msleep(kq, &kq->kq_mtx, PSOCK | PCATCH, + "kqread", timeout); } - splx(s); + mtx_leave(&kq->kq_mtx); if (error == 0) goto retry; /* don't restart after signals... */ @@ -773,7 +787,7 @@ start: kn = TAILQ_FIRST(&kq->kq_head); if (kn == &marker) { TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); - splx(s); + mtx_leave(&kq->kq_mtx); if (count == maxevents) goto retry; goto done; @@ -796,10 +810,12 @@ start: nkev++; if (kn->kn_flags & EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; - splx(s); + mtx_leave(&kq->kq_mtx); + KERNEL_LOCK(); kn->kn_fop->f_detach(kn); + KERNEL_UNLOCK(); knote_drop(kn, p); - s = splhigh(); + mtx_enter(&kq->kq_mtx); } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; @@ -814,7 +830,7 @@ start: } count--; if (nkev == KQ_NEVENTS) { - splx(s); + mtx_leave(&kq->kq_mtx); #ifdef KTRACE if (KTRPOINT(p, KTR_STRUCT)) ktrevent(p, kev, nkev); @@ -824,13 +840,13 @@ start: ulistp += nkev; nkev = 0; kevp = &kev[0]; - s = splhigh(); + mtx_enter(&kq->kq_mtx); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); - splx(s); + mtx_leave(&kq->kq_mtx); done: if (nkev != 0) { #ifdef KTRACE @@ -1070,15 +1086,14 @@ void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; - int s = splhigh(); + mtx_enter(&kq->kq_mtx); KASSERT((kn->kn_status & KN_QUEUED) == 0); - KERNEL_ASSERT_LOCKED(); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; - splx(s); + mtx_leave(&kq->kq_mtx); kqueue_wakeup(kq); } @@ -1086,15 +1101,14 @@ void knote_dequeue(struct knote *kn) { struct 
kqueue *kq = kn->kn_kq; - int s = splhigh(); + mtx_enter(&kq->kq_mtx); KASSERT(kn->kn_status & KN_QUEUED); - KERNEL_ASSERT_LOCKED(); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; - splx(s); + mtx_leave(&kq->kq_mtx); } void Index: kern/syscalls.master =================================================================== RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.189 diff -u -p -r1.189 syscalls.master --- kern/syscalls.master 11 Jan 2019 18:46:30 -0000 1.189 +++ kern/syscalls.master 1 May 2019 06:29:44 -0000 @@ -166,7 +166,7 @@ struct itimerval *itv); } 71 STD { int sys_select(int nd, fd_set *in, fd_set *ou, \ fd_set *ex, struct timeval *tv); } -72 STD { int sys_kevent(int fd, \ +72 STD NOLOCK { int sys_kevent(int fd, \ const struct kevent *changelist, int nchanges, \ struct kevent *eventlist, int nevents, \ const struct timespec *timeout); }