Diff below changes the internal of *poll(2) and *select(2) to use the
*_kqfilter() handlers instead of *_poll() ones.  Events are stored on a
private per-thread kqueue then converted to the corresponding "fd_set"
or "pollfd".  The design is similar to DragonFly's solution.

The main argument for this proposal is to reduce the amount of code
executed to notify userland when an event occurs.  The outcome of this
diff is that a single notification subsystem needs to be taken out of
the KERNEL_LOCK().  This simplifies a lot of existing locking tentacles.

Using kqueue internally means collision is avoided and there's no need
to query handlers for fds that aren't ready.  This comes at the cost of
allocating descriptors.  A space vs time trade-off.  Note that this cost
can be diminished by doing lazy removal of event descriptors to be able
to re-use them. 

A lot of kqueue(2) related cleanups and fixes are required and included
below.  To keep it small, the diff doesn't contain the removal of the
*_poll() handlers and abstraction layers.

Although I've been running with this diff for over a month now I'm not
asking for oks yet.  I'm sharing it to get feedback, input and for
interested people to try it out.

I'm thinking of continuing to push the kqueue(2) fixes in the tree
because those are worth having even without this.  If we agree this is
a beneficial change, I'm suggesting a 3-step plan to integrate it:

- Discuss & integrate the kqfilter handlers changes required

- Convert *select(2) first because the interface is simpler and there
  are fewer possibilities of regression compared to *poll(2)

- Convert *poll(2)

Depending on tests and findings regarding performance, I can work on
lazy removal of event descriptors prior or after integrating *poll(2).
I'd like the help of somebody with a heavy poll(2) use case, like bgpd(8).

Comments?

diff --git sys/kern/kern_event.c sys/kern/kern_event.c
index 0bed8e5f671..68c35f0d6a4 100644
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -57,14 +57,13 @@
 #include <sys/timeout.h>
 #include <sys/wait.h>
 
+void   kqueue_terminate(struct proc *p, struct kqueue *);
+void   kqueue_free(struct kqueue *);
 void   kqueue_init(void);
 void   KQREF(struct kqueue *);
 void   KQRELE(struct kqueue *);
 
 int    kqueue_sleep(struct kqueue *, struct timespec *);
-int    kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
-                   struct kevent *ulistp, struct timespec *timeout,
-                   struct proc *p, int *retval);
 
 int    kqueue_read(struct file *, struct uio *, int);
 int    kqueue_write(struct file *, struct uio *, int);
@@ -156,6 +155,7 @@ const struct filterops *const sysfilt_ops[] = {
        &sig_filtops,                   /* EVFILT_SIGNAL */
        &timer_filtops,                 /* EVFILT_TIMER */
        &file_filtops,                  /* EVFILT_DEVICE */
+       &file_filtops,                  /* EVFILT_EXCEPT */
 };
 
 void
@@ -181,8 +181,13 @@ KQRELE(struct kqueue *kq)
                fdpunlock(fdp);
        }
 
-       free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
-           sizeof(struct knlist));
+       kqueue_free(kq);
+}
+
+void
+kqueue_free(struct kqueue *kq)
+{
+       free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * sizeof(struct klist));
        hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
        pool_put(&kqueue_pool, kq);
 }
@@ -492,13 +497,10 @@ static const struct filterops dead_filtops = {
        .f_event        = filt_dead,
 };
 
-int
-sys_kqueue(struct proc *p, void *v, register_t *retval)
+struct kqueue *
+kqueue_alloc(struct filedesc *fdp)
 {
-       struct filedesc *fdp = p->p_fd;
        struct kqueue *kq;
-       struct file *fp;
-       int fd, error;
 
        kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
        kq->kq_refs = 1;
@@ -506,6 +508,27 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
        TAILQ_INIT(&kq->kq_head);
        task_set(&kq->kq_task, kqueue_task, kq);
 
+       return (kq);
+}
+
+void
+kqueue_exit(struct proc *p)
+{
+       kqueue_terminate(p, p->p_kq);
+       kqueue_free(p->p_kq);
+       p->p_kq = NULL;
+}
+
+int
+sys_kqueue(struct proc *p, void *v, register_t *retval)
+{
+       struct filedesc *fdp = p->p_fd;
+       struct kqueue *kq;
+       struct file *fp;
+       int fd, error;
+
+       kq = kqueue_alloc(fdp);
+
        fdplock(fdp);
        error = falloc(p, &fp, &fd);
        if (error)
@@ -545,6 +568,7 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
        struct timespec ts;
        struct timespec *tsp = NULL;
        int i, n, nerrors, error;
+       int ready, total;
        struct kevent kev[KQ_NEVENTS];
 
        if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
@@ -573,9 +597,9 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
        kq = fp->f_data;
        nerrors = 0;
 
-       while (SCARG(uap, nchanges) > 0) {
-               n = SCARG(uap, nchanges) > KQ_NEVENTS ?
-                   KQ_NEVENTS : SCARG(uap, nchanges);
+       while ((n = SCARG(uap, nchanges)) > 0) {
+               if (n > nitems(kev))
+                       n = nitems(kev);
                error = copyin(SCARG(uap, changelist), kev,
                    n * sizeof(struct kevent));
                if (error)
@@ -611,14 +635,39 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
                goto done;
        }
 
+
        KQREF(kq);
        FRELE(fp, p);
+       /*
+        * Collect as many events as we can.  The timeout on successive
+        * loops is disabled (kqueue_scan() becomes non-blocking).
+        */
+       total = 0;
+       error = 0;
        kqueue_scan_setup(&scan, kq);
-       error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
-           tsp, p, &n);
+       while ((n = SCARG(uap, nevents) - total) > 0) {
+               if (n > nitems(kev))
+                       n = nitems(kev);
+               ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
+               if (ready == 0)
+                       break;
+               error = copyout(kev, SCARG(uap, eventlist) + total,
+                   sizeof(struct kevent) * ready);
+#ifdef KTRACE
+               if (KTRPOINT(p, KTR_STRUCT))
+                       ktrevent(p, kev, ready);
+#endif
+               total += ready;
+               if (error || ready < n)
+                       break;
+               tsp = &ts;              /* successive loops non-blocking */
+               timespecclear(tsp);
+       }
        kqueue_scan_finish(&scan);
        KQRELE(kq);
-       *retval = n;
+       if (error == EWOULDBLOCK)
+               error = 0;
+       *retval = total;
        return (error);
 
  done:
@@ -872,27 +921,28 @@ kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
        return (error);
 }
 
+/*
+ * Scan the kqueue, blocking if necessary until the target time is reached.
+ * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
+ * 0 we do not block at all.
+ */
 int
 kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
-    struct kevent *ulistp, struct timespec *tsp, struct proc *p, int *retval)
+    struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
 {
-       struct kevent *kevp;
        struct knote *kn;
        struct kqueue *kq = scan->kqs_kq;
        int s, count, nkev = 0, error = 0;
-       struct kevent kev[KQ_NEVENTS];
 
        count = maxevents;
        if (count == 0)
                goto done;
-
 retry:
        if (kq->kq_state & KQ_DYING) {
                error = EBADF;
                goto done;
        }
 
-       kevp = &kev[0];
        s = splhigh();
        if (kq->kq_count == 0) {
                if ((tsp != NULL && !timespecisset(tsp)) ||
@@ -904,7 +954,7 @@ retry:
                kq->kq_state |= KQ_SLEEP;
                error = kqueue_sleep(kq, tsp);
                splx(s);
-               if (error == 0 || error == EWOULDBLOCK)
+               if (error == 0)
                        goto retry;
                /* don't restart after signals... */
                if (error == ERESTART)
@@ -976,6 +1026,9 @@ retry:
                count--;
                scan->kqs_nevent++;
 
+               /*
+                * Post-event action on the note
+                */
                if (kn->kn_flags & EV_ONESHOT) {
                        splx(s);
                        kn->kn_fop->f_detach(kn);
@@ -1001,35 +1054,14 @@ retry:
                        knote_release(kn);
                }
                kqueue_check(kq);
-               if (nkev == KQ_NEVENTS) {
-                       splx(s);
-#ifdef KTRACE
-                       if (KTRPOINT(p, KTR_STRUCT))
-                               ktrevent(p, kev, nkev);
-#endif
-                       error = copyout(kev, ulistp,
-                           sizeof(struct kevent) * nkev);
-                       ulistp += nkev;
-                       nkev = 0;
-                       kevp = &kev[0];
-                       s = splhigh();
-                       if (error)
-                               break;
-               }
        }
        TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
        splx(s);
+       if (scan->kqs_nevent == 0)
+               goto retry;
 done:
-       if (nkev != 0) {
-#ifdef KTRACE
-               if (KTRPOINT(p, KTR_STRUCT))
-                       ktrevent(p, kev, nkev);
-#endif
-               error = copyout(kev, ulistp,
-                   sizeof(struct kevent) * nkev);
-       }
-       *retval = maxevents - count;
-       return (error);
+       *errorp = error;
+       return (nkev);
 }
 
 void
@@ -1115,13 +1147,12 @@ kqueue_stat(struct file *fp, struct stat *st, struct 
proc *p)
        return (0);
 }
 
-int
-kqueue_close(struct file *fp, struct proc *p)
+void
+kqueue_purge(struct proc *p, struct kqueue *kq)
 {
-       struct kqueue *kq = fp->f_data;
        int i;
 
-       KERNEL_LOCK();
+       KERNEL_ASSERT_LOCKED();
 
        for (i = 0; i < kq->kq_knlistsize; i++)
                knote_remove(p, &kq->kq_knlist[i]);
@@ -1129,14 +1160,29 @@ kqueue_close(struct file *fp, struct proc *p)
                for (i = 0; i < kq->kq_knhashmask + 1; i++)
                        knote_remove(p, &kq->kq_knhash[i]);
        }
-       fp->f_data = NULL;
+}
 
+void
+kqueue_terminate(struct proc *p, struct kqueue *kq)
+{
+       kqueue_purge(p, kq);
        kq->kq_state |= KQ_DYING;
        kqueue_wakeup(kq);
 
        KASSERT(klist_empty(&kq->kq_sel.si_note));
        task_del(systq, &kq->kq_task);
 
+}
+
+int
+kqueue_close(struct file *fp, struct proc *p)
+{
+       struct kqueue *kq = fp->f_data;
+
+       KERNEL_LOCK();
+       kqueue_terminate(p, kq);
+       fp->f_data = NULL;
+
        KQRELE(kq);
 
        KERNEL_UNLOCK();
diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
index 66ffad0de02..313890df808 100644
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -184,6 +184,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
        if ((p->p_flag & P_THREAD) == 0)
                pr->ps_siglist = 0;
 
+       kqueue_exit(p);
+
 #if NKCOV > 0
        kcov_exit(p);
 #endif
diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
index 6cfb39b6252..3a278321df4 100644
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -422,6 +422,8 @@ fork1(struct proc *curp, int flags, void (*func)(void *), 
void *arg,
                newptstat = malloc(sizeof(*newptstat), M_SUBPROC, M_WAITOK);
 
        p->p_tid = alloctid();
+       p->p_kq = kqueue_alloc(p->p_fd);
+       p->p_kq_serial = arc4random();
 
        LIST_INSERT_HEAD(&allproc, p, p_list);
        LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
@@ -553,6 +555,8 @@ thread_fork(struct proc *curp, void *stack, void *tcb, 
pid_t *tidptr,
        cpu_fork(curp, p, stack, tcb, child_return, p);
 
        p->p_tid = alloctid();
+       p->p_kq = kqueue_alloc(p->p_fd);
+       p->p_kq_serial = arc4random();
 
        LIST_INSERT_HEAD(&allproc, p, p_list);
        LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
diff --git sys/kern/spec_vnops.c sys/kern/spec_vnops.c
index 887a7acb641..ac2ed144c20 100644
--- sys/kern/spec_vnops.c
+++ sys/kern/spec_vnops.c
@@ -386,11 +386,9 @@ spec_poll(void *v)
        dev_t dev;
 
        switch (ap->a_vp->v_type) {
-
        default:
                return (ap->a_events &
                    (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
-
        case VCHR:
                dev = ap->a_vp->v_rdev;
                return (*cdevsw[major(dev)].d_poll)(dev, ap->a_events, ap->a_p);
@@ -400,12 +398,17 @@ int
 spec_kqfilter(void *v)
 {
        struct vop_kqfilter_args *ap = v;
-
        dev_t dev;
 
        dev = ap->a_vp->v_rdev;
-       if (cdevsw[major(dev)].d_kqfilter)
-               return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
+
+       switch (ap->a_vp->v_type) {
+       default:
+               return seltrue_kqfilter(dev, ap->a_kn);
+       case VCHR:
+               if (cdevsw[major(dev)].d_kqfilter)
+                       return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
+       }
        return (EOPNOTSUPP);
 }
 
diff --git sys/kern/sys_generic.c sys/kern/sys_generic.c
index 477d8a433d4..427dc7e2001 100644
--- sys/kern/sys_generic.c
+++ sys/kern/sys_generic.c
@@ -55,6 +55,7 @@
 #include <sys/time.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
+#include <sys/eventvar.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
@@ -66,14 +67,27 @@
 
 #include <uvm/uvm_extern.h>
 
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ *  1 - print implementation errors, things that should not happen.
+ *  2 - print ppoll(2) information, somewhat verbose
+ *  3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) {              \
+    printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
+    printf(x);                                                 \
+}
+int pselregister(struct proc *, fd_set *, int, int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[]);
+int ppollregister(struct proc *, struct pollfd *, int, int *);
+int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
+
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
     struct timespec *, const sigset_t *, register_t *);
 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
     const sigset_t *, register_t *);
-void doselwakeup(struct selinfo *);
 
 int
 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
@@ -584,11 +598,11 @@ int
 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
+       struct kqueue_scan_state scan;
        fd_mask bits[6];
        fd_set *pibits[3], *pobits[3];
-       struct timespec elapsed, start, stop;
-       uint64_t nsecs;
-       int s, ncoll, error = 0;
+       struct timespec ts;
+       int error, nevents = 0;
        u_int ni;
 
        if (nd < 0)
@@ -636,43 +650,61 @@ dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, 
fd_set *ex,
        if (sigmask)
                dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-       ncoll = nselcoll;
-       atomic_setbits_int(&p->p_flag, P_SELECT);
-       error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
-       if (error || *retval)
+       /* Register kqueue events */
+       if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
                goto done;
-       if (timeout == NULL || timespecisset(timeout)) {
-               if (timeout != NULL) {
-                       getnanouptime(&start);
+
+       /*
+        * The poll/select family of syscalls has been designed to
+        * block when file descriptors are not available, even if
+        * there's nothing to wait for.
+        */
+       if (nevents == 0) {
+               uint64_t nsecs = INFSLP;
+
+               if (timeout != NULL)
                        nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-               } else
-                       nsecs = INFSLP;
-               s = splhigh();
-               if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-                       splx(s);
-                       goto retry;
-               }
-               atomic_clearbits_int(&p->p_flag, P_SELECT);
-               error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
-               splx(s);
-               if (timeout != NULL) {
-                       getnanouptime(&stop);
-                       timespecsub(&stop, &start, &elapsed);
-                       timespecsub(timeout, &elapsed, timeout);
-                       if (timeout->tv_sec < 0)
-                               timespecclear(timeout);
-               }
-               if (error == 0 || error == EWOULDBLOCK)
-                       goto retry;
+
+               error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
+               /* select is not restarted after signals... */
+               if (error == ERESTART)
+                       error = EINTR;
        }
-done:
-       atomic_clearbits_int(&p->p_flag, P_SELECT);
-       /* select is not restarted after signals... */
-       if (error == ERESTART)
-               error = EINTR;
+
+       /* Collect at most `nevents' possibly waiting in kqueue_scan() */
+       kqueue_scan_setup(&scan, p->p_kq);
+       while (nevents > 0) {
+               struct kevent kev[KQ_NEVENTS];
+               int i, ready, count;
+
+               /* Maximum number of events per iteration */
+               count = MIN(nitems(kev), nevents);
+               ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+               if (KTRPOINT(p, KTR_STRUCT))
+                       ktrevent(p, kev, ready);
+#endif
+               /* Convert back events that are ready. */
+               for (i = 0; i < ready; i++)
+                       *retval += pselcollect(p, &kev[i], pobits);
+
+               /*
+                * Stop if there was an error or if we had enough
+                * room to collect all events that were ready.
+                */
+               if (error || ready < count)
+                       break;
+
+               timeout = &ts;          /* successive loops non-blocking */
+               timespecclear(timeout);
+
+               nevents -= ready;
+       }
+       kqueue_scan_finish(&scan);
+
        if (error == EWOULDBLOCK)
                error = 0;
+ done:
 #define        putbits(name, x) \
        if (name && (error2 = copyout(pobits[x], name, ni))) \
                error = error2;
@@ -694,41 +726,107 @@ done:
 
        if (pibits[0] != (fd_set *)&bits[0])
                free(pibits[0], M_TEMP, 6 * ni);
+
+       kqueue_purge(p, p->p_kq);
+       p->p_kq_serial += nd;
+
        return (error);
 }
 
+/*
+ * Convert fd_set into kqueue events and register them on the
+ * per-thread queue.
+ */
 int
-selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
-    register_t *retval)
+pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int *nregistered)
 {
-       caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
-       struct filedesc *fdp = p->p_fd;
-       int msk, i, j, fd;
+       static const struct {
+               int filter;
+               int fflags;
+       } evf[] = {
+           { EVFILT_READ,      NOTE_IMM },
+           { EVFILT_WRITE,     NOTE_IMM },
+           { EVFILT_EXCEPT,    0 }
+       };
+       caddr_t cibits = (caddr_t)ibits;
+       int msk, i, j, fd, nevents = 0, error = 0;
+       struct kevent kev;
        fd_mask bits;
-       struct file *fp;
-       int n = 0;
-       static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
 
        for (msk = 0; msk < 3; msk++) {
                fd_set *pibits = (fd_set *)&cibits[msk*ni];
-               fd_set *pobits = (fd_set *)&cobits[msk*ni];
 
                for (i = 0; i < nfd; i += NFDBITS) {
                        bits = pibits->fds_bits[i/NFDBITS];
                        while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
                                bits &= ~(1 << j);
-                               if ((fp = fd_getfile(fdp, fd)) == NULL)
-                                       return (EBADF);
-                               if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
-                                       FD_SET(fd, pobits);
-                                       n++;
+
+                               DPRINTFN(2, "select fd %d mask %d serial %lu\n",
+                                   fd, msk, p->p_kq_serial);
+                               EV_SET(&kev, fd, evf[msk].filter,
+                                   EV_ADD|EV_ENABLE, evf[msk].fflags, 0,
+                                   (void *)(p->p_kq_serial));
+#ifdef KTRACE
+                               if (KTRPOINT(p, KTR_STRUCT))
+                                       ktrevent(p, &kev, 1);
+#endif
+                               error = kqueue_register(p->p_kq, &kev, p);
+                               switch (error) {
+                               case 0:
+                                       nevents++;
+                               case EOPNOTSUPP:/* No underlying kqfilter */
+                               case EINVAL:    /* Unimplemented filter */
+                                       error = 0;
+                                       break;
+                               case ENXIO:     /* Device has been detached */
+                               default:
+                                       goto bad;
                                }
-                               FRELE(fp, p);
                        }
                }
        }
-       *retval = n;
+
+       *nregistered = nevents;
        return (0);
+bad:
+       DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
+           kev.filter, error);
+       return (error);
+}
+
+/*
+ * Convert given kqueue event into corresponding select(2) bit.
+ */
+int
+pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
+{
+#ifdef DIAGNOSTIC
+       /* Filter out and lazily delete spurious events */
+       if ((unsigned long)kevp->udata != p->p_kq_serial) {
+               DPRINTFN(0, "select fd %u mismatched serial %lu\n",
+                   (int)kevp->ident, p->p_kq_serial);
+               kevp->flags = EV_DISABLE|EV_DELETE;
+               kqueue_register(p->p_kq, kevp, p);
+               return (0);
+       }
+#endif
+
+       switch (kevp->filter) {
+       case EVFILT_READ:
+               FD_SET(kevp->ident, pobits[0]);
+               break;
+       case EVFILT_WRITE:
+               FD_SET(kevp->ident, pobits[1]);
+               break;
+       case EVFILT_EXCEPT:
+               FD_SET(kevp->ident, pobits[2]);
+               break;
+       default:
+               KASSERT(0);
+       }
+
+       DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
+       return (1);
 }
 
 int
@@ -774,59 +872,113 @@ selwakeup(struct selinfo *sip)
 {
        KERNEL_LOCK();
        KNOTE(&sip->si_note, NOTE_SUBMIT);
-       doselwakeup(sip);
        KERNEL_UNLOCK();
 }
 
-void
-doselwakeup(struct selinfo *sip)
+int
+ppollregister_evt(struct proc *p, struct kevent *kevp, int nkev,
+    struct pollfd *pl)
 {
-       struct proc *p;
+       int i, error, nevents = 0;
 
-       KERNEL_ASSERT_LOCKED();
+       KASSERT(pl->revents == 0);
 
-       if (sip->si_seltid == 0)
-               return;
-       if (sip->si_flags & SI_COLL) {
-               nselcoll++;
-               sip->si_flags &= ~SI_COLL;
-               wakeup(&selwait);
-       }
-       p = tfind(sip->si_seltid);
-       sip->si_seltid = 0;
-       if (p != NULL) {
-               if (wakeup_proc(p, &selwait)) {
-                       /* nothing else to do */
-               } else if (p->p_flag & P_SELECT)
-                       atomic_clearbits_int(&p->p_flag, P_SELECT);
+#ifdef KTRACE
+       if (KTRPOINT(p, KTR_STRUCT))
+               ktrevent(p, kevp, nkev);
+#endif
+       for (i = 0; i < nkev; i++, kevp++) {
+               error = kqueue_register(p->p_kq, kevp, p);
+               switch (error) {
+               case 0:
+                       nevents++;
+                       break;
+               case EOPNOTSUPP:/* No underlying kqfilter */
+               case EINVAL:    /* Unimplemented filter */
+                       break;
+               case EBADF:     /* Bad file descriptor */
+                       pl->revents |= POLLNVAL;
+                       break;
+               case EPIPE:     /* Specific to pipes */
+                       KASSERT(kevp->filter == EVFILT_WRITE);
+                       pl->revents |= POLLHUP;
+                       break;
+               default:
+#ifdef DIAGNOSTIC
+                       DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
+                           " %lu filt %d ERROR=%d\n",
+                           ((unsigned long)kevp->udata - p->p_kq_serial),
+                           pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
+                           error);
+#endif
+                       /* FALLTHROUGH */
+               case ENXIO:     /* Device has been detached */
+                       pl->revents |= POLLERR;
+                       break;
+               }
        }
+
+       return (nevents);
 }
 
-void
-pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
+/*
+ * Convert pollfd into kqueue events and register them on the
+ * per-thread queue.
+ *
+ * Return the number of pollfd that triggered at least one error and aren't
+ * completely monitored.  These pollfd should have the corresponding error bit
+ * set in `revents'.
+ *
+ * At most 3 events can correspond to a single pollfd.
+ */
+int
+ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered)
 {
-       struct filedesc *fdp = p->p_fd;
-       struct file *fp;
-       u_int i;
-       int n = 0;
+       int i, nkev, nevt, errcount = 0;
+       struct kevent kev[3], *kevp;
 
-       for (i = 0; i < nfd; i++, pl++) {
-               /* Check the file descriptor. */
-               if (pl->fd < 0) {
-                       pl->revents = 0;
+       for (i = 0; i < nfds; i++) {
+               pl[i].events &= ~POLL_NOHUP;
+               pl[i].revents = 0;
+
+               if (pl[i].fd < 0)
                        continue;
+
+               DPRINTFN(1, "poll set %d/%d fd %d  events %02x serial %lu\n",
+                   i, nfds-1, pl[i].fd, pl[i].events, p->p_kq_serial);
+
+               nevt = 0;
+               nkev = 0;
+               kevp = kev;
+               if (pl[i].events & (POLLIN | POLLRDNORM)) {
+                       EV_SET(kevp, pl[i].fd, EVFILT_READ, EV_ADD|EV_ENABLE,
+                           NOTE_IMM, 0, (void *)(p->p_kq_serial + i));
+                       nkev++;
+                       kevp++;
                }
-               if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
-                       pl->revents = POLLNVAL;
-                       n++;
-                       continue;
+               if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+                       EV_SET(kevp, pl[i].fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
+                           NOTE_IMM, 0, (void *)(p->p_kq_serial + i));
+                       nkev++;
+                       kevp++;
                }
-               pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
-               FRELE(fp, p);
-               if (pl->revents != 0)
-                       n++;
+               if (pl[i].events & (POLLPRI | POLLRDBAND)) {
+                       EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
+                           0, 0, (void *)(p->p_kq_serial + i));
+                       nkev++;
+                       kevp++;
+               }
+
+               if (nkev == 0)
+                       continue;
+
+               nevt = ppollregister_evt(p, kev, nkev, &pl[i]);
+               if (nevt == 0)
+                       errcount++;
+               *nregistered += nevt;
        }
-       *retval = n;
+
+       return (errcount);
 }
 
 /*
@@ -916,11 +1068,11 @@ int
 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
-       size_t sz;
+       struct kqueue_scan_state scan;
        struct pollfd pfds[4], *pl = pfds;
-       struct timespec elapsed, start, stop;
-       uint64_t nsecs;
-       int ncoll, i, s, error;
+       struct timespec ts;
+       int error, nevents = 0;
+       size_t sz;
 
        /* Standards say no more than MAX_OPEN; this is possibly better. */
        if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
@@ -939,53 +1091,65 @@ doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
        if ((error = copyin(fds, pl, sz)) != 0)
                goto bad;
 
-       for (i = 0; i < nfds; i++) {
-               pl[i].events &= ~POLL_NOHUP;
-               pl[i].revents = 0;
-       }
-
        if (sigmask)
                dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-       ncoll = nselcoll;
-       atomic_setbits_int(&p->p_flag, P_SELECT);
-       pollscan(p, pl, nfds, retval);
-       if (*retval)
-               goto done;
-       if (timeout == NULL || timespecisset(timeout)) {
-               if (timeout != NULL) {
-                       getnanouptime(&start);
+       /* Register kqueue events */
+       *retval = ppollregister(p, pl, nfds, &nevents);
+
+       /*
+        * The poll/select family of syscalls has been designed to
+        * block when file descriptors are not available, even if
+        * there's nothing to wait for.
+        */
+       if (nevents == 0) {
+               uint64_t nsecs = INFSLP;
+
+               if (timeout != NULL)
                        nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-               } else
-                       nsecs = INFSLP;
-               s = splhigh();
-               if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-                       splx(s);
-                       goto retry;
-               }
-               atomic_clearbits_int(&p->p_flag, P_SELECT);
-               error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
-               splx(s);
-               if (timeout != NULL) {
-                       getnanouptime(&stop);
-                       timespecsub(&stop, &start, &elapsed);
-                       timespecsub(timeout, &elapsed, timeout);
-                       if (timeout->tv_sec < 0)
-                               timespecclear(timeout);
-               }
-               if (error == 0 || error == EWOULDBLOCK)
-                       goto retry;
+
+               error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqpoll", nsecs);
+               if (error == ERESTART)
+                       error = EINTR;
+       }
+
+       /* Collect at most `nevents' possibly waiting in kqueue_scan() */
+       kqueue_scan_setup(&scan, p->p_kq);
+       while (nevents > 0) {
+               struct kevent kev[KQ_NEVENTS];
+               int i, ready, count;
+
+               /* Maximum number of events per iteration */
+               count = MIN(nitems(kev), nevents);
+               ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+               if (KTRPOINT(p, KTR_STRUCT))
+                       ktrevent(p, kev, ready);
+#endif
+               /* Convert back events that are ready. */
+               for (i = 0; i < ready; i++)
+                       *retval += ppollcollect(p, &kev[i], pl, nfds);
+
+               /*
+                * Stop if there was an error or if we had enough
+                * room to collect all events that were ready.
+                */
+               if (error || ready < count)
+                       break;
+
+               timeout = &ts;          /* successive loops non-blocking */
+               timespecclear(timeout);
+
+               nevents -= ready;
        }
+       kqueue_scan_finish(&scan);
 
-done:
-       atomic_clearbits_int(&p->p_flag, P_SELECT);
        /*
         * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
         *       ignored (since the whole point is to see what would block).
         */
        switch (error) {
-       case ERESTART:
+       case EINTR:
                error = pollout(pl, fds, nfds);
                if (error == 0)
                        error = EINTR;
@@ -1002,9 +1166,89 @@ done:
 bad:
        if (pl != pfds)
                free(pl, M_TEMP, sz);
+
+       kqueue_purge(p, p->p_kq);
+       p->p_kq_serial += nfds;
+
        return (error);
 }
 
+/*
+ * Convert given kqueue event into corresponding poll(2) revents bit.
+ */
+int
+ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl,
+    u_int nfds)
+{
+       int already_seen;
+       unsigned long i;
+
+       /*  Extract poll array index */
+       i = (unsigned long)kevp->udata - p->p_kq_serial;
+
+#ifdef DIAGNOSTIC
+       /*
+        * Lazily delete spurious events.
+        *
+        * This should not happen as long as kqueue_purge() is called
+        * at the end of every syscall.  It might be interesting to do
+        * like DragonFlyBSD and not always allocate a new knote in
+        * kqueue_register(); with that, lazy removal makes sense.
+        */
+       if (i >= nfds) {
+               DPRINTFN(0, "poll get out of range udata %lu vs serial %lu\n",
+                   (unsigned long)kevp->udata, p->p_kq_serial);
+               kevp->flags = EV_DISABLE|EV_DELETE;
+               kqueue_register(p->p_kq, kevp, p);
+               return (0);
+       }
+       if ((int)kevp->ident != pl[i].fd) {
+               DPRINTFN(0, "poll get %lu/%d mismatch fd %u!=%d serial %lu\n",
+                   i, nfds-1, (int)kevp->ident, pl[i].fd, p->p_kq_serial);
+               return (0);
+       }
+#endif
+
+       /*
+        * A given descriptor may already have generated an error
+        * against another filter during kqueue_register().
+        *
+        * Make sure to set the appropriate flags but do not
+        * increment `*retval' more than once.
+        */
+       already_seen = (pl[i].revents != 0);
+
+       switch (kevp->filter) {
+       case EVFILT_READ:
+               if (kevp->flags & EV_HUP)
+                       pl[i].revents |= POLLHUP;
+               if (pl[i].events & (POLLIN | POLLRDNORM))
+                       pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
+               break;
+       case EVFILT_WRITE:
+               /* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
+               if (kevp->flags & EV_HUP) {
+                       pl[i].revents |= POLLHUP;
+               } else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+                       pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
+               }
+               break;
+       case EVFILT_EXCEPT:
+               if (pl[i].events & (POLLPRI | POLLRDBAND))
+                       pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
+               break;
+       default:
+               KASSERT(0);
+       }
+
+       DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
+           i, nfds-1, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
+           kevp->filter);
+       if (!already_seen && (pl[i].revents != 0))
+               return (1);
+
+       return (0);
+}
+
 /*
  * utrace system call
  */
diff --git sys/kern/sys_pipe.c sys/kern/sys_pipe.c
index fc221bfd8f1..a1ade981c9f 100644
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -966,7 +966,7 @@ filt_piperead(struct knote *kn, long hint)
        if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
                if ((hint & NOTE_SUBMIT) == 0)
                        rw_exit_read(lock);
-               kn->kn_flags |= EV_EOF; 
+               kn->kn_flags |= (EV_EOF | EV_HUP);
                return (1);
        }
 
@@ -990,7 +990,7 @@ filt_pipewrite(struct knote *kn, long hint)
                if ((hint & NOTE_SUBMIT) == 0)
                        rw_exit_read(lock);
                kn->kn_data = 0;
-               kn->kn_flags |= EV_EOF; 
+               kn->kn_flags |= (EV_EOF | EV_HUP);
                return (1);
        }
        kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
diff --git sys/kern/tty.c sys/kern/tty.c
index e9f98f87e01..caa3113e995 100644
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -1155,7 +1155,7 @@ filt_ttyread(struct knote *kn, long hint)
        kn->kn_data = ttnread(tp);
        splx(s);
        if (!ISSET(tp->t_cflag, CLOCAL) && !ISSET(tp->t_state, TS_CARR_ON)) {
-               kn->kn_flags |= EV_EOF;
+               kn->kn_flags |= (EV_EOF | EV_HUP);
                return (1);
        }
        return (kn->kn_data > 0);
diff --git sys/kern/tty_pty.c sys/kern/tty_pty.c
index 2152bcde86a..89cef33fd4c 100644
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -107,6 +107,7 @@ void        filt_ptcrdetach(struct knote *);
 int    filt_ptcread(struct knote *, long);
 void   filt_ptcwdetach(struct knote *);
 int    filt_ptcwrite(struct knote *, long);
+int    filt_ptcexcept(struct knote *, long);
 
 static struct pt_softc **ptyarralloc(int);
 static int check_pty(int);
@@ -677,7 +678,7 @@ filt_ptcread(struct knote *kn, long hint)
        }
 
        if (!ISSET(tp->t_state, TS_CARR_ON)) {
-               kn->kn_flags |= EV_EOF;
+               kn->kn_flags |= (EV_EOF | EV_HUP);
                return (1);
        }
 
@@ -708,7 +709,8 @@ filt_ptcwrite(struct knote *kn, long hint)
                if (ISSET(pti->pt_flags, PF_REMOTE)) {
                        if (tp->t_canq.c_cc == 0)
                                kn->kn_data = tp->t_canq.c_cn;
-               } else if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG(tp)-2)
+               } else if ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG(tp)-2) ||
+                   (tp->t_canq.c_cc == 0 && ISSET(tp->t_lflag, ICANON)))
                        kn->kn_data = tp->t_canq.c_cn -
                            (tp->t_rawq.c_cc + tp->t_canq.c_cc);
        }
@@ -716,6 +718,23 @@ filt_ptcwrite(struct knote *kn, long hint)
        return (kn->kn_data > 0);
 }
 
+int
+filt_ptcexcept(struct knote *kn, long hint)
+{
+       struct pt_softc *pti = (struct pt_softc *)kn->kn_hook;
+       struct tty *tp;
+
+       tp = pti->pt_tty;
+       kn->kn_data = 0;
+
+       /* If in packet or user control mode, check for data. */
+       if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
+           ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))
+               kn->kn_data = 1;
+
+       return (kn->kn_data > 0);
+}
+
 const struct filterops ptcread_filtops = {
        .f_flags        = FILTEROP_ISFD,
        .f_attach       = NULL,
@@ -730,6 +749,13 @@ const struct filterops ptcwrite_filtops = {
        .f_event        = filt_ptcwrite,
 };
 
+const struct filterops ptcexcept_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_ptcrdetach,
+       .f_event        = filt_ptcexcept,
+};
+
 int
 ptckqfilter(dev_t dev, struct knote *kn)
 {
@@ -746,6 +772,9 @@ ptckqfilter(dev_t dev, struct knote *kn)
                klist = &pti->pt_selw.si_note;
                kn->kn_fop = &ptcwrite_filtops;
                break;
+       case EVFILT_EXCEPT:
+               klist = &pti->pt_selr.si_note;
+               kn->kn_fop = &ptcexcept_filtops;
+               break;
        default:
                return (EINVAL);
        }
diff --git sys/kern/tty_tty.c sys/kern/tty_tty.c
index 2aa99721cfc..3906cdbcb83 100644
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -159,6 +159,6 @@ cttykqfilter(dev_t dev, struct knote *kn)
        struct vnode *ttyvp = cttyvp(curproc);
 
        if (ttyvp == NULL)
-               return (ENXIO);
+               return (seltrue_kqfilter(dev, kn));
        return (VOP_KQFILTER(ttyvp, FREAD|FWRITE, kn));
 }
diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index d6f2eb6ca3c..076549d9cc9 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -71,6 +71,7 @@ int   filt_soread(struct knote *kn, long hint);
 void   filt_sowdetach(struct knote *kn);
 int    filt_sowrite(struct knote *kn, long hint);
 int    filt_solisten(struct knote *kn, long hint);
+int    filt_soexcept(struct knote *kn, long hint);
 
 const struct filterops solisten_filtops = {
        .f_flags        = FILTEROP_ISFD,
@@ -93,6 +94,12 @@ const struct filterops sowrite_filtops = {
        .f_event        = filt_sowrite,
 };
 
+const struct filterops soexcept_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_sordetach,
+       .f_event        = filt_soexcept,
+};
 
 #ifndef SOMINCONN
 #define SOMINCONN 80
@@ -2026,6 +2033,10 @@ soo_kqfilter(struct file *fp, struct knote *kn)
                kn->kn_fop = &sowrite_filtops;
                sb = &so->so_snd;
                break;
+       case EVFILT_EXCEPT:
+               kn->kn_fop = &soexcept_filtops;
+               sb = &so->so_rcv;
+               break;
        default:
                return (EINVAL);
        }
@@ -2056,6 +2067,7 @@ filt_soread(struct knote *kn, long hint)
 
        if ((hint & NOTE_SUBMIT) == 0)
                s = solock(so);
+
        kn->kn_data = so->so_rcv.sb_cc;
 #ifdef SOCKET_SPLICE
        if (isspliced(so)) {
@@ -2064,6 +2076,8 @@ filt_soread(struct knote *kn, long hint)
 #endif /* SOCKET_SPLICE */
        if (so->so_state & SS_CANTRCVMORE) {
                kn->kn_flags |= EV_EOF;
+               if (so->so_state & SS_ISDISCONNECTED)
+                       kn->kn_flags |= EV_HUP;
                kn->kn_fflags = so->so_error;
                rv = 1;
        } else if (so->so_error) {      /* temporary udp error */
@@ -2102,6 +2116,8 @@ filt_sowrite(struct knote *kn, long hint)
        kn->kn_data = sbspace(so, &so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
+               if (so->so_state & SS_ISDISCONNECTED)
+                       kn->kn_flags |= EV_HUP;
                kn->kn_fflags = so->so_error;
                rv = 1;
        } else if (so->so_error) {      /* temporary udp error */
@@ -2135,6 +2151,21 @@ filt_solisten(struct knote *kn, long hint)
        return (kn->kn_data != 0);
 }
 
+int
+filt_soexcept(struct knote *kn, long hint)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int s;
+
+       if ((hint & NOTE_SUBMIT) == 0)
+               s = solock(so);
+       kn->kn_data = (so->so_oobmark || (so->so_state & SS_RCVATMARK));
+       if ((hint & NOTE_SUBMIT) == 0)
+               sounlock(so, s);
+
+       return (kn->kn_data != 0);
+}
+
 #ifdef DDB
 void
 sobuf_print(struct sockbuf *,
diff --git sys/sys/conf.h sys/sys/conf.h
index 6531db84b22..847b1bb8fd8 100644
--- sys/sys/conf.h
+++ sys/sys/conf.h
@@ -312,7 +312,7 @@ extern struct cdevsw cdevsw[];
        dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
        dev_init(c,n,write), dev_init(c,n,ioctl), \
        (dev_type_stop((*))) enodev, 0, dev_init(c,n,poll), \
-       (dev_type_mmap((*))) enodev }
+       (dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) }
 
 /* open, close, read, write, ioctl, poll, kqfilter */
 #define cdev_midi_init(c,n) { \
@@ -390,12 +390,12 @@ extern struct cdevsw cdevsw[];
        (dev_type_stop((*))) enodev, 0, selfalse, \
        (dev_type_mmap((*))) enodev }
 
-/* open, close, ioctl, read, mmap, poll */
+/* open, close, ioctl, read, mmap, poll, kqfilter */
 #define cdev_video_init(c,n) { \
        dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
        (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
        (dev_type_stop((*))) enodev, 0, dev_init(c,n,poll), \
-       dev_init(c,n,mmap) }
+       dev_init(c,n,mmap), 0, 0, dev_init(c,n,kqfilter) }
 
 /* open, close, write, ioctl */
 #define cdev_spkr_init(c,n) { \
@@ -439,7 +439,7 @@ extern struct cdevsw cdevsw[];
        (dev_type_stop((*))) enodev, 0, selfalse, \
        (dev_type_mmap((*))) enodev }
 
-/* open, close, read, ioctl, poll, mmap, nokqfilter */
+/* open, close, read, ioctl, poll, mmap, kqfilter */
 #define      cdev_drm_init(c,n)        { \
        dev_init(c,n,open), dev_init(c,n,close), dev_init(c, n, read), \
        (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
diff --git sys/sys/event.h sys/sys/event.h
index 51487835ced..41c521474e6 100644
--- sys/sys/event.h
+++ sys/sys/event.h
@@ -39,6 +39,7 @@
 #define EVFILT_SIGNAL          (-6)    /* attached to struct process */
 #define EVFILT_TIMER           (-7)    /* timers */
 #define EVFILT_DEVICE          (-8)    /* devices */
+#define EVFILT_EXCEPT          (-9)    /* exceptional conditions */
 
-#define EVFILT_SYSCOUNT                8
+#define EVFILT_SYSCOUNT                9
 
@@ -75,6 +76,7 @@ struct kevent {
 
 #define EV_SYSFLAGS    0xF000          /* reserved by system */
 #define EV_FLAG1       0x2000          /* filter-specific flag */
+#define EV_HUP         EV_FLAG1        /* device or socket disconnected */
 
 /* returned values */
 #define EV_EOF         0x8000          /* EOF detected */
@@ -128,6 +130,10 @@ struct klist {
 };
 
 #ifdef _KERNEL
+/*
+ * data/hint flags for EVFILT_{READ|WRITE}, not shared with userspace
+ */
+#define NOTE_IMM       0x1000                  /* Immediate read event */
 
 #define EVFILT_MARKER  0xf                     /* placemarker for tailq */
 
@@ -199,6 +205,7 @@ struct kqueue_scan_state {
 };
 
 struct proc;
+struct filedesc;
 
 extern const struct filterops sig_filtops;
 
@@ -207,10 +214,17 @@ extern void       knote_activate(struct knote *);
 extern void    knote_remove(struct proc *p, struct knlist *list);
 extern void    knote_fdclose(struct proc *p, int fd);
 extern void    knote_processexit(struct proc *);
+extern struct  kqueue *kqueue_alloc(struct filedesc *);
+extern void    kqueue_exit(struct proc *);
 extern int     kqueue_register(struct kqueue *kq,
                    struct kevent *kev, struct proc *p);
 extern void    kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
 extern void    kqueue_scan_finish(struct kqueue_scan_state *);
+extern int     kqueue_scan(struct kqueue_scan_state *, int, struct kevent *,
+                   struct timespec *, struct proc *, int *);
+extern void    kqueue_purge(struct proc *, struct kqueue *);
 extern int     filt_seltrue(struct knote *kn, long hint);
 extern int     seltrue_kqfilter(dev_t, struct knote *);
 extern void    klist_insert(struct klist *, struct knote *);
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..424f8240b1f 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -315,6 +315,7 @@ struct process {
 
 struct kcov_dev;
 struct lock_list_entry;
+struct kqueue;
 
 struct p_inentry {
        u_long   ie_serial;
@@ -377,6 +378,8 @@ struct proc {
        struct  plimit  *p_limit;       /* [l] read ref. of p_p->ps_limit */
        struct  kcov_dev *p_kd;         /* kcov device handle */
        struct  lock_list_entry *p_sleeplocks;  /* WITNESS lock tracking */ 
+       struct  kqueue *p_kq;           /* for select/poll */
+       unsigned long p_kq_serial;      /* for select/poll */
 
        int      p_siglist;             /* Signals arrived but not delivered. */
 
diff --git sys/arch/sparc64/dev/vldcp.c sys/arch/sparc64/dev/vldcp.c
index 91255700c5f..0dba0267028 100644
--- sys/arch/sparc64/dev/vldcp.c
+++ sys/arch/sparc64/dev/vldcp.c
@@ -70,6 +70,11 @@ struct vldcp_softc {
 
 int    vldcp_match(struct device *, void *, void *);
 void   vldcp_attach(struct device *, struct device *, void *);
+void   filt_vldcprdetach(struct knote *);
+void   filt_vldcpwdetach(struct knote *);
+int    filt_vldcpread(struct knote *, long);
+int    filt_vldcpwrite(struct knote *, long);
+int    vldcpkqfilter(dev_t, struct knote *);
 
 struct cfattach vldcp_ca = {
        sizeof(struct vldcp_softc), vldcp_match, vldcp_attach
@@ -615,3 +620,121 @@ vldcppoll(dev_t dev, int events, struct proc *p)
        splx(s);
        return revents;
 }
+
+void
+filt_vldcprdetach(struct knote *kn)
+{
+       struct vldcp_softc *sc = (void *)kn->kn_hook;
+       int s;
+
+       s = spltty();
+       klist_remove(&sc->sc_rsel.si_note, kn);
+       splx(s);
+}
+
+void
+filt_vldcpwdetach(struct knote *kn)
+{
+       struct vldcp_softc *sc = (void *)kn->kn_hook;
+       int s;
+
+       s = spltty();
+       klist_remove(&sc->sc_wsel.si_note, kn);
+       splx(s);
+}
+
+int
+filt_vldcpread(struct knote *kn, long hint)
+{
+       struct vldcp_softc *sc = (void *)kn->kn_hook;
+       struct ldc_conn *lc = &sc->sc_lc;
+       uint64_t head, tail, avail, state;
+       int s, err;
+
+       s = spltty();
+       err = hv_ldc_rx_get_state(lc->lc_id, &head, &tail, &state);
+       if (err == 0 && state == LDC_CHANNEL_UP && head != tail) {
+               avail = (head - tail) / sizeof(struct ldc_pkt);
+               avail = (avail + lc->lc_rxq->lq_nentries - 1) %
+                   lc->lc_rxq->lq_nentries;
+               kn->kn_data = avail;
+       } else {
+               cbus_intr_setenabled(sc->sc_bustag, sc->sc_rx_ino,
+                   INTR_ENABLED);
+       }
+       splx(s);
+
+       return (kn->kn_data > 0);
+}
+
+int
+filt_vldcpwrite(struct knote *kn, long hint)
+{
+       struct vldcp_softc *sc = (void *)kn->kn_hook;
+       struct ldc_conn *lc = &sc->sc_lc;
+       uint64_t head, tail, avail, state;
+       int s, err;
+
+       s = spltty();
+       err = hv_ldc_tx_get_state(lc->lc_id, &head, &tail, &state);
+       if (err == 0 && state == LDC_CHANNEL_UP && head != tail) {
+               avail = (head - tail) / sizeof(struct ldc_pkt);
+               avail = (avail + lc->lc_txq->lq_nentries - 1) %
+                   lc->lc_txq->lq_nentries;
+               kn->kn_data = avail;
+       } else {
+               cbus_intr_setenabled(sc->sc_bustag, sc->sc_tx_ino,
+                   INTR_ENABLED);
+       }
+       splx(s);
+
+       return (kn->kn_data > 0);
+}
+
+const struct filterops vldcpread_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_vldcprdetach,
+       .f_event        = filt_vldcpread,
+};
+
+const struct filterops vldcpwrite_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_vldcpwdetach,
+       .f_event        = filt_vldcpwrite,
+};
+
+int
+vldcpkqfilter(dev_t dev, struct knote *kn)
+{
+       struct vldcp_softc *sc;
+       struct klist *klist;
+       int s;
+
+       sc = vldcp_lookup(dev);
+       if (sc == NULL)
+               return (ENXIO);
+
+       switch (kn->kn_filter) {
+       case EVFILT_READ:
+               klist = &sc->sc_rsel.si_note;
+               kn->kn_fop = &vldcpread_filtops;
+               break;
+       case EVFILT_WRITE:
+               klist = &sc->sc_wsel.si_note;
+               kn->kn_fop = &vldcpwrite_filtops;
+               break;
+
+       default:
+               return (EINVAL);
+       }
+
+       kn->kn_hook = sc;
+
+       s = spltty();
+       klist_insert(klist, kn);
+       splx(s);
+
+       return (0);
+}
diff --git sys/arch/sparc64/include/conf.h sys/arch/sparc64/include/conf.h
index 558c0c22ef7..57fd1699cb6 100644
--- sys/arch/sparc64/include/conf.h
+++ sys/arch/sparc64/include/conf.h
@@ -64,7 +64,8 @@ cdev_decl(vdsp);
 #define        cdev_gen_init(c,n) { \
        dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
        dev_init(c,n,write), dev_init(c,n,ioctl), (dev_type_stop((*))) nullop, \
-       0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev }
+       0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev, \
+       0, 0, dev_init(c,n,kqfilter) }
 
 cdev_decl(cn);
 
diff --git sys/dev/audio.c sys/dev/audio.c
index 7e061b7eb95..93fc848ab82 100644
--- sys/dev/audio.c
+++ sys/dev/audio.c
@@ -165,6 +165,36 @@ struct cfdriver audio_cd = {
        NULL, "audio", DV_DULL
 };
 
+void filt_audioctlrdetach(struct knote *);
+int filt_audioctlread(struct knote *, long);
+
+const struct filterops audioctlread_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_audioctlrdetach,
+       .f_event        = filt_audioctlread,
+};
+
+void filt_audiowdetach(struct knote *);
+int filt_audiowrite(struct knote *, long);
+
+const struct filterops audiowrite_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_audiowdetach,
+       .f_event        = filt_audiowrite,
+};
+
+void filt_audiordetach(struct knote *);
+int filt_audioread(struct knote *, long);
+
+const struct filterops audioread_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_audiordetach,
+       .f_event        = filt_audioread,
+};
+
 /*
  * This mutex protects data structures (including registers on the
  * sound-card) that are manipulated by both the interrupt handler and
@@ -243,7 +273,14 @@ audio_mixer_wakeup(void *addr)
                wakeup(&sc->mix_blocking);
                sc->mix_blocking = 0;
        }
+       /*
+        * As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
+        * already held here to avoid lock ordering problems with `audio_lock'
+        */
+       KERNEL_ASSERT_LOCKED();
+       mtx_enter(&audio_lock);
        selwakeup(&sc->mix_sel);
+       mtx_leave(&audio_lock);
 }
 
 void
@@ -255,7 +292,14 @@ audio_buf_wakeup(void *addr)
                wakeup(&buf->blocking);
                buf->blocking = 0;
        }
+       /*
+        * As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
+        * already held here to avoid lock ordering problems with `audio_lock'
+        */
+       KERNEL_ASSERT_LOCKED();
+       mtx_enter(&audio_lock);
        selwakeup(&buf->sel);
+       mtx_leave(&audio_lock);
 }
 
 int
@@ -1383,9 +1427,12 @@ audio_detach(struct device *self, int flags)
        if (sc->mode != 0) {
                if (sc->active) {
                        wakeup(&sc->play.blocking);
-                       selwakeup(&sc->play.sel);
+                       KERNEL_ASSERT_LOCKED();
+                       mtx_enter(&audio_lock);
                        wakeup(&sc->rec.blocking);
+                       selwakeup(&sc->play.sel);
                        selwakeup(&sc->rec.sel);
+                       mtx_leave(&audio_lock);
                        audio_stop(sc);
                }
                sc->ops->close(sc->arg);
@@ -1393,8 +1440,14 @@ audio_detach(struct device *self, int flags)
        }
        if (sc->mix_isopen) {
                wakeup(&sc->mix_blocking);
+               KERNEL_ASSERT_LOCKED();
+               mtx_enter(&audio_lock);
                selwakeup(&sc->mix_sel);
+               mtx_leave(&audio_lock);
        }
+       klist_invalidate(&sc->play.sel.si_note);
+       klist_invalidate(&sc->rec.sel.si_note);
+       klist_invalidate(&sc->mix_sel.si_note);
 
        /* free resources */
        softintr_disestablish(sc->mix_softintr);
@@ -2199,6 +2252,130 @@ audiopoll(dev_t dev, int events, struct proc *p)
        return revents;
 }
 
+int
+audiokqfilter(dev_t dev, struct knote *kn)
+{
+       struct audio_softc *sc;
+       struct klist      *klist;
+       int error;
+
+       sc = (struct audio_softc *)device_lookup(&audio_cd, AUDIO_UNIT(dev));
+       if (sc == NULL)
+               return ENXIO;
+       error = 0;
+       switch (AUDIO_DEV(dev)) {
+       case AUDIO_DEV_AUDIO:
+               switch (kn->kn_filter) {
+               case EVFILT_READ:
+                       klist = &sc->rec.sel.si_note;
+                       kn->kn_fop = &audioread_filtops;
+                       break;
+               case EVFILT_WRITE:
+                       klist = &sc->play.sel.si_note;
+                       kn->kn_fop = &audiowrite_filtops;
+                       break;
+               default:
+                       error = EINVAL;
+                       goto done;
+               }
+               break;
+       case AUDIO_DEV_AUDIOCTL:
+               switch (kn->kn_filter) {
+               case EVFILT_READ:
+                       klist = &sc->mix_sel.si_note;
+                       kn->kn_fop = &audioctlread_filtops;
+                       break;
+               default:
+                       error = EINVAL;
+                       goto done;
+               }
+               break;
+       default:
+               error = EINVAL;
+               goto done;
+       }
+       kn->kn_hook = sc;
+
+       mtx_enter(&audio_lock);
+       klist_insert(klist, kn);
+       mtx_leave(&audio_lock);
+done:
+       device_unref(&sc->dev);
+       return error;
+}
+
+void
+filt_audiordetach(struct knote *kn)
+{
+       struct audio_softc *sc = kn->kn_hook;
+
+       mtx_enter(&audio_lock);
+       klist_remove(&sc->rec.sel.si_note, kn);
+       mtx_leave(&audio_lock);
+}
+
+int
+filt_audioread(struct knote *kn, long hint)
+{
+       struct audio_softc *sc = kn->kn_hook;
+       int retval = 0;
+
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_enter(&audio_lock);
+       retval = (sc->mode & AUMODE_RECORD) && (sc->rec.used > 0);
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_leave(&audio_lock);
+
+       return retval;
+}
+
+void
+filt_audiowdetach(struct knote *kn)
+{
+       struct audio_softc *sc = kn->kn_hook;
+
+       mtx_enter(&audio_lock);
+       klist_remove(&sc->play.sel.si_note, kn);
+       mtx_leave(&audio_lock);
+}
+
+int
+filt_audiowrite(struct knote *kn, long hint)
+{
+       struct audio_softc *sc = kn->kn_hook;
+       int retval = 0;
+
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_enter(&audio_lock);
+       retval = (sc->mode & AUMODE_PLAY) && (sc->play.used < sc->play.len);
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_leave(&audio_lock);
+
+       return retval;
+}
+
+void
+filt_audioctlrdetach(struct knote *kn)
+{
+       struct audio_softc *sc = kn->kn_hook;
+
+       mtx_enter(&audio_lock);
+       klist_remove(&sc->mix_sel.si_note, kn);
+       mtx_leave(&audio_lock);
+}
+
+int
+filt_audioctlread(struct knote *kn, long hint)
+{
+       struct audio_softc *sc = kn->kn_hook;
+       int retval = 0;
+
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_enter(&audio_lock);
+       retval = (sc->mix_isopen && sc->mix_pending);
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_leave(&audio_lock);
+
+       return retval;
+}
+
 #if NWSKBD > 0
 int
 wskbd_initmute(struct audio_softc *sc, struct mixer_devinfo *vol)
diff --git sys/dev/cons.c sys/dev/cons.c
index c6694411df0..11ddff0591e 100644
--- sys/dev/cons.c
+++ sys/dev/cons.c
@@ -213,9 +213,7 @@ cnkqfilter(dev_t dev, struct knote *kn)
                return (ENXIO);
        else
                dev = cn_tab->cn_dev;
-       if (cdevsw[major(dev)].d_kqfilter)
-               return ((*cdevsw[major(dev)].d_kqfilter)(dev, kn));
-       return (EOPNOTSUPP);
+       return (ttkqfilter(dev, kn));
 }
 
 int
diff --git sys/dev/pci/cz.c sys/dev/pci/cz.c
index c9a01b94001..953de876412 100644
--- sys/dev/pci/cz.c
+++ sys/dev/pci/cz.c
@@ -1114,22 +1114,6 @@ czttywrite(dev_t dev, struct uio *uio, int flags)
        return ((*linesw[tp->t_line].l_write)(tp, uio, flags));
 }
 
-#if 0
-/*
- * czttypoll:
- *
- *     Poll a Cyclades-Z serial port.
- */
-int
-czttypoll(dev_t dev, int events, struct proc *p)
-{
-       struct cztty_softc *sc = CZTTY_SOFTC(dev);
-       struct tty *tp = sc->sc_tty;
- 
-       return ((*linesw[tp->t_line].l_poll)(tp, events, p));
-}
-#endif
-
 /*
  * czttyioctl:
  *
diff --git sys/dev/pci/drm/drm_drv.c sys/dev/pci/drm/drm_drv.c
index 11e72a0d5b6..67ee4ce1805 100644
--- sys/dev/pci/drm/drm_drv.c
+++ sys/dev/pci/drm/drm_drv.c
@@ -484,6 +484,35 @@ filt_drmkms(struct knote *kn, long hint)
        return (kn->kn_fflags != 0);
 }
 
+void
+filt_drmreaddetach(struct knote *kn)
+{
+       struct drm_file         *file_priv = kn->kn_hook;
+       int s;
+
+       s = spltty();
+       klist_remove(&file_priv->rsel.si_note, kn);
+       splx(s);
+}
+
+int
+filt_drmread(struct knote *kn, long hint)
+{
+       struct drm_file         *file_priv = kn->kn_hook;
+       int                      val = 0;
+
+#if notyet
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_enter(&file_priv->minor->dev->event_lock);
+#endif
+       val = !list_empty(&file_priv->event_list);
+#if notyet
+       if ((hint & NOTE_SUBMIT) == 0)
+               mtx_leave(&file_priv->minor->dev->event_lock);
+#endif
+       return (val);
+}
+
 const struct filterops drm_filtops = {
        .f_flags        = FILTEROP_ISFD,
        .f_attach       = NULL,
@@ -491,30 +520,51 @@ const struct filterops drm_filtops = {
        .f_event        = filt_drmkms,
 };
 
+const struct filterops drmread_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_drmreaddetach,
+       .f_event        = filt_drmread,
+};
+
 int
 drmkqfilter(dev_t kdev, struct knote *kn)
 {
        struct drm_device       *dev = NULL;
-       int s;
+       struct drm_file         *file_priv = NULL;
+       int                      s;
 
        dev = drm_get_device_from_kdev(kdev);
        if (dev == NULL || dev->dev_private == NULL)
                return (ENXIO);
 
        switch (kn->kn_filter) {
+       case EVFILT_READ:
+               mutex_lock(&dev->struct_mutex);
+               file_priv = drm_find_file_by_minor(dev, minor(kdev));
+               mutex_unlock(&dev->struct_mutex);
+               if (file_priv == NULL)
+                       return (ENXIO);
+
+               kn->kn_fop = &drmread_filtops;
+               kn->kn_hook = file_priv;
+
+               s = spltty();
+               klist_insert(&file_priv->rsel.si_note, kn);
+               splx(s);
+               break;
        case EVFILT_DEVICE:
                kn->kn_fop = &drm_filtops;
+               kn->kn_hook = dev;
+
+               s = spltty();
+               klist_insert(&dev->note, kn);
+               splx(s);
                break;
        default:
                return (EINVAL);
        }
 
-       kn->kn_hook = dev;
-
-       s = spltty();
-       klist_insert(&dev->note, kn);
-       splx(s);
-
        return (0);
 }
 
@@ -772,7 +822,6 @@ out:
        return (gotone);
 }
 
-/* XXX kqfilter ... */
 int
 drmpoll(dev_t kdev, int events, struct proc *p)
 {
diff --git sys/dev/sbus/magma.c sys/dev/sbus/magma.c
index 8aba38e9384..766c45846d3 100644
--- sys/dev/sbus/magma.c
+++ sys/dev/sbus/magma.c
@@ -1340,6 +1340,7 @@ mtty_param(struct tty *tp, struct termios *t)
  *     mbppwrite       write to mbpp
  *     mbppioctl       do ioctl on mbpp
  *     mbpppoll        do poll on mbpp
+ *     mbppkqfilter    kqueue on mbpp
  *     mbpp_rw         general rw routine
  *     mbpp_timeout    rw timeout
  *     mbpp_start      rw start after delay
@@ -1515,6 +1516,12 @@ mbpppoll(dev_t dev, int events, struct proc *p)
        return (seltrue(dev, events, p));
 }
 
+int
+mbppkqfilter(dev_t dev, struct knote *kn)
+{
+       return (seltrue_kqfilter(dev, kn));
+}
+
 int
 mbpp_rw(dev_t dev, struct uio *uio)
 {
diff --git sys/dev/sbus/spif.c sys/dev/sbus/spif.c
index 19f128420d7..6fcd8f37cbc 100644
--- sys/dev/sbus/spif.c
+++ sys/dev/sbus/spif.c
@@ -91,6 +91,7 @@ int   sbppwrite(dev_t, struct uio *, int);
 int    sbpp_rw(dev_t, struct uio *);
 int    spifppcintr(void *);
 int    sbpppoll(dev_t, int, struct proc *);
+int    sbppkqfilter(dev_t, struct knote *);
 int    sbppioctl(dev_t, u_long, caddr_t, int, struct proc *);
 
 struct cfattach spif_ca = {
@@ -1044,6 +1045,11 @@ sbpppoll(dev_t dev, int events, struct proc *p)
 {
        return (seltrue(dev, events, p));
 }
+int
+sbppkqfilter(dev_t dev, struct knote *kn)
+{
+       return (seltrue_kqfilter(dev, kn));
+}
 
 int
 sbppioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p)
diff --git sys/isofs/cd9660/cd9660_vnops.c sys/isofs/cd9660/cd9660_vnops.c
index f1d43c64bd0..e2fbacbf0f0 100644
--- sys/isofs/cd9660/cd9660_vnops.c
+++ sys/isofs/cd9660/cd9660_vnops.c
@@ -1036,6 +1036,9 @@ filt_cd9660read(struct knote *kn, long hint)
                return (1);
        }
 
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
        return (kn->kn_data != 0);
 }
 
diff --git sys/miscfs/deadfs/dead_vnops.c sys/miscfs/deadfs/dead_vnops.c
index 606960c726d..c5b9bcae1cf 100644
--- sys/miscfs/deadfs/dead_vnops.c
+++ sys/miscfs/deadfs/dead_vnops.c
@@ -34,6 +34,7 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/event.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/lock.h>
@@ -52,6 +53,7 @@ int   dead_read(void *);
 int    dead_write(void *);
 int    dead_ioctl(void *);
 int    dead_poll(void *);
+int    dead_kqfilter(void *v);
 int    dead_inactive(void *);
 int    dead_lock(void *);
 int    dead_bmap(void *);
@@ -73,6 +75,7 @@ const struct vops dead_vops = {
        .vop_write      = dead_write,
        .vop_ioctl      = dead_ioctl,
        .vop_poll       = dead_poll,
+       .vop_kqfilter   = dead_kqfilter,
        .vop_fsync      = nullop,
        .vop_remove     = dead_badop,
        .vop_link       = dead_badop,
@@ -167,6 +170,45 @@ dead_poll(void *v)
        return (POLLHUP);
 }
 
+void
+filt_deaddetach(struct knote *kn)
+{
+}
+
+int
+filt_deadrw(struct knote *kn, long hint)
+{
+       /*
+        * Let the user find out that the descriptor is gone.
+        */
+       kn->kn_flags |= (EV_EOF | EV_HUP);
+       return (1);
+}
+
+const struct filterops deadrw_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_deaddetach,
+       .f_event        = filt_deadrw,
+};
+
+int
+dead_kqfilter(void *v)
+{
+       struct vop_kqfilter_args *ap = v;
+
+       switch (ap->a_kn->kn_filter) {
+       case EVFILT_READ:
+       case EVFILT_WRITE:
+               ap->a_kn->kn_fop = &deadrw_filtops;
+               break;
+       default:
+               return (EINVAL);
+       }
+
+       return (0);
+}
+
 /*
  * Just call the device strategy routine
  */
diff --git sys/miscfs/fifofs/fifo_vnops.c sys/miscfs/fifofs/fifo_vnops.c
index 07ede492f47..21268799c5e 100644
--- sys/miscfs/fifofs/fifo_vnops.c
+++ sys/miscfs/fifofs/fifo_vnops.c
@@ -559,6 +559,8 @@ filt_fiforead(struct knote *kn, long hint)
        kn->kn_data = so->so_rcv.sb_cc;
        if (so->so_state & SS_CANTRCVMORE) {
                kn->kn_flags |= EV_EOF;
+               if (so->so_state & SS_ISDISCONNECTED)
+                       kn->kn_flags |= EV_HUP;
                rv = 1;
        } else {
                kn->kn_flags &= ~EV_EOF;
diff --git sys/miscfs/fuse/fuse_vnops.c sys/miscfs/fuse/fuse_vnops.c
index 6438cecd1c9..9b941d97cf5 100644
--- sys/miscfs/fuse/fuse_vnops.c
+++ sys/miscfs/fuse/fuse_vnops.c
@@ -188,6 +188,9 @@ filt_fusefsread(struct knote *kn, long hint)
                return (1);
        }
 
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
        return (kn->kn_data != 0);
 }
 
diff --git sys/msdosfs/msdosfs_vnops.c sys/msdosfs/msdosfs_vnops.c
index fc46268052e..60a3622d8c0 100644
--- sys/msdosfs/msdosfs_vnops.c
+++ sys/msdosfs/msdosfs_vnops.c
@@ -2013,6 +2013,10 @@ filt_msdosfsread(struct knote *kn, long hint)
                kn->kn_fflags |= NOTE_EOF;
                return (1);
        }
+
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
        return (kn->kn_data != 0);
 }
 
diff --git sys/net/if_pppx.c sys/net/if_pppx.c
index 4dcbe1d5695..b0d0663d5dc 100644
--- sys/net/if_pppx.c
+++ sys/net/if_pppx.c
@@ -1215,20 +1215,20 @@ static void     filt_pppac_rdetach(struct knote *);
 static int     filt_pppac_read(struct knote *, long);
 
 static const struct filterops pppac_rd_filtops = {
-       1,
-       NULL,
-       filt_pppac_rdetach,
-       filt_pppac_read
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_pppac_rdetach,
+       .f_event        = filt_pppac_read,
 };
 
 static void    filt_pppac_wdetach(struct knote *);
 static int     filt_pppac_write(struct knote *, long);
 
 static const struct filterops pppac_wr_filtops = {
-       1,
-       NULL,
-       filt_pppac_wdetach,
-       filt_pppac_write
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_pppac_wdetach,
+       .f_event        = filt_pppac_write,
 };
 
 static struct pppac_list pppac_devs = LIST_HEAD_INITIALIZER(pppac_devs);
diff --git sys/nfs/nfs_kq.c sys/nfs/nfs_kq.c
index f43211e76c3..b07ff318bcf 100644
--- sys/nfs/nfs_kq.c
+++ sys/nfs/nfs_kq.c
@@ -32,183 +32,26 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
 #include <sys/mount.h>
-#include <sys/malloc.h>
 #include <sys/vnode.h>
-#include <sys/unistd.h>
 #include <sys/file.h>
-#include <sys/kthread.h>
-#include <sys/rwlock.h>
-#include <sys/queue.h>
 
-#include <nfs/rpcv2.h>
 #include <nfs/nfsproto.h>
 #include <nfs/nfs.h>
 #include <nfs/nfsnode.h>
 #include <nfs/nfs_var.h>
 
-void   nfs_kqpoll(void *);
-
 void   filt_nfsdetach(struct knote *);
 int    filt_nfsread(struct knote *, long);
+int    filt_nfswrite(struct knote *, long);
 int    filt_nfsvnode(struct knote *, long);
 
-struct kevq {
-       SLIST_ENTRY(kevq)       kev_link;
-       struct vnode            *vp;
-       u_int                   usecount;
-       u_int                   flags;
-#define KEVQ_BUSY      0x01    /* currently being processed */
-#define KEVQ_WANT      0x02    /* want to change this entry */
-       struct timespec         omtime; /* old modification time */
-       struct timespec         octime; /* old change time */
-       nlink_t                 onlink; /* old number of references to file */
-};
-SLIST_HEAD(kevqlist, kevq);
-
-struct rwlock nfskevq_lock = RWLOCK_INITIALIZER("nfskqlk");
-struct proc *pnfskq;
-struct kevqlist kevlist = SLIST_HEAD_INITIALIZER(kevlist);
-
-/*
- * This quite simplistic routine periodically checks for server changes
- * of any of the watched files every NFS_MINATTRTIMO/2 seconds.
- * Only changes in size, modification time, change time and nlinks
- * are being checked, everything else is ignored.
- * The routine only calls VOP_GETATTR() when it's likely it would get
- * some new data, i.e. when the vnode expires from attrcache. This
- * should give same result as periodically running stat(2) from userland,
- * while keeping CPU/network usage low, and still provide proper kevent
- * semantics.
- * The poller thread is created when first vnode is added to watch list,
- * and exits when the watch list is empty. The overhead of thread creation
- * isn't really important, neither speed of attach and detach of knote.
- */
-/* ARGSUSED */
-void
-nfs_kqpoll(void *arg)
-{
-       struct kevq *ke;
-       struct vattr attr;
-       struct proc *p = pnfskq;
-       u_quad_t osize;
-       int error;
-
-       for(;;) {
-               rw_enter_write(&nfskevq_lock);
-               SLIST_FOREACH(ke, &kevlist, kev_link) {
-                       struct nfsnode *np = VTONFS(ke->vp);
-
-#ifdef DEBUG
-                       printf("nfs_kqpoll on: ");
-                       VOP_PRINT(ke->vp);
-#endif
-                       /* skip if still in attrcache */
-                       if (nfs_getattrcache(ke->vp, &attr) != ENOENT)
-                               continue;
-
-                       /*
-                        * Mark entry busy, release lock and check
-                        * for changes.
-                        */
-                       ke->flags |= KEVQ_BUSY;
-                       rw_exit_write(&nfskevq_lock);
-
-                       /* save v_size, nfs_getattr() updates it */
-                       osize = np->n_size;
-
-                       error = VOP_GETATTR(ke->vp, &attr, p->p_ucred, p);
-                       if (error == ESTALE) {
-                               NFS_INVALIDATE_ATTRCACHE(np);
-                               VN_KNOTE(ke->vp, NOTE_DELETE);
-                               goto next;
-                       }
-
-                       /* following is a bit fragile, but about best
-                        * we can get */
-                       if (attr.va_size != osize) {
-                               int flags = NOTE_WRITE;
-
-                               if (attr.va_size > osize)
-                                       flags |= NOTE_EXTEND;
-                               else
-                                       flags |= NOTE_TRUNCATE;
-
-                               VN_KNOTE(ke->vp, flags);
-                               ke->omtime = attr.va_mtime;
-                       } else if (attr.va_mtime.tv_sec != ke->omtime.tv_sec
-                           || attr.va_mtime.tv_nsec != ke->omtime.tv_nsec) {
-                               VN_KNOTE(ke->vp, NOTE_WRITE);
-                               ke->omtime = attr.va_mtime;
-                       }
-
-                       if (attr.va_ctime.tv_sec != ke->octime.tv_sec
-                           || attr.va_ctime.tv_nsec != ke->octime.tv_nsec) {
-                               VN_KNOTE(ke->vp, NOTE_ATTRIB);
-                               ke->octime = attr.va_ctime;
-                       }
-
-                       if (attr.va_nlink != ke->onlink) {
-                               VN_KNOTE(ke->vp, NOTE_LINK);
-                               ke->onlink = attr.va_nlink;
-                       }
-
-next:
-                       rw_enter_write(&nfskevq_lock);
-                       ke->flags &= ~KEVQ_BUSY;
-                       if (ke->flags & KEVQ_WANT) {
-                               ke->flags &= ~KEVQ_WANT;
-                               wakeup(ke);
-                       }
-               }
-
-               if (SLIST_EMPTY(&kevlist)) {
-                       /* Nothing more to watch, exit */
-                       pnfskq = NULL;
-                       rw_exit_write(&nfskevq_lock);
-                       kthread_exit(0);
-               }
-               rw_exit_write(&nfskevq_lock);
-
-               /* wait a while before checking for changes again */
-               tsleep_nsec(pnfskq, PSOCK, "nfskqpw",
-                   SEC_TO_NSEC(NFS_MINATTRTIMO) / 2);
-       }
-}
-
 void
 filt_nfsdetach(struct knote *kn)
 {
        struct vnode *vp = (struct vnode *)kn->kn_hook;
-       struct kevq *ke;
 
        klist_remove(&vp->v_selectinfo.si_note, kn);
-
-       /* Remove the vnode from watch list */
-       rw_enter_write(&nfskevq_lock);
-       SLIST_FOREACH(ke, &kevlist, kev_link) {
-               if (ke->vp == vp) {
-                       while (ke->flags & KEVQ_BUSY) {
-                               ke->flags |= KEVQ_WANT;
-                               rw_exit_write(&nfskevq_lock);
-                               tsleep_nsec(ke, PSOCK, "nfskqdet", INFSLP);
-                               rw_enter_write(&nfskevq_lock);
-                       }
-
-                       if (ke->usecount > 1) {
-                               /* keep, other kevents need this */
-                               ke->usecount--;
-                       } else {
-                               /* last user, g/c */
-                               SLIST_REMOVE(&kevlist, ke, kevq, kev_link);
-                               free(ke, M_KEVENT, sizeof(*ke));
-                       }
-                       break;
-               }
-       }
-       rw_exit_write(&nfskevq_lock);
 }
 
 int
@@ -227,16 +70,33 @@ filt_nfsread(struct knote *kn, long hint)
        }
 
        kn->kn_data = np->n_size - foffset(kn->kn_fp);
-#ifdef DEBUG
-       printf("nfsread event. %lld\n", kn->kn_data);
-#endif
        if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
                kn->kn_fflags |= NOTE_EOF;
                return (1);
        }
+
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
         return (kn->kn_data != 0);
 }
 
+int
+filt_nfswrite(struct knote *kn, long hint)
+{
+       /*
+        * filesystem is gone, so set the EOF flag and schedule
+        * the knote for deletion.
+        */
+       if (hint == NOTE_REVOKE) {
+               kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+               return (1);
+       }
+
+       kn->kn_data = 0;
+       return (1);
+}
+
 int
 filt_nfsvnode(struct knote *kn, long hint)
 {
@@ -256,6 +116,13 @@ static const struct filterops nfsread_filtops = {
        .f_event        = filt_nfsread,
 };
 
+static const struct filterops nfswrite_filtops = {
+       .f_flags        = FILTEROP_ISFD,
+       .f_attach       = NULL,
+       .f_detach       = filt_nfsdetach,
+       .f_event        = filt_nfswrite,
+};
+
 static const struct filterops nfsvnode_filtops = {
        .f_flags        = FILTEROP_ISFD,
        .f_attach       = NULL,
@@ -267,25 +134,16 @@ int
 nfs_kqfilter(void *v)
 {
        struct vop_kqfilter_args *ap = v;
-       struct vnode *vp;
-       struct knote *kn;
-       struct kevq *ke;
-       int error = 0;
-       struct vattr attr;
-       struct proc *p = curproc;       /* XXX */
-
-       vp = ap->a_vp;
-       kn = ap->a_kn;
-
-#ifdef DEBUG
-       printf("nfs_kqfilter(%d) on: ", kn->kn_filter);
-       VOP_PRINT(vp);
-#endif
+       struct vnode *vp = ap->a_vp;
+       struct knote *kn = ap->a_kn;
 
        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &nfsread_filtops;
                break;
+       case EVFILT_WRITE:
+               kn->kn_fop = &nfswrite_filtops;
+               break;
        case EVFILT_VNODE:
                kn->kn_fop = &nfsvnode_filtops;
                break;
@@ -295,53 +153,7 @@ nfs_kqfilter(void *v)
 
        kn->kn_hook = vp;
 
-       /*
-        * Put the vnode to watched list.
-        */
-       
-       /*
-        * Fetch current attributes. It's only needed when the vnode
-        * is not watched yet, but we need to do this without lock
-        * held. This is likely cheap due to attrcache, so do it now.
-        */ 
-       memset(&attr, 0, sizeof(attr));
-       (void) VOP_GETATTR(vp, &attr, p->p_ucred, p);
-
-       rw_enter_write(&nfskevq_lock);
-
-       /* ensure the poller is running */
-       if (!pnfskq) {
-               error = kthread_create(nfs_kqpoll, NULL, &pnfskq,
-                               "nfskqpoll");
-               if (error)
-                       goto out;
-       }
-
-       SLIST_FOREACH(ke, &kevlist, kev_link)
-               if (ke->vp == vp)
-                       break;
-
-       if (ke) {
-               /* already watched, so just bump usecount */
-               ke->usecount++;
-       } else {
-               /* need a new one */
-               ke = malloc(sizeof(*ke), M_KEVENT, M_WAITOK);
-               ke->vp = vp;
-               ke->usecount = 1;
-               ke->flags = 0;
-               ke->omtime = attr.va_mtime;
-               ke->octime = attr.va_ctime;
-               ke->onlink = attr.va_nlink;
-               SLIST_INSERT_HEAD(&kevlist, ke, kev_link);
-       }
-
-       /* kick the poller */
-       wakeup(pnfskq);
-
        klist_insert(&vp->v_selectinfo.si_note, kn);
 
-out:
-       rw_exit_write(&nfskevq_lock);
-       return (error);
+       return (0);
 }
diff --git sys/tmpfs/tmpfs_vnops.c sys/tmpfs/tmpfs_vnops.c
index c7a40d10333..aae8499b3f1 100644
--- sys/tmpfs/tmpfs_vnops.c
+++ sys/tmpfs/tmpfs_vnops.c
@@ -2665,6 +2665,9 @@ filt_tmpfsread(struct knote *kn, long hint)
                return (1);
        }
 
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
        return (kn->kn_data != 0);
 }
 
diff --git sys/ufs/mfs/mfs_vnops.c sys/ufs/mfs/mfs_vnops.c
index f085f1874ca..d495d2fdf9c 100644
--- sys/ufs/mfs/mfs_vnops.c
+++ sys/ufs/mfs/mfs_vnops.c
@@ -60,6 +60,7 @@ const struct vops mfs_vops = {
         .vop_write      = mfs_badop,
         .vop_ioctl      = mfs_ioctl,
         .vop_poll       = mfs_badop,
+        .vop_kqfilter   = mfs_badop,
         .vop_revoke     = mfs_revoke,
         .vop_fsync      = spec_fsync,
         .vop_remove     = mfs_badop,
diff --git sys/ufs/ufs/ufs_vnops.c sys/ufs/ufs/ufs_vnops.c
index a651fa9f065..7c6d4eecc07 100644
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -1973,6 +1973,9 @@ filt_ufsread(struct knote *kn, long hint)
                return (1);
        }
 
+       if (kn->kn_sfflags & NOTE_IMM)
+               return (1);
+
        return (kn->kn_data != 0);
 }
 

Reply via email to