Diff below changes the internals of *poll(2) and *select(2) to use the
*_kqfilter() handlers instead of *_poll() ones. Events are stored on a
private per-thread kqueue then converted to the corresponding "fd_set"
or "pollfd". The design is similar to DragonFly's solution.
The main argument for this proposal is to reduce the amount of code
executed to notify userland when an event occurs. The outcome of this
diff is that a single notification subsystem needs to be taken out of
the KERNEL_LOCK(). This simplifies a lot existing locking tentacles.
Using kqueue internally means collision is avoided and there's no need
to query handlers for fds that aren't ready. This comes at the cost of
allocating descriptors. A space vs time trade-off. Note that this cost
can be diminished by doing lazy removal of event descriptors to be able
to re-use them.
A lot of kqueue(2) related cleanups and fixes are required and included
below. To keep it small, the diff doesn't contain the removal of the
*_poll() handlers and abstraction layers.
Although I've been running with this diff for over a month now I'm not
asking for oks yet. I'm sharing it to get feedback, input and for
interested people to try it out.
I'm thinking of continuing to push the kqueue(2) fixes in the tree
because those are worth having even without this. If we agree this is
a beneficial change, I'm suggesting a 3-step plan to integrate it:
- Discuss & integrate the kqfilter handlers changes required
- Convert *select(2) first because the interface is simpler and there are
fewer possibilities of regression compared to *poll(2)
- Convert *poll(2)
Depending on tests and findings regarding performance, I can work on
lazy removal of event descriptors prior to or after integrating *poll(2).
I'd like the help of somebody with a heavy poll(2) use case, like bgpd(8).
Comments?
diff --git sys/kern/kern_event.c sys/kern/kern_event.c
index 0bed8e5f671..68c35f0d6a4 100644
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -57,14 +57,13 @@
#include <sys/timeout.h>
#include <sys/wait.h>
+void kqueue_terminate(struct proc *p, struct kqueue *);
+void kqueue_free(struct kqueue *);
void kqueue_init(void);
void KQREF(struct kqueue *);
void KQRELE(struct kqueue *);
int kqueue_sleep(struct kqueue *, struct timespec *);
-int kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
- struct kevent *ulistp, struct timespec *timeout,
- struct proc *p, int *retval);
int kqueue_read(struct file *, struct uio *, int);
int kqueue_write(struct file *, struct uio *, int);
@@ -156,6 +155,7 @@ const struct filterops *const sysfilt_ops[] = {
&sig_filtops, /* EVFILT_SIGNAL */
&timer_filtops, /* EVFILT_TIMER */
&file_filtops, /* EVFILT_DEVICE */
+ &file_filtops, /* EVFILT_EXCEPT */
};
void
@@ -181,8 +181,13 @@ KQRELE(struct kqueue *kq)
fdpunlock(fdp);
}
- free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize *
- sizeof(struct knlist));
+ kqueue_free(kq);
+}
+
+void
+kqueue_free(struct kqueue *kq)
+{
+ free(kq->kq_knlist, M_KEVENT, kq->kq_knlistsize * sizeof(struct klist));
hashfree(kq->kq_knhash, KN_HASHSIZE, M_KEVENT);
pool_put(&kqueue_pool, kq);
}
@@ -492,13 +497,10 @@ static const struct filterops dead_filtops = {
.f_event = filt_dead,
};
-int
-sys_kqueue(struct proc *p, void *v, register_t *retval)
+struct kqueue *
+kqueue_alloc(struct filedesc *fdp)
{
- struct filedesc *fdp = p->p_fd;
struct kqueue *kq;
- struct file *fp;
- int fd, error;
kq = pool_get(&kqueue_pool, PR_WAITOK | PR_ZERO);
kq->kq_refs = 1;
@@ -506,6 +508,27 @@ sys_kqueue(struct proc *p, void *v, register_t *retval)
TAILQ_INIT(&kq->kq_head);
task_set(&kq->kq_task, kqueue_task, kq);
+ return (kq);
+}
+
+void
+kqueue_exit(struct proc *p)
+{
+ kqueue_terminate(p, p->p_kq);
+ kqueue_free(p->p_kq);
+ p->p_kq = NULL;
+}
+
+int
+sys_kqueue(struct proc *p, void *v, register_t *retval)
+{
+ struct filedesc *fdp = p->p_fd;
+ struct kqueue *kq;
+ struct file *fp;
+ int fd, error;
+
+ kq = kqueue_alloc(fdp);
+
fdplock(fdp);
error = falloc(p, &fp, &fd);
if (error)
@@ -545,6 +568,7 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
struct timespec ts;
struct timespec *tsp = NULL;
int i, n, nerrors, error;
+ int ready, total;
struct kevent kev[KQ_NEVENTS];
if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
@@ -573,9 +597,9 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
kq = fp->f_data;
nerrors = 0;
- while (SCARG(uap, nchanges) > 0) {
- n = SCARG(uap, nchanges) > KQ_NEVENTS ?
- KQ_NEVENTS : SCARG(uap, nchanges);
+ while ((n = SCARG(uap, nchanges)) > 0) {
+ if (n > nitems(kev))
+ n = nitems(kev);
error = copyin(SCARG(uap, changelist), kev,
n * sizeof(struct kevent));
if (error)
@@ -611,14 +635,39 @@ sys_kevent(struct proc *p, void *v, register_t *retval)
goto done;
}
+
KQREF(kq);
FRELE(fp, p);
+ /*
+ * Collect as many events as we can. The timeout on successive
+ * loops is disabled (kqueue_scan() becomes non-blocking).
+ */
+ total = 0;
+ error = 0;
kqueue_scan_setup(&scan, kq);
- error = kqueue_scan(&scan, SCARG(uap, nevents), SCARG(uap, eventlist),
- tsp, p, &n);
+ while ((n = SCARG(uap, nevents) - total) > 0) {
+ if (n > nitems(kev))
+ n = nitems(kev);
+ ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
+ if (ready == 0)
+ break;
+ error = copyout(kev, SCARG(uap, eventlist) + total,
+ sizeof(struct kevent) * ready);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kev, ready);
+#endif
+ total += ready;
+ if (error || ready < n)
+ break;
+ tsp = &ts; /* successive loops non-blocking */
+ timespecclear(tsp);
+ }
kqueue_scan_finish(&scan);
KQRELE(kq);
- *retval = n;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ *retval = total;
return (error);
done:
@@ -872,27 +921,28 @@ kqueue_sleep(struct kqueue *kq, struct timespec *tsp)
return (error);
}
+/*
+ * Scan the kqueue, blocking if necessary until the target time is reached.
+ * If tsp is NULL we block indefinitely. If tsp->tv_sec/tv_nsec are both
+ * 0 we do not block at all.
+ */
int
kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
- struct kevent *ulistp, struct timespec *tsp, struct proc *p, int *retval)
+ struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
{
- struct kevent *kevp;
struct knote *kn;
struct kqueue *kq = scan->kqs_kq;
int s, count, nkev = 0, error = 0;
- struct kevent kev[KQ_NEVENTS];
count = maxevents;
if (count == 0)
goto done;
-
retry:
if (kq->kq_state & KQ_DYING) {
error = EBADF;
goto done;
}
- kevp = &kev[0];
s = splhigh();
if (kq->kq_count == 0) {
if ((tsp != NULL && !timespecisset(tsp)) ||
@@ -904,7 +954,7 @@ retry:
kq->kq_state |= KQ_SLEEP;
error = kqueue_sleep(kq, tsp);
splx(s);
- if (error == 0 || error == EWOULDBLOCK)
+ if (error == 0)
goto retry;
/* don't restart after signals... */
if (error == ERESTART)
@@ -976,6 +1026,9 @@ retry:
count--;
scan->kqs_nevent++;
+ /*
+ * Post-event action on the note
+ */
if (kn->kn_flags & EV_ONESHOT) {
splx(s);
kn->kn_fop->f_detach(kn);
@@ -1001,35 +1054,14 @@ retry:
knote_release(kn);
}
kqueue_check(kq);
- if (nkev == KQ_NEVENTS) {
- splx(s);
-#ifdef KTRACE
- if (KTRPOINT(p, KTR_STRUCT))
- ktrevent(p, kev, nkev);
-#endif
- error = copyout(kev, ulistp,
- sizeof(struct kevent) * nkev);
- ulistp += nkev;
- nkev = 0;
- kevp = &kev[0];
- s = splhigh();
- if (error)
- break;
- }
}
TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
splx(s);
+ if (scan->kqs_nevent == 0)
+ goto retry;
done:
- if (nkev != 0) {
-#ifdef KTRACE
- if (KTRPOINT(p, KTR_STRUCT))
- ktrevent(p, kev, nkev);
-#endif
- error = copyout(kev, ulistp,
- sizeof(struct kevent) * nkev);
- }
- *retval = maxevents - count;
- return (error);
+ *errorp = error;
+ return (nkev);
}
void
@@ -1115,13 +1147,12 @@ kqueue_stat(struct file *fp, struct stat *st, struct
proc *p)
return (0);
}
-int
-kqueue_close(struct file *fp, struct proc *p)
+void
+kqueue_purge(struct proc *p, struct kqueue *kq)
{
- struct kqueue *kq = fp->f_data;
int i;
- KERNEL_LOCK();
+ KERNEL_ASSERT_LOCKED();
for (i = 0; i < kq->kq_knlistsize; i++)
knote_remove(p, &kq->kq_knlist[i]);
@@ -1129,14 +1160,29 @@ kqueue_close(struct file *fp, struct proc *p)
for (i = 0; i < kq->kq_knhashmask + 1; i++)
knote_remove(p, &kq->kq_knhash[i]);
}
- fp->f_data = NULL;
+}
+void
+kqueue_terminate(struct proc *p, struct kqueue *kq)
+{
+ kqueue_purge(p, kq);
kq->kq_state |= KQ_DYING;
kqueue_wakeup(kq);
KASSERT(klist_empty(&kq->kq_sel.si_note));
task_del(systq, &kq->kq_task);
+}
+
+int
+kqueue_close(struct file *fp, struct proc *p)
+{
+ struct kqueue *kq = fp->f_data;
+
+ KERNEL_LOCK();
+ kqueue_terminate(p, kq);
+ fp->f_data = NULL;
+
KQRELE(kq);
KERNEL_UNLOCK();
diff --git sys/kern/kern_exit.c sys/kern/kern_exit.c
index 66ffad0de02..313890df808 100644
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -184,6 +184,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
if ((p->p_flag & P_THREAD) == 0)
pr->ps_siglist = 0;
+ kqueue_exit(p);
+
#if NKCOV > 0
kcov_exit(p);
#endif
diff --git sys/kern/kern_fork.c sys/kern/kern_fork.c
index 6cfb39b6252..3a278321df4 100644
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -422,6 +422,8 @@ fork1(struct proc *curp, int flags, void (*func)(void *),
void *arg,
newptstat = malloc(sizeof(*newptstat), M_SUBPROC, M_WAITOK);
p->p_tid = alloctid();
+ p->p_kq = kqueue_alloc(p->p_fd);
+ p->p_kq_serial = arc4random();
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
@@ -553,6 +555,8 @@ thread_fork(struct proc *curp, void *stack, void *tcb,
pid_t *tidptr,
cpu_fork(curp, p, stack, tcb, child_return, p);
p->p_tid = alloctid();
+ p->p_kq = kqueue_alloc(p->p_fd);
+ p->p_kq_serial = arc4random();
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
diff --git sys/kern/spec_vnops.c sys/kern/spec_vnops.c
index 887a7acb641..ac2ed144c20 100644
--- sys/kern/spec_vnops.c
+++ sys/kern/spec_vnops.c
@@ -386,11 +386,9 @@ spec_poll(void *v)
dev_t dev;
switch (ap->a_vp->v_type) {
-
default:
return (ap->a_events &
(POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
-
case VCHR:
dev = ap->a_vp->v_rdev;
return (*cdevsw[major(dev)].d_poll)(dev, ap->a_events, ap->a_p);
@@ -400,12 +398,17 @@ int
spec_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
-
dev_t dev;
dev = ap->a_vp->v_rdev;
- if (cdevsw[major(dev)].d_kqfilter)
- return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
+
+ switch (ap->a_vp->v_type) {
+ default:
+ return seltrue_kqfilter(dev, ap->a_kn);
+ case VCHR:
+ if (cdevsw[major(dev)].d_kqfilter)
+ return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
+ }
return (EOPNOTSUPP);
}
diff --git sys/kern/sys_generic.c sys/kern/sys_generic.c
index 477d8a433d4..427dc7e2001 100644
--- sys/kern/sys_generic.c
+++ sys/kern/sys_generic.c
@@ -55,6 +55,7 @@
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
+#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -66,14 +67,27 @@
#include <uvm/uvm_extern.h>
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ * 1 - print implementation errors, things that should not happen.
+ * 2 - print ppoll(2) information, somewhat verbose
+ * 3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) { \
+ printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
+ printf(x); \
+}
+int pselregister(struct proc *, fd_set *, int, int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[]);
+int ppollregister(struct proc *, struct pollfd *, int, int *);
+int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
+
int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
const sigset_t *, register_t *);
-void doselwakeup(struct selinfo *);
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
@@ -584,11 +598,11 @@ int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
+ struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
- struct timespec elapsed, start, stop;
- uint64_t nsecs;
- int s, ncoll, error = 0;
+ struct timespec ts;
+ int error, nevents = 0;
u_int ni;
if (nd < 0)
@@ -636,43 +650,61 @@ dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou,
fd_set *ex,
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
-retry:
- ncoll = nselcoll;
- atomic_setbits_int(&p->p_flag, P_SELECT);
- error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
- if (error || *retval)
+ /* Register kqueue events */
+ if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
goto done;
- if (timeout == NULL || timespecisset(timeout)) {
- if (timeout != NULL) {
- getnanouptime(&start);
+
+ /*
+ * The poll/select family of syscalls has been designed to
+ * block when file descriptors are not available, even if
+ * there's nothing to wait for.
+ */
+ if (nevents == 0) {
+ uint64_t nsecs = INFSLP;
+
+ if (timeout != NULL)
nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
- } else
- nsecs = INFSLP;
- s = splhigh();
- if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
- splx(s);
- goto retry;
- }
- atomic_clearbits_int(&p->p_flag, P_SELECT);
- error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
- splx(s);
- if (timeout != NULL) {
- getnanouptime(&stop);
- timespecsub(&stop, &start, &elapsed);
- timespecsub(timeout, &elapsed, timeout);
- if (timeout->tv_sec < 0)
- timespecclear(timeout);
- }
- if (error == 0 || error == EWOULDBLOCK)
- goto retry;
+
+ error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
}
-done:
- atomic_clearbits_int(&p->p_flag, P_SELECT);
- /* select is not restarted after signals... */
- if (error == ERESTART)
- error = EINTR;
+
+ /* Collect at most `nevents' possibly waiting in kqueue_scan() */
+ kqueue_scan_setup(&scan, p->p_kq);
+ while (nevents > 0) {
+ struct kevent kev[KQ_NEVENTS];
+ int i, ready, count;
+
+ /* Maximum number of events per iteration */
+ count = MIN(nitems(kev), nevents);
+ ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kev, ready);
+#endif
+ /* Convert back events that are ready. */
+ for (i = 0; i < ready; i++)
+ *retval += pselcollect(p, &kev[i], pobits);
+
+ /*
+ * Stop if there was an error or if we had enough
+ * room to collect all events that were ready.
+ */
+ if (error || ready < count)
+ break;
+
+ timeout = &ts; /* successive loops non-blocking */
+ timespecclear(timeout);
+
+ nevents -= ready;
+ }
+ kqueue_scan_finish(&scan);
+
if (error == EWOULDBLOCK)
error = 0;
+ done:
#define putbits(name, x) \
if (name && (error2 = copyout(pobits[x], name, ni))) \
error = error2;
@@ -694,41 +726,107 @@ done:
if (pibits[0] != (fd_set *)&bits[0])
free(pibits[0], M_TEMP, 6 * ni);
+
+ kqueue_purge(p, p->p_kq);
+ p->p_kq_serial += nd;
+
return (error);
}
+/*
+ * Convert fd_set into kqueue events and register them on the
+ * per-thread queue.
+ */
int
-selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
- register_t *retval)
+pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int *nregistered)
{
- caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
- struct filedesc *fdp = p->p_fd;
- int msk, i, j, fd;
+ static const struct {
+ int filter;
+ int fflags;
+ } evf[] = {
+ { EVFILT_READ, NOTE_IMM },
+ { EVFILT_WRITE, NOTE_IMM },
+ { EVFILT_EXCEPT, 0 }
+ };
+ caddr_t cibits = (caddr_t)ibits;
+ int msk, i, j, fd, nevents = 0, error = 0;
+ struct kevent kev;
fd_mask bits;
- struct file *fp;
- int n = 0;
- static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
for (msk = 0; msk < 3; msk++) {
fd_set *pibits = (fd_set *)&cibits[msk*ni];
- fd_set *pobits = (fd_set *)&cobits[msk*ni];
for (i = 0; i < nfd; i += NFDBITS) {
bits = pibits->fds_bits[i/NFDBITS];
while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
bits &= ~(1 << j);
- if ((fp = fd_getfile(fdp, fd)) == NULL)
- return (EBADF);
- if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
- FD_SET(fd, pobits);
- n++;
+
+ DPRINTFN(2, "select fd %d mask %d serial %lu\n",
+ fd, msk, p->p_kq_serial);
+ EV_SET(&kev, fd, evf[msk].filter,
+ EV_ADD|EV_ENABLE, evf[msk].fflags, 0,
+ (void *)(p->p_kq_serial));
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, &kev, 1);
+#endif
+ error = kqueue_register(p->p_kq, &kev, p);
+ switch (error) {
+ case 0:
+ nevents++;
+ case EOPNOTSUPP:/* No underlying kqfilter */
+ case EINVAL: /* Unimplemented filter */
+ error = 0;
+ break;
+ case ENXIO: /* Device has been detached */
+ default:
+ goto bad;
}
- FRELE(fp, p);
}
}
}
- *retval = n;
+
+ *nregistered = nevents;
return (0);
+bad:
+ DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
+ kev.filter, error);
+ return (error);
+}
+
+/*
+ * Convert given kqueue event into corresponding select(2) bit.
+ */
+int
+pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
+{
+#ifdef DIAGNOSTIC
+ /* Filter out and lazily delete spurious events */
+ if ((unsigned long)kevp->udata != p->p_kq_serial) {
+ DPRINTFN(0, "select fd %u mismatched serial %lu\n",
+ (int)kevp->ident, p->p_kq_serial);
+ kevp->flags = EV_DISABLE|EV_DELETE;
+ kqueue_register(p->p_kq, kevp, p);
+ return (0);
+ }
+#endif
+
+ switch (kevp->filter) {
+ case EVFILT_READ:
+ FD_SET(kevp->ident, pobits[0]);
+ break;
+ case EVFILT_WRITE:
+ FD_SET(kevp->ident, pobits[1]);
+ break;
+ case EVFILT_EXCEPT:
+ FD_SET(kevp->ident, pobits[2]);
+ break;
+ default:
+ KASSERT(0);
+ }
+
+ DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
+ return (1);
}
int
@@ -774,59 +872,113 @@ selwakeup(struct selinfo *sip)
{
KERNEL_LOCK();
KNOTE(&sip->si_note, NOTE_SUBMIT);
- doselwakeup(sip);
KERNEL_UNLOCK();
}
-void
-doselwakeup(struct selinfo *sip)
+int
+ppollregister_evt(struct proc *p, struct kevent *kevp, int nkev,
+ struct pollfd *pl)
{
- struct proc *p;
+ int i, error, nevents = 0;
- KERNEL_ASSERT_LOCKED();
+ KASSERT(pl->revents == 0);
- if (sip->si_seltid == 0)
- return;
- if (sip->si_flags & SI_COLL) {
- nselcoll++;
- sip->si_flags &= ~SI_COLL;
- wakeup(&selwait);
- }
- p = tfind(sip->si_seltid);
- sip->si_seltid = 0;
- if (p != NULL) {
- if (wakeup_proc(p, &selwait)) {
- /* nothing else to do */
- } else if (p->p_flag & P_SELECT)
- atomic_clearbits_int(&p->p_flag, P_SELECT);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kevp, nkev);
+#endif
+ for (i = 0; i < nkev; i++, kevp++) {
+ error = kqueue_register(p->p_kq, kevp, p);
+ switch (error) {
+ case 0:
+ nevents++;
+ break;
+ case EOPNOTSUPP:/* No underlying kqfilter */
+ case EINVAL: /* Unimplemented filter */
+ break;
+ case EBADF: /* Bad file descriptor */
+ pl->revents |= POLLNVAL;
+ break;
+ case EPIPE: /* Specific to pipes */
+ KASSERT(kevp->filter == EVFILT_WRITE);
+ pl->revents |= POLLHUP;
+ break;
+ default:
+#ifdef DIAGNOSTIC
+ DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
+ " %lu filt %d ERROR=%d\n",
+ ((unsigned long)kevp->udata - p->p_kq_serial),
+ pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
+ error);
+#endif
+ /* FALLTHROUGH */
+ case ENXIO: /* Device has been detached */
+ pl->revents |= POLLERR;
+ break;
+ }
}
+
+ return (nevents);
}
-void
-pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
+/*
+ * Convert pollfd into kqueue events and register them on the
+ * per-thread queue.
+ *
+ * Return the number of pollfd that triggered at least one error and aren't
+ * completely monitored. These pollfd should have the corresponding error bit
+ * set in `revents'.
+ *
+ * At most 3 events can correspond to a single pollfd.
+ */
+int
+ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered)
{
- struct filedesc *fdp = p->p_fd;
- struct file *fp;
- u_int i;
- int n = 0;
+ int i, nkev, nevt, errcount = 0;
+ struct kevent kev[3], *kevp;
- for (i = 0; i < nfd; i++, pl++) {
- /* Check the file descriptor. */
- if (pl->fd < 0) {
- pl->revents = 0;
+ for (i = 0; i < nfds; i++) {
+ pl[i].events &= ~POLL_NOHUP;
+ pl[i].revents = 0;
+
+ if (pl[i].fd < 0)
continue;
+
+ DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
+ i, nfds-1, pl[i].fd, pl[i].events, p->p_kq_serial);
+
+ nevt = 0;
+ nkev = 0;
+ kevp = kev;
+ if (pl[i].events & (POLLIN | POLLRDNORM)) {
+ EV_SET(kevp, pl[i].fd, EVFILT_READ, EV_ADD|EV_ENABLE,
+ NOTE_IMM, 0, (void *)(p->p_kq_serial + i));
+ nkev++;
+ kevp++;
}
- if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
- pl->revents = POLLNVAL;
- n++;
- continue;
+ if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+ EV_SET(kevp, pl[i].fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
+ NOTE_IMM, 0, (void *)(p->p_kq_serial + i));
+ nkev++;
+ kevp++;
}
- pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
- FRELE(fp, p);
- if (pl->revents != 0)
- n++;
+ if (pl[i].events & (POLLPRI | POLLRDBAND)) {
+ EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
+ 0, 0, (void *)(p->p_kq_serial + i));
+ nkev++;
+ kevp++;
+ }
+
+ if (nkev == 0)
+ continue;
+
+ nevt = ppollregister_evt(p, kev, nkev, &pl[i]);
+ if (nevt == 0)
+ errcount++;
+ *nregistered += nevt;
}
- *retval = n;
+
+ return (errcount);
}
/*
@@ -916,11 +1068,11 @@ int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
- size_t sz;
+ struct kqueue_scan_state scan;
struct pollfd pfds[4], *pl = pfds;
- struct timespec elapsed, start, stop;
- uint64_t nsecs;
- int ncoll, i, s, error;
+ struct timespec ts;
+ int error, nevents = 0;
+ size_t sz;
/* Standards say no more than MAX_OPEN; this is possibly better. */
if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
@@ -939,53 +1091,65 @@ doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
if ((error = copyin(fds, pl, sz)) != 0)
goto bad;
- for (i = 0; i < nfds; i++) {
- pl[i].events &= ~POLL_NOHUP;
- pl[i].revents = 0;
- }
-
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
-retry:
- ncoll = nselcoll;
- atomic_setbits_int(&p->p_flag, P_SELECT);
- pollscan(p, pl, nfds, retval);
- if (*retval)
- goto done;
- if (timeout == NULL || timespecisset(timeout)) {
- if (timeout != NULL) {
- getnanouptime(&start);
+ /* Register kqueue events */
+ *retval = ppollregister(p, pl, nfds, &nevents);
+
+ /*
+ * The poll/select family of syscalls has been designed to
+ * block when file descriptors are not available, even if
+ * there's nothing to wait for.
+ */
+ if (nevents == 0) {
+ uint64_t nsecs = INFSLP;
+
+ if (timeout != NULL)
nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
- } else
- nsecs = INFSLP;
- s = splhigh();
- if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
- splx(s);
- goto retry;
- }
- atomic_clearbits_int(&p->p_flag, P_SELECT);
- error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
- splx(s);
- if (timeout != NULL) {
- getnanouptime(&stop);
- timespecsub(&stop, &start, &elapsed);
- timespecsub(timeout, &elapsed, timeout);
- if (timeout->tv_sec < 0)
- timespecclear(timeout);
- }
- if (error == 0 || error == EWOULDBLOCK)
- goto retry;
+
+ error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqpoll", nsecs);
+ if (error == ERESTART)
+ error = EINTR;
+ }
+
+ /* Collect at most `nevents' possibly waiting in kqueue_scan() */
+ kqueue_scan_setup(&scan, p->p_kq);
+ while (nevents > 0) {
+ struct kevent kev[KQ_NEVENTS];
+ int i, ready, count;
+
+ /* Maximum number of events per iteration */
+ count = MIN(nitems(kev), nevents);
+ ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kev, ready);
+#endif
+ /* Convert back events that are ready. */
+ for (i = 0; i < ready; i++)
+ *retval += ppollcollect(p, &kev[i], pl, nfds);
+
+ /*
+ * Stop if there was an error or if we had enough
+ * room to collect all events that were ready.
+ */
+ if (error || ready < count)
+ break;
+
+ timeout = &ts; /* successive loops non-blocking */
+ timespecclear(timeout);
+
+ nevents -= ready;
}
+ kqueue_scan_finish(&scan);
-done:
- atomic_clearbits_int(&p->p_flag, P_SELECT);
/*
* NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
* ignored (since the whole point is to see what would block).
*/
switch (error) {
- case ERESTART:
+ case EINTR:
error = pollout(pl, fds, nfds);
if (error == 0)
error = EINTR;
@@ -1002,9 +1166,89 @@ done:
bad:
if (pl != pfds)
free(pl, M_TEMP, sz);
+
+ kqueue_purge(p, p->p_kq);
+ p->p_kq_serial += nfds;
+
return (error);
}
+/*
+ * Convert given kqueue event into corresponding poll(2) revents bit.
+ */
+int
+ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int
nfds)
+{
+ int already_seen;
+ unsigned long i;
+
+ /* Extract poll array index */
+ i = (unsigned long)kevp->udata - p->p_kq_serial;
+
+#ifdef DIAGNOSTIC
+ /*
+ * Lazily delete spurious events.
+ *
+ * This should not happen as long as kqueue_purge() is called
+ * at the end of every syscall. It might be interesting to do
+ * as DragonFlyBSD does and not always allocate a new knote in
+ * kqueue_register(); with that, lazy removal makes sense.
+ */
+ if (i >= nfds) {
+ DPRINTFN(0, "poll get out of range udata %lu vs serial %lu\n",
+ (unsigned long)kevp->udata, p->p_kq_serial);
+ kevp->flags = EV_DISABLE|EV_DELETE;
+ kqueue_register(p->p_kq, kevp, p);
+ return (0);
+ }
+ if ((int)kevp->ident != pl[i].fd) {
+ DPRINTFN(0, "poll get %lu/%d mismatch fd %u!=%d serial %lu\n",
+ i, nfds-1, (int)kevp->ident, pl[i].fd, p->p_kq_serial);
+ return (0);
+ }
+#endif
+
+ /*
+ * A given descriptor may already have generated an error
+ * against another filter during kqueue_register().
+ *
+ * Make sure to set the appropriate flags but do not
+ * increment `*retval' more than once.
+ */
+ already_seen = (pl[i].revents != 0);
+
+ switch (kevp->filter) {
+ case EVFILT_READ:
+ if (kevp->flags & EV_HUP)
+ pl[i].revents |= POLLHUP;
+ if (pl[i].events & (POLLIN | POLLRDNORM))
+ pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
+ break;
+ case EVFILT_WRITE:
+ /* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
+ if (kevp->flags & EV_HUP) {
+ pl[i].revents |= POLLHUP;
+ } else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+ pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
+ }
+ break;
+ case EVFILT_EXCEPT:
+ if (pl[i].events & (POLLPRI | POLLRDBAND))
+ pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
+ break;
+ default:
+ KASSERT(0);
+ }
+
+ DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
+ i, nfds-1, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
+ kevp->filter);
+ if (!already_seen && (pl[i].revents != 0))
+ return (1);
+
+ return (0);
+}
+
/*
* utrace system call
*/
diff --git sys/kern/sys_pipe.c sys/kern/sys_pipe.c
index fc221bfd8f1..a1ade981c9f 100644
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -966,7 +966,7 @@ filt_piperead(struct knote *kn, long hint)
if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
if ((hint & NOTE_SUBMIT) == 0)
rw_exit_read(lock);
- kn->kn_flags |= EV_EOF;
+ kn->kn_flags |= (EV_EOF | EV_HUP);
return (1);
}
@@ -990,7 +990,7 @@ filt_pipewrite(struct knote *kn, long hint)
if ((hint & NOTE_SUBMIT) == 0)
rw_exit_read(lock);
kn->kn_data = 0;
- kn->kn_flags |= EV_EOF;
+ kn->kn_flags |= (EV_EOF | EV_HUP);
return (1);
}
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
diff --git sys/kern/tty.c sys/kern/tty.c
index e9f98f87e01..caa3113e995 100644
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -1155,7 +1155,7 @@ filt_ttyread(struct knote *kn, long hint)
kn->kn_data = ttnread(tp);
splx(s);
if (!ISSET(tp->t_cflag, CLOCAL) && !ISSET(tp->t_state, TS_CARR_ON)) {
- kn->kn_flags |= EV_EOF;
+ kn->kn_flags |= (EV_EOF | EV_HUP);
return (1);
}
return (kn->kn_data > 0);
diff --git sys/kern/tty_pty.c sys/kern/tty_pty.c
index 2152bcde86a..89cef33fd4c 100644
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -107,6 +107,7 @@ void filt_ptcrdetach(struct knote *);
int filt_ptcread(struct knote *, long);
void filt_ptcwdetach(struct knote *);
int filt_ptcwrite(struct knote *, long);
+int filt_ptcexcept(struct knote *, long);
static struct pt_softc **ptyarralloc(int);
static int check_pty(int);
@@ -677,7 +678,7 @@ filt_ptcread(struct knote *kn, long hint)
}
if (!ISSET(tp->t_state, TS_CARR_ON)) {
- kn->kn_flags |= EV_EOF;
+ kn->kn_flags |= (EV_EOF | EV_HUP);
return (1);
}
@@ -708,7 +709,8 @@ filt_ptcwrite(struct knote *kn, long hint)
if (ISSET(pti->pt_flags, PF_REMOTE)) {
if (tp->t_canq.c_cc == 0)
kn->kn_data = tp->t_canq.c_cn;
- } else if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG(tp)-2)
+ } else if ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG(tp)-2) ||
+ (tp->t_canq.c_cc == 0 && ISSET(tp->t_lflag, ICANON)))
kn->kn_data = tp->t_canq.c_cn -
(tp->t_rawq.c_cc + tp->t_canq.c_cc);
}
@@ -716,6 +718,23 @@ filt_ptcwrite(struct knote *kn, long hint)
return (kn->kn_data > 0);
}
+int
+filt_ptcexcept(struct knote *kn, long hint)
+{
+ struct pt_softc *pti = (struct pt_softc *)kn->kn_hook;
+ struct tty *tp;
+
+ tp = pti->pt_tty;
+ kn->kn_data = 0;
+
+ /* If in packet or user control mode, check for data. */
+ if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
+ ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))
+ kn->kn_data = 1;
+
+ return (kn->kn_data > 0);
+}
+
const struct filterops ptcread_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
@@ -730,6 +749,13 @@ const struct filterops ptcwrite_filtops = {
.f_event = filt_ptcwrite,
};
+const struct filterops ptcexcept_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_ptcrdetach,
+ .f_event = filt_ptcexcept,
+};
+
int
ptckqfilter(dev_t dev, struct knote *kn)
{
@@ -746,6 +772,10 @@ ptckqfilter(dev_t dev, struct knote *kn)
klist = &pti->pt_selw.si_note;
kn->kn_fop = &ptcwrite_filtops;
break;
+ case EVFILT_EXCEPT:
+ klist = &pti->pt_selr.si_note;
+ kn->kn_fop = &ptcexcept_filtops;
+ break;
default:
return (EINVAL);
}
diff --git sys/kern/tty_tty.c sys/kern/tty_tty.c
index 2aa99721cfc..3906cdbcb83 100644
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -159,6 +159,6 @@ cttykqfilter(dev_t dev, struct knote *kn)
struct vnode *ttyvp = cttyvp(curproc);
if (ttyvp == NULL)
- return (ENXIO);
+ return (seltrue_kqfilter(dev, kn));
return (VOP_KQFILTER(ttyvp, FREAD|FWRITE, kn));
}
diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index d6f2eb6ca3c..076549d9cc9 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -71,6 +71,7 @@ int filt_soread(struct knote *kn, long hint);
void filt_sowdetach(struct knote *kn);
int filt_sowrite(struct knote *kn, long hint);
int filt_solisten(struct knote *kn, long hint);
+int filt_soexcept(struct knote *kn, long hint);
const struct filterops solisten_filtops = {
.f_flags = FILTEROP_ISFD,
@@ -93,6 +94,12 @@ const struct filterops sowrite_filtops = {
.f_event = filt_sowrite,
};
+const struct filterops soexcept_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_sordetach,
+ .f_event = filt_soexcept,
+};
#ifndef SOMINCONN
#define SOMINCONN 80
@@ -2026,6 +2033,10 @@ soo_kqfilter(struct file *fp, struct knote *kn)
kn->kn_fop = &sowrite_filtops;
sb = &so->so_snd;
break;
+ case EVFILT_EXCEPT:
+ kn->kn_fop = &soexcept_filtops;
+ sb = &so->so_rcv;
+ break;
default:
return (EINVAL);
}
@@ -2056,6 +2067,7 @@ filt_soread(struct knote *kn, long hint)
if ((hint & NOTE_SUBMIT) == 0)
s = solock(so);
+
kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
if (isspliced(so)) {
@@ -2064,6 +2076,8 @@ filt_soread(struct knote *kn, long hint)
#endif /* SOCKET_SPLICE */
if (so->so_state & SS_CANTRCVMORE) {
kn->kn_flags |= EV_EOF;
+ if (so->so_state & SS_ISDISCONNECTED)
+ kn->kn_flags |= EV_HUP;
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error) { /* temporary udp error */
@@ -2102,6 +2116,8 @@ filt_sowrite(struct knote *kn, long hint)
kn->kn_data = sbspace(so, &so->so_snd);
if (so->so_state & SS_CANTSENDMORE) {
kn->kn_flags |= EV_EOF;
+ if (so->so_state & SS_ISDISCONNECTED)
+ kn->kn_flags |= EV_HUP;
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error) { /* temporary udp error */
@@ -2135,6 +2151,21 @@ filt_solisten(struct knote *kn, long hint)
return (kn->kn_data != 0);
}
+int
+filt_soexcept(struct knote *kn, long hint)
+{
+ struct socket *so = kn->kn_fp->f_data;
+ int s;
+
+ if ((hint & NOTE_SUBMIT) == 0)
+ s = solock(so);
+ kn->kn_data = (so->so_oobmark || (so->so_state & SS_RCVATMARK));
+ if ((hint & NOTE_SUBMIT) == 0)
+ sounlock(so, s);
+
+ return (kn->kn_data != 0);
+}
+
#ifdef DDB
void
sobuf_print(struct sockbuf *,
diff --git sys/sys/conf.h sys/sys/conf.h
index 6531db84b22..847b1bb8fd8 100644
--- sys/sys/conf.h
+++ sys/sys/conf.h
@@ -312,7 +312,7 @@ extern struct cdevsw cdevsw[];
dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
dev_init(c,n,write), dev_init(c,n,ioctl), \
(dev_type_stop((*))) enodev, 0, dev_init(c,n,poll), \
- (dev_type_mmap((*))) enodev }
+ (dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) }
/* open, close, read, write, ioctl, poll, kqfilter */
#define cdev_midi_init(c,n) { \
@@ -390,12 +390,12 @@ extern struct cdevsw cdevsw[];
(dev_type_stop((*))) enodev, 0, selfalse, \
(dev_type_mmap((*))) enodev }
-/* open, close, ioctl, read, mmap, poll */
+/* open, close, ioctl, read, mmap, poll, kqfilter */
#define cdev_video_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
(dev_type_stop((*))) enodev, 0, dev_init(c,n,poll), \
- dev_init(c,n,mmap) }
+ dev_init(c,n,mmap), 0, 0, dev_init(c,n,kqfilter) }
/* open, close, write, ioctl */
#define cdev_spkr_init(c,n) { \
@@ -439,7 +439,7 @@ extern struct cdevsw cdevsw[];
(dev_type_stop((*))) enodev, 0, selfalse, \
(dev_type_mmap((*))) enodev }
-/* open, close, read, ioctl, poll, mmap, nokqfilter */
+/* open, close, read, ioctl, poll, mmap, kqfilter */
#define cdev_drm_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), dev_init(c, n, read), \
(dev_type_write((*))) enodev, dev_init(c,n,ioctl), \
diff --git sys/sys/event.h sys/sys/event.h
index 51487835ced..41c521474e6 100644
--- sys/sys/event.h
+++ sys/sys/event.h
@@ -39,6 +39,7 @@
#define EVFILT_SIGNAL (-6) /* attached to struct process */
#define EVFILT_TIMER (-7) /* timers */
#define EVFILT_DEVICE (-8) /* devices */
+#define EVFILT_EXCEPT (-9) /* exceptional conditions */
-#define EVFILT_SYSCOUNT 8
+#define EVFILT_SYSCOUNT 9
@@ -75,6 +76,7 @@ struct kevent {
#define EV_SYSFLAGS 0xF000 /* reserved by system */
#define EV_FLAG1 0x2000 /* filter-specific flag */
+#define EV_HUP EV_FLAG1 /* device or socket disconnected */
/* returned values */
#define EV_EOF 0x8000 /* EOF detected */
@@ -128,6 +130,10 @@ struct klist {
};
#ifdef _KERNEL
+/*
+ * data/hint flags for EVFILT_{READ|WRITE}, not shared with userspace
+ */
+#define NOTE_IMM 0x1000 /* Immediate read event */
#define EVFILT_MARKER 0xf /* placemarker for tailq */
@@ -199,6 +205,7 @@ struct kqueue_scan_state {
};
struct proc;
+struct filedesc;
extern const struct filterops sig_filtops;
@@ -207,10 +214,17 @@ extern void knote_activate(struct knote *);
extern void knote_remove(struct proc *p, struct knlist *list);
extern void knote_fdclose(struct proc *p, int fd);
extern void knote_processexit(struct proc *);
+extern struct kqueue *kqueue_alloc(struct filedesc *);
+extern void kqueue_exit(struct proc *);
extern int kqueue_register(struct kqueue *kq,
struct kevent *kev, struct proc *p);
extern void kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
extern void kqueue_scan_finish(struct kqueue_scan_state *);
+extern int kqueue_scan(struct kqueue_scan_state *, int, struct kevent *,
+ struct timespec *, struct proc *, int *);
+extern void kqueue_purge(struct proc *, struct kqueue *);
extern int filt_seltrue(struct knote *kn, long hint);
extern int seltrue_kqfilter(dev_t, struct knote *);
extern void klist_insert(struct klist *, struct knote *);
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..424f8240b1f 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -315,6 +315,7 @@ struct process {
struct kcov_dev;
struct lock_list_entry;
+struct kqueue;
struct p_inentry {
u_long ie_serial;
@@ -377,6 +378,8 @@ struct proc {
struct plimit *p_limit; /* [l] read ref. of p_p->ps_limit */
struct kcov_dev *p_kd; /* kcov device handle */
struct lock_list_entry *p_sleeplocks; /* WITNESS lock tracking */
+ struct kqueue *p_kq; /* for select/poll */
+ unsigned long p_kq_serial; /* for select/poll */
int p_siglist; /* Signals arrived but not delivered. */
diff --git sys/arch/sparc64/dev/vldcp.c sys/arch/sparc64/dev/vldcp.c
index 91255700c5f..0dba0267028 100644
--- sys/arch/sparc64/dev/vldcp.c
+++ sys/arch/sparc64/dev/vldcp.c
@@ -70,6 +70,11 @@ struct vldcp_softc {
int vldcp_match(struct device *, void *, void *);
void vldcp_attach(struct device *, struct device *, void *);
+void filt_vldcprdetach(struct knote *);
+void filt_vldcpwdetach(struct knote *);
+int filt_vldcpread(struct knote *, long);
+int filt_vldcpwrite(struct knote *, long);
+int vldcpkqfilter(dev_t, struct knote *);
struct cfattach vldcp_ca = {
sizeof(struct vldcp_softc), vldcp_match, vldcp_attach
@@ -615,3 +620,121 @@ vldcppoll(dev_t dev, int events, struct proc *p)
splx(s);
return revents;
}
+
+void
+filt_vldcprdetach(struct knote *kn)
+{
+ struct vldcp_softc *sc = (void *)kn->kn_hook;
+ int s;
+
+ s = spltty();
+ klist_remove(&sc->sc_rsel.si_note, kn);
+ splx(s);
+}
+
+void
+filt_vldcpwdetach(struct knote *kn)
+{
+ struct vldcp_softc *sc = (void *)kn->kn_hook;
+ int s;
+
+ s = spltty();
+ klist_remove(&sc->sc_wsel.si_note, kn);
+ splx(s);
+}
+
+int
+filt_vldcpread(struct knote *kn, long hint)
+{
+ struct vldcp_softc *sc = (void *)kn->kn_hook;
+ struct ldc_conn *lc = &sc->sc_lc;
+ uint64_t head, tail, avail, state;
+ int s, err;
+
+ s = spltty();
+ err = hv_ldc_rx_get_state(lc->lc_id, &head, &tail, &state);
+ if (err == 0 && state == LDC_CHANNEL_UP && head != tail) {
+ avail = (head - tail) / sizeof(struct ldc_pkt);
+ avail = (avail + lc->lc_rxq->lq_nentries - 1) %
+ lc->lc_rxq->lq_nentries;
+ kn->kn_data = avail;
+ } else {
+ cbus_intr_setenabled(sc->sc_bustag, sc->sc_rx_ino,
+ INTR_ENABLED);
+ }
+ splx(s);
+
+ return (kn->kn_data > 0);
+}
+
+int
+filt_vldcpwrite(struct knote *kn, long hint)
+{
+ struct vldcp_softc *sc = (void *)kn->kn_hook;
+ struct ldc_conn *lc = &sc->sc_lc;
+ uint64_t head, tail, avail, state;
+ int s, err;
+
+ s = spltty();
+ err = hv_ldc_tx_get_state(lc->lc_id, &head, &tail, &state);
+ if (err == 0 && state == LDC_CHANNEL_UP && head != tail) {
+ avail = (head - tail) / sizeof(struct ldc_pkt);
+ avail = (avail + lc->lc_txq->lq_nentries - 1) %
+ lc->lc_txq->lq_nentries;
+ kn->kn_data = avail;
+ } else {
+ cbus_intr_setenabled(sc->sc_bustag, sc->sc_tx_ino,
+ INTR_ENABLED);
+ }
+ splx(s);
+
+ return (kn->kn_data > 0);
+}
+
+const struct filterops vldcpread_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_vldcprdetach,
+ .f_event = filt_vldcpread,
+};
+
+const struct filterops vldcpwrite_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_vldcpwdetach,
+ .f_event = filt_vldcpwrite,
+};
+
+int
+vldcpkqfilter(dev_t dev, struct knote *kn)
+{
+ struct vldcp_softc *sc;
+ struct klist *klist;
+ int s;
+
+ sc = vldcp_lookup(dev);
+ if (sc == NULL)
+ return (ENXIO);
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ klist = &sc->sc_rsel.si_note;
+ kn->kn_fop = &vldcpread_filtops;
+ break;
+ case EVFILT_WRITE:
+ klist = &sc->sc_wsel.si_note;
+ kn->kn_fop = &vldcpwrite_filtops;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ kn->kn_hook = sc;
+
+ s = spltty();
+ klist_insert(klist, kn);
+ splx(s);
+
+ return (0);
+}
diff --git sys/arch/sparc64/include/conf.h sys/arch/sparc64/include/conf.h
index 558c0c22ef7..57fd1699cb6 100644
--- sys/arch/sparc64/include/conf.h
+++ sys/arch/sparc64/include/conf.h
@@ -64,7 +64,8 @@ cdev_decl(vdsp);
#define cdev_gen_init(c,n) { \
dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
dev_init(c,n,write), dev_init(c,n,ioctl), (dev_type_stop((*))) nullop, \
- 0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev }
+ 0, dev_init(c,n,poll), (dev_type_mmap((*))) enodev, \
+ 0, 0, dev_init(c,n,kqfilter) }
cdev_decl(cn);
diff --git sys/dev/audio.c sys/dev/audio.c
index 7e061b7eb95..93fc848ab82 100644
--- sys/dev/audio.c
+++ sys/dev/audio.c
@@ -165,6 +165,36 @@ struct cfdriver audio_cd = {
NULL, "audio", DV_DULL
};
+void filt_audioctlrdetach(struct knote *);
+int filt_audioctlread(struct knote *, long);
+
+const struct filterops audioctlread_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_audioctlrdetach,
+ .f_event = filt_audioctlread,
+};
+
+void filt_audiowdetach(struct knote *);
+int filt_audiowrite(struct knote *, long);
+
+const struct filterops audiowrite_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_audiowdetach,
+ .f_event = filt_audiowrite,
+};
+
+void filt_audiordetach(struct knote *);
+int filt_audioread(struct knote *, long);
+
+const struct filterops audioread_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_audiordetach,
+ .f_event = filt_audioread,
+};
+
/*
* This mutex protects data structures (including registers on the
* sound-card) that are manipulated by both the interrupt handler and
@@ -243,7 +273,14 @@ audio_mixer_wakeup(void *addr)
wakeup(&sc->mix_blocking);
sc->mix_blocking = 0;
}
+ /*
+ * As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
+ * already held here to avoid lock ordering problems with `audio_lock'
+ */
+ KERNEL_ASSERT_LOCKED();
+ mtx_enter(&audio_lock);
selwakeup(&sc->mix_sel);
+ mtx_leave(&audio_lock);
}
void
@@ -255,7 +292,14 @@ audio_buf_wakeup(void *addr)
wakeup(&buf->blocking);
buf->blocking = 0;
}
+ /*
+ * As long as selwakeup() grabs the KERNEL_LOCK() make sure it is
+ * already held here to avoid lock ordering problems with `audio_lock'
+ */
+ KERNEL_ASSERT_LOCKED();
+ mtx_enter(&audio_lock);
selwakeup(&buf->sel);
+ mtx_leave(&audio_lock);
}
int
@@ -1383,9 +1427,12 @@ audio_detach(struct device *self, int flags)
if (sc->mode != 0) {
if (sc->active) {
wakeup(&sc->play.blocking);
- selwakeup(&sc->play.sel);
+ KERNEL_ASSERT_LOCKED();
+ mtx_enter(&audio_lock);
wakeup(&sc->rec.blocking);
+ selwakeup(&sc->play.sel);
selwakeup(&sc->rec.sel);
+ mtx_leave(&audio_lock);
audio_stop(sc);
}
sc->ops->close(sc->arg);
@@ -1393,8 +1440,14 @@ audio_detach(struct device *self, int flags)
}
if (sc->mix_isopen) {
wakeup(&sc->mix_blocking);
+ KERNEL_ASSERT_LOCKED();
+ mtx_enter(&audio_lock);
selwakeup(&sc->mix_sel);
+ mtx_leave(&audio_lock);
}
+ klist_invalidate(&sc->play.sel.si_note);
+ klist_invalidate(&sc->rec.sel.si_note);
+ klist_invalidate(&sc->mix_sel.si_note);
/* free resources */
softintr_disestablish(sc->mix_softintr);
@@ -2199,6 +2252,130 @@ audiopoll(dev_t dev, int events, struct proc *p)
return revents;
}
+int
+audiokqfilter(dev_t dev, struct knote *kn)
+{
+ struct audio_softc *sc;
+ struct klist *klist;
+ int error;
+
+ sc = (struct audio_softc *)device_lookup(&audio_cd, AUDIO_UNIT(dev));
+ if (sc == NULL)
+ return ENXIO;
+ error = 0;
+ switch (AUDIO_DEV(dev)) {
+ case AUDIO_DEV_AUDIO:
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ klist = &sc->rec.sel.si_note;
+ kn->kn_fop = &audioread_filtops;
+ break;
+ case EVFILT_WRITE:
+ klist = &sc->play.sel.si_note;
+ kn->kn_fop = &audiowrite_filtops;
+ break;
+ default:
+ error = EINVAL;
+ goto done;
+ }
+ break;
+ case AUDIO_DEV_AUDIOCTL:
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ klist = &sc->mix_sel.si_note;
+ kn->kn_fop = &audioctlread_filtops;
+ break;
+ default:
+ error = EINVAL;
+ goto done;
+ }
+ break;
+ default:
+ error = ENXIO;
+ goto done;
+ }
+ kn->kn_hook = sc;
+
+ mtx_enter(&audio_lock);
+ klist_insert(klist, kn);
+ mtx_leave(&audio_lock);
+done:
+ device_unref(&sc->dev);
+ return error;
+}
+
+void
+filt_audiordetach(struct knote *kn)
+{
+ struct audio_softc *sc = kn->kn_hook;
+
+ mtx_enter(&audio_lock);
+ klist_remove(&sc->rec.sel.si_note, kn);
+ mtx_leave(&audio_lock);
+}
+
+int
+filt_audioread(struct knote *kn, long hint)
+{
+ struct audio_softc *sc = kn->kn_hook;
+ int retval = 0;
+
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_enter(&audio_lock);
+ retval = (sc->mode & AUMODE_RECORD) && (sc->rec.used > 0);
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_leave(&audio_lock);
+
+ return retval;
+}
+
+void
+filt_audiowdetach(struct knote *kn)
+{
+ struct audio_softc *sc = kn->kn_hook;
+
+ mtx_enter(&audio_lock);
+ klist_remove(&sc->play.sel.si_note, kn);
+ mtx_leave(&audio_lock);
+}
+
+int
+filt_audiowrite(struct knote *kn, long hint)
+{
+ struct audio_softc *sc = kn->kn_hook;
+ int retval = 0;
+
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_enter(&audio_lock);
+ retval = (sc->mode & AUMODE_PLAY) && (sc->play.used < sc->play.len);
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_leave(&audio_lock);
+
+ return retval;
+}
+
+void
+filt_audioctlrdetach(struct knote *kn)
+{
+ struct audio_softc *sc = kn->kn_hook;
+
+ mtx_enter(&audio_lock);
+ klist_remove(&sc->mix_sel.si_note, kn);
+ mtx_leave(&audio_lock);
+}
+
+int
+filt_audioctlread(struct knote *kn, long hint)
+{
+ struct audio_softc *sc = kn->kn_hook;
+ int retval = 0;
+
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_enter(&audio_lock);
+ retval = (sc->mix_isopen && sc->mix_pending);
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_leave(&audio_lock);
+
+ return retval;
+}
+
#if NWSKBD > 0
int
wskbd_initmute(struct audio_softc *sc, struct mixer_devinfo *vol)
diff --git sys/dev/cons.c sys/dev/cons.c
index c6694411df0..11ddff0591e 100644
--- sys/dev/cons.c
+++ sys/dev/cons.c
@@ -213,9 +213,7 @@ cnkqfilter(dev_t dev, struct knote *kn)
return (ENXIO);
else
dev = cn_tab->cn_dev;
- if (cdevsw[major(dev)].d_kqfilter)
- return ((*cdevsw[major(dev)].d_kqfilter)(dev, kn));
- return (EOPNOTSUPP);
+ return (ttkqfilter(dev, kn));
}
int
diff --git sys/dev/pci/cz.c sys/dev/pci/cz.c
index c9a01b94001..953de876412 100644
--- sys/dev/pci/cz.c
+++ sys/dev/pci/cz.c
@@ -1114,22 +1114,6 @@ czttywrite(dev_t dev, struct uio *uio, int flags)
return ((*linesw[tp->t_line].l_write)(tp, uio, flags));
}
-#if 0
-/*
- * czttypoll:
- *
- * Poll a Cyclades-Z serial port.
- */
-int
-czttypoll(dev_t dev, int events, struct proc *p)
-{
- struct cztty_softc *sc = CZTTY_SOFTC(dev);
- struct tty *tp = sc->sc_tty;
-
- return ((*linesw[tp->t_line].l_poll)(tp, events, p));
-}
-#endif
-
/*
* czttyioctl:
*
diff --git sys/dev/pci/drm/drm_drv.c sys/dev/pci/drm/drm_drv.c
index 11e72a0d5b6..67ee4ce1805 100644
--- sys/dev/pci/drm/drm_drv.c
+++ sys/dev/pci/drm/drm_drv.c
@@ -484,6 +484,35 @@ filt_drmkms(struct knote *kn, long hint)
return (kn->kn_fflags != 0);
}
+void
+filt_drmreaddetach(struct knote *kn)
+{
+ struct drm_file *file_priv = kn->kn_hook;
+ int s;
+
+ s = spltty();
+ klist_remove(&file_priv->rsel.si_note, kn);
+ splx(s);
+}
+
+int
+filt_drmread(struct knote *kn, long hint)
+{
+ struct drm_file *file_priv = kn->kn_hook;
+ int val = 0;
+
+#if notyet
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_enter(&file_priv->minor->dev->event_lock);
+#endif
+ val = !list_empty(&file_priv->event_list);
+#if notyet
+ if ((hint & NOTE_SUBMIT) == 0)
+ mtx_leave(&file_priv->minor->dev->event_lock);
+#endif
+ return (val);
+}
+
const struct filterops drm_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
@@ -491,30 +520,51 @@ const struct filterops drm_filtops = {
.f_event = filt_drmkms,
};
+const struct filterops drmread_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_drmreaddetach,
+ .f_event = filt_drmread,
+};
+
int
drmkqfilter(dev_t kdev, struct knote *kn)
{
struct drm_device *dev = NULL;
- int s;
+ struct drm_file *file_priv = NULL;
+ int s;
dev = drm_get_device_from_kdev(kdev);
if (dev == NULL || dev->dev_private == NULL)
return (ENXIO);
switch (kn->kn_filter) {
+ case EVFILT_READ:
+ mutex_lock(&dev->struct_mutex);
+ file_priv = drm_find_file_by_minor(dev, minor(kdev));
+ mutex_unlock(&dev->struct_mutex);
+ if (file_priv == NULL)
+ return (ENXIO);
+
+ kn->kn_fop = &drmread_filtops;
+ kn->kn_hook = file_priv;
+
+ s = spltty();
+ klist_insert(&file_priv->rsel.si_note, kn);
+ splx(s);
+ break;
case EVFILT_DEVICE:
kn->kn_fop = &drm_filtops;
+ kn->kn_hook = dev;
+
+ s = spltty();
+ klist_insert(&dev->note, kn);
+ splx(s);
break;
default:
return (EINVAL);
}
- kn->kn_hook = dev;
-
- s = spltty();
- klist_insert(&dev->note, kn);
- splx(s);
-
return (0);
}
@@ -772,7 +822,6 @@ out:
return (gotone);
}
-/* XXX kqfilter ... */
int
drmpoll(dev_t kdev, int events, struct proc *p)
{
diff --git sys/dev/sbus/magma.c sys/dev/sbus/magma.c
index 8aba38e9384..766c45846d3 100644
--- sys/dev/sbus/magma.c
+++ sys/dev/sbus/magma.c
@@ -1340,6 +1340,7 @@ mtty_param(struct tty *tp, struct termios *t)
* mbppwrite write to mbpp
* mbppioctl do ioctl on mbpp
* mbpppoll do poll on mbpp
+ * mbppkqfilter kqueue on mbpp
* mbpp_rw general rw routine
* mbpp_timeout rw timeout
* mbpp_start rw start after delay
@@ -1515,6 +1516,12 @@ mbpppoll(dev_t dev, int events, struct proc *p)
return (seltrue(dev, events, p));
}
+int
+mbppkqfilter(dev_t dev, struct knote *kn)
+{
+ return (seltrue_kqfilter(dev, kn));
+}
+
int
mbpp_rw(dev_t dev, struct uio *uio)
{
diff --git sys/dev/sbus/spif.c sys/dev/sbus/spif.c
index 19f128420d7..6fcd8f37cbc 100644
--- sys/dev/sbus/spif.c
+++ sys/dev/sbus/spif.c
@@ -91,6 +91,7 @@ int sbppwrite(dev_t, struct uio *, int);
int sbpp_rw(dev_t, struct uio *);
int spifppcintr(void *);
int sbpppoll(dev_t, int, struct proc *);
+int sbppkqfilter(dev_t, struct knote *);
int sbppioctl(dev_t, u_long, caddr_t, int, struct proc *);
struct cfattach spif_ca = {
@@ -1044,6 +1045,11 @@ sbpppoll(dev_t dev, int events, struct proc *p)
{
return (seltrue(dev, events, p));
}
+int
+sbppkqfilter(dev_t dev, struct knote *kn)
+{
+ return (seltrue_kqfilter(dev, kn));
+}
int
sbppioctl(dev_t dev, u_long cmd, caddr_t data, int flags, struct proc *p)
diff --git sys/isofs/cd9660/cd9660_vnops.c sys/isofs/cd9660/cd9660_vnops.c
index f1d43c64bd0..e2fbacbf0f0 100644
--- sys/isofs/cd9660/cd9660_vnops.c
+++ sys/isofs/cd9660/cd9660_vnops.c
@@ -1036,6 +1036,9 @@ filt_cd9660read(struct knote *kn, long hint)
return (1);
}
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}
diff --git sys/miscfs/deadfs/dead_vnops.c sys/miscfs/deadfs/dead_vnops.c
index 606960c726d..c5b9bcae1cf 100644
--- sys/miscfs/deadfs/dead_vnops.c
+++ sys/miscfs/deadfs/dead_vnops.c
@@ -34,6 +34,7 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/event.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/lock.h>
@@ -52,6 +53,7 @@ int dead_read(void *);
int dead_write(void *);
int dead_ioctl(void *);
int dead_poll(void *);
+int dead_kqfilter(void *v);
int dead_inactive(void *);
int dead_lock(void *);
int dead_bmap(void *);
@@ -73,6 +75,7 @@ const struct vops dead_vops = {
.vop_write = dead_write,
.vop_ioctl = dead_ioctl,
.vop_poll = dead_poll,
+ .vop_kqfilter = dead_kqfilter,
.vop_fsync = nullop,
.vop_remove = dead_badop,
.vop_link = dead_badop,
@@ -167,6 +170,45 @@ dead_poll(void *v)
return (POLLHUP);
}
+void
+filt_deaddetach(struct knote *kn)
+{
+}
+
+int
+filt_deadrw(struct knote *kn, long hint)
+{
+ /*
+ * Let the user find out that the descriptor is gone.
+ */
+ kn->kn_flags |= (EV_EOF | EV_HUP);
+ kn->kn_data = 0;
+ return (1);
+}
+
+const struct filterops deadrw_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_deaddetach,
+ .f_event = filt_deadrw,
+};
+
+int
+dead_kqfilter(void *v)
+{
+ struct vop_kqfilter_args *ap = v;
+
+ switch (ap->a_kn->kn_filter) {
+ case EVFILT_READ:
+ case EVFILT_WRITE:
+ ap->a_kn->kn_fop = &deadrw_filtops;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
/*
* Just call the device strategy routine
*/
diff --git sys/miscfs/fifofs/fifo_vnops.c sys/miscfs/fifofs/fifo_vnops.c
index 07ede492f47..21268799c5e 100644
--- sys/miscfs/fifofs/fifo_vnops.c
+++ sys/miscfs/fifofs/fifo_vnops.c
@@ -559,6 +559,8 @@ filt_fiforead(struct knote *kn, long hint)
kn->kn_data = so->so_rcv.sb_cc;
if (so->so_state & SS_CANTRCVMORE) {
kn->kn_flags |= EV_EOF;
+ if (so->so_state & SS_ISDISCONNECTED)
+ kn->kn_flags |= EV_HUP;
rv = 1;
} else {
kn->kn_flags &= ~EV_EOF;
diff --git sys/miscfs/fuse/fuse_vnops.c sys/miscfs/fuse/fuse_vnops.c
index 6438cecd1c9..9b941d97cf5 100644
--- sys/miscfs/fuse/fuse_vnops.c
+++ sys/miscfs/fuse/fuse_vnops.c
@@ -188,6 +188,9 @@ filt_fusefsread(struct knote *kn, long hint)
return (1);
}
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}
diff --git sys/msdosfs/msdosfs_vnops.c sys/msdosfs/msdosfs_vnops.c
index fc46268052e..60a3622d8c0 100644
--- sys/msdosfs/msdosfs_vnops.c
+++ sys/msdosfs/msdosfs_vnops.c
@@ -2013,6 +2013,10 @@ filt_msdosfsread(struct knote *kn, long hint)
kn->kn_fflags |= NOTE_EOF;
return (1);
}
+
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}
diff --git sys/net/if_pppx.c sys/net/if_pppx.c
index 4dcbe1d5695..b0d0663d5dc 100644
--- sys/net/if_pppx.c
+++ sys/net/if_pppx.c
@@ -1215,20 +1215,20 @@ static void filt_pppac_rdetach(struct knote *);
static int filt_pppac_read(struct knote *, long);
static const struct filterops pppac_rd_filtops = {
- 1,
- NULL,
- filt_pppac_rdetach,
- filt_pppac_read
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_pppac_rdetach,
+ .f_event = filt_pppac_read
};
static void filt_pppac_wdetach(struct knote *);
static int filt_pppac_write(struct knote *, long);
static const struct filterops pppac_wr_filtops = {
- 1,
- NULL,
- filt_pppac_wdetach,
- filt_pppac_write
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_pppac_wdetach,
+ .f_event = filt_pppac_write
};
static struct pppac_list pppac_devs = LIST_HEAD_INITIALIZER(pppac_devs);
diff --git sys/nfs/nfs_kq.c sys/nfs/nfs_kq.c
index f43211e76c3..b07ff318bcf 100644
--- sys/nfs/nfs_kq.c
+++ sys/nfs/nfs_kq.c
@@ -32,183 +32,26 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/proc.h>
#include <sys/mount.h>
-#include <sys/malloc.h>
#include <sys/vnode.h>
-#include <sys/unistd.h>
#include <sys/file.h>
-#include <sys/kthread.h>
-#include <sys/rwlock.h>
-#include <sys/queue.h>
-#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>
-void nfs_kqpoll(void *);
-
void filt_nfsdetach(struct knote *);
int filt_nfsread(struct knote *, long);
+int filt_nfswrite(struct knote *, long);
int filt_nfsvnode(struct knote *, long);
-struct kevq {
- SLIST_ENTRY(kevq) kev_link;
- struct vnode *vp;
- u_int usecount;
- u_int flags;
-#define KEVQ_BUSY 0x01 /* currently being processed */
-#define KEVQ_WANT 0x02 /* want to change this entry */
- struct timespec omtime; /* old modification time */
- struct timespec octime; /* old change time */
- nlink_t onlink; /* old number of references to file */
-};
-SLIST_HEAD(kevqlist, kevq);
-
-struct rwlock nfskevq_lock = RWLOCK_INITIALIZER("nfskqlk");
-struct proc *pnfskq;
-struct kevqlist kevlist = SLIST_HEAD_INITIALIZER(kevlist);
-
-/*
- * This quite simplistic routine periodically checks for server changes
- * of any of the watched files every NFS_MINATTRTIMO/2 seconds.
- * Only changes in size, modification time, change time and nlinks
- * are being checked, everything else is ignored.
- * The routine only calls VOP_GETATTR() when it's likely it would get
- * some new data, i.e. when the vnode expires from attrcache. This
- * should give same result as periodically running stat(2) from userland,
- * while keeping CPU/network usage low, and still provide proper kevent
- * semantics.
- * The poller thread is created when first vnode is added to watch list,
- * and exits when the watch list is empty. The overhead of thread creation
- * isn't really important, neither speed of attach and detach of knote.
- */
-/* ARGSUSED */
-void
-nfs_kqpoll(void *arg)
-{
- struct kevq *ke;
- struct vattr attr;
- struct proc *p = pnfskq;
- u_quad_t osize;
- int error;
-
- for(;;) {
- rw_enter_write(&nfskevq_lock);
- SLIST_FOREACH(ke, &kevlist, kev_link) {
- struct nfsnode *np = VTONFS(ke->vp);
-
-#ifdef DEBUG
- printf("nfs_kqpoll on: ");
- VOP_PRINT(ke->vp);
-#endif
- /* skip if still in attrcache */
- if (nfs_getattrcache(ke->vp, &attr) != ENOENT)
- continue;
-
- /*
- * Mark entry busy, release lock and check
- * for changes.
- */
- ke->flags |= KEVQ_BUSY;
- rw_exit_write(&nfskevq_lock);
-
- /* save v_size, nfs_getattr() updates it */
- osize = np->n_size;
-
- error = VOP_GETATTR(ke->vp, &attr, p->p_ucred, p);
- if (error == ESTALE) {
- NFS_INVALIDATE_ATTRCACHE(np);
- VN_KNOTE(ke->vp, NOTE_DELETE);
- goto next;
- }
-
- /* following is a bit fragile, but about best
- * we can get */
- if (attr.va_size != osize) {
- int flags = NOTE_WRITE;
-
- if (attr.va_size > osize)
- flags |= NOTE_EXTEND;
- else
- flags |= NOTE_TRUNCATE;
-
- VN_KNOTE(ke->vp, flags);
- ke->omtime = attr.va_mtime;
- } else if (attr.va_mtime.tv_sec != ke->omtime.tv_sec
- || attr.va_mtime.tv_nsec != ke->omtime.tv_nsec) {
- VN_KNOTE(ke->vp, NOTE_WRITE);
- ke->omtime = attr.va_mtime;
- }
-
- if (attr.va_ctime.tv_sec != ke->octime.tv_sec
- || attr.va_ctime.tv_nsec != ke->octime.tv_nsec) {
- VN_KNOTE(ke->vp, NOTE_ATTRIB);
- ke->octime = attr.va_ctime;
- }
-
- if (attr.va_nlink != ke->onlink) {
- VN_KNOTE(ke->vp, NOTE_LINK);
- ke->onlink = attr.va_nlink;
- }
-
-next:
- rw_enter_write(&nfskevq_lock);
- ke->flags &= ~KEVQ_BUSY;
- if (ke->flags & KEVQ_WANT) {
- ke->flags &= ~KEVQ_WANT;
- wakeup(ke);
- }
- }
-
- if (SLIST_EMPTY(&kevlist)) {
- /* Nothing more to watch, exit */
- pnfskq = NULL;
- rw_exit_write(&nfskevq_lock);
- kthread_exit(0);
- }
- rw_exit_write(&nfskevq_lock);
-
- /* wait a while before checking for changes again */
- tsleep_nsec(pnfskq, PSOCK, "nfskqpw",
- SEC_TO_NSEC(NFS_MINATTRTIMO) / 2);
- }
-}
-
void
filt_nfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
- struct kevq *ke;
klist_remove(&vp->v_selectinfo.si_note, kn);
-
- /* Remove the vnode from watch list */
- rw_enter_write(&nfskevq_lock);
- SLIST_FOREACH(ke, &kevlist, kev_link) {
- if (ke->vp == vp) {
- while (ke->flags & KEVQ_BUSY) {
- ke->flags |= KEVQ_WANT;
- rw_exit_write(&nfskevq_lock);
- tsleep_nsec(ke, PSOCK, "nfskqdet", INFSLP);
- rw_enter_write(&nfskevq_lock);
- }
-
- if (ke->usecount > 1) {
- /* keep, other kevents need this */
- ke->usecount--;
- } else {
- /* last user, g/c */
- SLIST_REMOVE(&kevlist, ke, kevq, kev_link);
- free(ke, M_KEVENT, sizeof(*ke));
- }
- break;
- }
- }
- rw_exit_write(&nfskevq_lock);
}
int
@@ -227,16 +70,33 @@ filt_nfsread(struct knote *kn, long hint)
}
kn->kn_data = np->n_size - foffset(kn->kn_fp);
-#ifdef DEBUG
- printf("nfsread event. %lld\n", kn->kn_data);
-#endif
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
}
+
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}
+int
+filt_nfswrite(struct knote *kn, long hint)
+{
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE) {
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ return (1);
+ }
+
+ kn->kn_data = 0;
+ return (1);
+}
+
int
filt_nfsvnode(struct knote *kn, long hint)
{
@@ -256,6 +116,13 @@ static const struct filterops nfsread_filtops = {
.f_event = filt_nfsread,
};
+const struct filterops nfswrite_filtops = {
+ .f_flags = FILTEROP_ISFD,
+ .f_attach = NULL,
+ .f_detach = filt_nfsdetach,
+ .f_event = filt_nfswrite,
+};
+
static const struct filterops nfsvnode_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
@@ -267,25 +134,16 @@ int
nfs_kqfilter(void *v)
{
struct vop_kqfilter_args *ap = v;
- struct vnode *vp;
- struct knote *kn;
- struct kevq *ke;
- int error = 0;
- struct vattr attr;
- struct proc *p = curproc; /* XXX */
-
- vp = ap->a_vp;
- kn = ap->a_kn;
-
-#ifdef DEBUG
- printf("nfs_kqfilter(%d) on: ", kn->kn_filter);
- VOP_PRINT(vp);
-#endif
+ struct vnode *vp = ap->a_vp;
+ struct knote *kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &nfsread_filtops;
break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &nfswrite_filtops;
+ break;
case EVFILT_VNODE:
kn->kn_fop = &nfsvnode_filtops;
break;
@@ -295,53 +153,7 @@ nfs_kqfilter(void *v)
kn->kn_hook = vp;
- /*
- * Put the vnode to watched list.
- */
-
- /*
- * Fetch current attributes. It's only needed when the vnode
- * is not watched yet, but we need to do this without lock
- * held. This is likely cheap due to attrcache, so do it now.
- */
- memset(&attr, 0, sizeof(attr));
- (void) VOP_GETATTR(vp, &attr, p->p_ucred, p);
-
- rw_enter_write(&nfskevq_lock);
-
- /* ensure the poller is running */
- if (!pnfskq) {
- error = kthread_create(nfs_kqpoll, NULL, &pnfskq,
- "nfskqpoll");
- if (error)
- goto out;
- }
-
- SLIST_FOREACH(ke, &kevlist, kev_link)
- if (ke->vp == vp)
- break;
-
- if (ke) {
- /* already watched, so just bump usecount */
- ke->usecount++;
- } else {
- /* need a new one */
- ke = malloc(sizeof(*ke), M_KEVENT, M_WAITOK);
- ke->vp = vp;
- ke->usecount = 1;
- ke->flags = 0;
- ke->omtime = attr.va_mtime;
- ke->octime = attr.va_ctime;
- ke->onlink = attr.va_nlink;
- SLIST_INSERT_HEAD(&kevlist, ke, kev_link);
- }
-
- /* kick the poller */
- wakeup(pnfskq);
-
klist_insert(&vp->v_selectinfo.si_note, kn);
-out:
- rw_exit_write(&nfskevq_lock);
- return (error);
+ return (0);
}
diff --git sys/tmpfs/tmpfs_vnops.c sys/tmpfs/tmpfs_vnops.c
index c7a40d10333..aae8499b3f1 100644
--- sys/tmpfs/tmpfs_vnops.c
+++ sys/tmpfs/tmpfs_vnops.c
@@ -2665,6 +2665,9 @@ filt_tmpfsread(struct knote *kn, long hint)
return (1);
}
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}
diff --git sys/ufs/mfs/mfs_vnops.c sys/ufs/mfs/mfs_vnops.c
index f085f1874ca..d495d2fdf9c 100644
--- sys/ufs/mfs/mfs_vnops.c
+++ sys/ufs/mfs/mfs_vnops.c
@@ -60,6 +60,7 @@ const struct vops mfs_vops = {
.vop_write = mfs_badop,
.vop_ioctl = mfs_ioctl,
.vop_poll = mfs_badop,
+ .vop_kqfilter = mfs_badop,
.vop_revoke = mfs_revoke,
.vop_fsync = spec_fsync,
.vop_remove = mfs_badop,
diff --git sys/ufs/ufs/ufs_vnops.c sys/ufs/ufs/ufs_vnops.c
index a651fa9f065..7c6d4eecc07 100644
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -1973,6 +1973,9 @@ filt_ufsread(struct knote *kn, long hint)
return (1);
}
+ if (kn->kn_sfflags & NOTE_IMM)
+ return (1);
+
return (kn->kn_data != 0);
}