When it comes to kqueue filters NFS is special. A custom thread is created when the first event is registered. Its purpose is to poll for changes every 2.5sec. This logic has been inherited from NetBSD and is not present in FreeBSD.
Since filt_nfsread() only check `n_size' of a given nfsnode having a different context to emulate stat(2) behavior in kernel is questionable. So the diff below gets rid of this logic. The rationals are KISS, coherency with nfs_poll() and match what other FS kqueue filters do. While here add a missing write filter. Matching the nfs_poll() logic and the write filter are necessary to keep the current behavior of poll(2) & select(2) once they start querying kqfilter handlers. ok? Index: nfs/nfs_kq.c =================================================================== RCS file: /cvs/src/sys/nfs/nfs_kq.c,v retrieving revision 1.30 diff -u -p -r1.30 nfs_kq.c --- nfs/nfs_kq.c 7 Apr 2020 13:27:52 -0000 1.30 +++ nfs/nfs_kq.c 28 May 2020 09:52:52 -0000 @@ -32,183 +32,26 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/proc.h> #include <sys/mount.h> -#include <sys/malloc.h> #include <sys/vnode.h> -#include <sys/unistd.h> #include <sys/file.h> -#include <sys/kthread.h> -#include <sys/rwlock.h> -#include <sys/queue.h> -#include <nfs/rpcv2.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfsnode.h> #include <nfs/nfs_var.h> -void nfs_kqpoll(void *); - void filt_nfsdetach(struct knote *); int filt_nfsread(struct knote *, long); +int filt_nfswrite(struct knote *, long); int filt_nfsvnode(struct knote *, long); -struct kevq { - SLIST_ENTRY(kevq) kev_link; - struct vnode *vp; - u_int usecount; - u_int flags; -#define KEVQ_BUSY 0x01 /* currently being processed */ -#define KEVQ_WANT 0x02 /* want to change this entry */ - struct timespec omtime; /* old modification time */ - struct timespec octime; /* old change time */ - nlink_t onlink; /* old number of references to file */ -}; -SLIST_HEAD(kevqlist, kevq); - -struct rwlock nfskevq_lock = RWLOCK_INITIALIZER("nfskqlk"); -struct proc *pnfskq; -struct kevqlist kevlist = SLIST_HEAD_INITIALIZER(kevlist); - -/* - * This quite simplistic routine periodically checks for server changes - * of any of the watched files every NFS_MINATTRTIMO/2 seconds. - * Only changes in size, modification time, change time and nlinks - * are being checked, everything else is ignored. - * The routine only calls VOP_GETATTR() when it's likely it would get - * some new data, i.e. when the vnode expires from attrcache. This - * should give same result as periodically running stat(2) from userland, - * while keeping CPU/network usage low, and still provide proper kevent - * semantics. - * The poller thread is created when first vnode is added to watch list, - * and exits when the watch list is empty. The overhead of thread creation - * isn't really important, neither speed of attach and detach of knote. - */ -/* ARGSUSED */ -void -nfs_kqpoll(void *arg) -{ - struct kevq *ke; - struct vattr attr; - struct proc *p = pnfskq; - u_quad_t osize; - int error; - - for(;;) { - rw_enter_write(&nfskevq_lock); - SLIST_FOREACH(ke, &kevlist, kev_link) { - struct nfsnode *np = VTONFS(ke->vp); - -#ifdef DEBUG - printf("nfs_kqpoll on: "); - VOP_PRINT(ke->vp); -#endif - /* skip if still in attrcache */ - if (nfs_getattrcache(ke->vp, &attr) != ENOENT) - continue; - - /* - * Mark entry busy, release lock and check - * for changes. - */ - ke->flags |= KEVQ_BUSY; - rw_exit_write(&nfskevq_lock); - - /* save v_size, nfs_getattr() updates it */ - osize = np->n_size; - - error = VOP_GETATTR(ke->vp, &attr, p->p_ucred, p); - if (error == ESTALE) { - NFS_INVALIDATE_ATTRCACHE(np); - VN_KNOTE(ke->vp, NOTE_DELETE); - goto next; - } - - /* following is a bit fragile, but about best - * we can get */ - if (attr.va_size != osize) { - int flags = NOTE_WRITE; - - if (attr.va_size > osize) - flags |= NOTE_EXTEND; - else - flags |= NOTE_TRUNCATE; - - VN_KNOTE(ke->vp, flags); - ke->omtime = attr.va_mtime; - } else if (attr.va_mtime.tv_sec != ke->omtime.tv_sec - || attr.va_mtime.tv_nsec != ke->omtime.tv_nsec) { - VN_KNOTE(ke->vp, NOTE_WRITE); - ke->omtime = attr.va_mtime; - } - - if (attr.va_ctime.tv_sec != ke->octime.tv_sec - || attr.va_ctime.tv_nsec != ke->octime.tv_nsec) { - VN_KNOTE(ke->vp, NOTE_ATTRIB); - ke->octime = attr.va_ctime; - } - - if (attr.va_nlink != ke->onlink) { - VN_KNOTE(ke->vp, NOTE_LINK); - ke->onlink = attr.va_nlink; - } - -next: - rw_enter_write(&nfskevq_lock); - ke->flags &= ~KEVQ_BUSY; - if (ke->flags & KEVQ_WANT) { - ke->flags &= ~KEVQ_WANT; - wakeup(ke); - } - } - - if (SLIST_EMPTY(&kevlist)) { - /* Nothing more to watch, exit */ - pnfskq = NULL; - rw_exit_write(&nfskevq_lock); - kthread_exit(0); - } - rw_exit_write(&nfskevq_lock); - - /* wait a while before checking for changes again */ - tsleep_nsec(pnfskq, PSOCK, "nfskqpw", - SEC_TO_NSEC(NFS_MINATTRTIMO) / 2); - } -} - void filt_nfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; - struct kevq *ke; klist_remove(&vp->v_selectinfo.si_note, kn); - - /* Remove the vnode from watch list */ - rw_enter_write(&nfskevq_lock); - SLIST_FOREACH(ke, &kevlist, kev_link) { - if (ke->vp == vp) { - while (ke->flags & KEVQ_BUSY) { - ke->flags |= KEVQ_WANT; - rw_exit_write(&nfskevq_lock); - tsleep_nsec(ke, PSOCK, "nfskqdet", INFSLP); - rw_enter_write(&nfskevq_lock); - } - - if (ke->usecount > 1) { - /* keep, other kevents need this */ - ke->usecount--; - } else { - /* last user, g/c */ - SLIST_REMOVE(&kevlist, ke, kevq, kev_link); - free(ke, M_KEVENT, sizeof(*ke)); - } - break; - } - } - rw_exit_write(&nfskevq_lock); } int @@ -227,9 +70,6 @@ filt_nfsread(struct knote *kn, long hint } kn->kn_data = np->n_size - foffset(kn->kn_fp); -#ifdef DEBUG - printf("nfsread event. %lld\n", kn->kn_data); -#endif if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); @@ -238,6 +78,22 @@ filt_nfsread(struct knote *kn, long hint } int +filt_nfswrite(struct knote *kn, long hint) +{ + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + if (hint == NOTE_REVOKE) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + kn->kn_data = 0; + return (1); +} + +int filt_nfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) @@ -256,6 +112,13 @@ static const struct filterops nfsread_fi .f_event = filt_nfsread, }; +const struct filterops nfswrite_filtops = { + .f_flags = FILTEROP_ISFD, + .f_attach = NULL, + .f_detach = filt_nfsdetach, + .f_event = filt_nfswrite, +}; + static const struct filterops nfsvnode_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, @@ -267,25 +130,16 @@ int nfs_kqfilter(void *v) { struct vop_kqfilter_args *ap = v; - struct vnode *vp; - struct knote *kn; - struct kevq *ke; - int error = 0; - struct vattr attr; - struct proc *p = curproc; /* XXX */ - - vp = ap->a_vp; - kn = ap->a_kn; - -#ifdef DEBUG - printf("nfs_kqfilter(%d) on: ", kn->kn_filter); - VOP_PRINT(vp); -#endif + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &nfsread_filtops; break; + case EVFILT_WRITE: + kn->kn_fop = &nfswrite_filtops; + break; case EVFILT_VNODE: kn->kn_fop = &nfsvnode_filtops; break; @@ -295,53 +149,7 @@ nfs_kqfilter(void *v) kn->kn_hook = vp; - /* - * Put the vnode to watched list. - */ - - /* - * Fetch current attributes. It's only needed when the vnode - * is not watched yet, but we need to do this without lock - * held. This is likely cheap due to attrcache, so do it now. - */ - memset(&attr, 0, sizeof(attr)); - (void) VOP_GETATTR(vp, &attr, p->p_ucred, p); - - rw_enter_write(&nfskevq_lock); - - /* ensure the poller is running */ - if (!pnfskq) { - error = kthread_create(nfs_kqpoll, NULL, &pnfskq, - "nfskqpoll"); - if (error) - goto out; - } - - SLIST_FOREACH(ke, &kevlist, kev_link) - if (ke->vp == vp) - break; - - if (ke) { - /* already watched, so just bump usecount */ - ke->usecount++; - } else { - /* need a new one */ - ke = malloc(sizeof(*ke), M_KEVENT, M_WAITOK); - ke->vp = vp; - ke->usecount = 1; - ke->flags = 0; - ke->omtime = attr.va_mtime; - ke->octime = attr.va_ctime; - ke->onlink = attr.va_nlink; - SLIST_INSERT_HEAD(&kevlist, ke, kev_link); - } - - /* kick the poller */ - wakeup(pnfskq); - klist_insert(&vp->v_selectinfo.si_note, kn); -out: - rw_exit_write(&nfskevq_lock); - return (error); + return (0); }