When I changed vnode kqueue events to be posted from common code at the VFS
layer, I introduced a regression in sending those events on stacked file
systems like nullfs. The root cause is that when a knote is attached to the
vnode, the attach operation (by way of the VOP bypass mechanism in layerfs)
drills all the way down to the bottom of the vnode stack and attaches the
knote there (as intended), but with event delivery now handled in vnode_if.c,
events are posted against the upper vnode, where no knotes are attached.
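
(For what it's worth, the easiest way I know of to see the regression from
userland is something like the program below; the paths are hypothetical, and
with the bug the final kevent() call never returns even though the file is
being modified through the nullfs mount.)

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct kevent ev;
	int kq, fd;

	/* Hypothetical nullfs mount point layered over another fs. */
	fd = open("/mnt/null/testfile", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Watch for writes to the file via the nullfs vnode. */
	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_WRITE, 0, 0);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	/* Now write to /mnt/null/testfile from another shell. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent wait");
	printf("got NOTE_WRITE\n");
	return 0;
}
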
These stacked vnodes already share a bit of state with the bottom layer
(locks, etc.), so my solution to this problem is to allow these stacked vnodes
to also refer to the bottom vnode's klist. Each vnode always starts out
referring to its own klist, but as a layerfs vnode is constructed, it is told
to refer to the lower vnode's klist (just as it refers to the lower vnode's
interlock and uvm_obj lock). Various assertions are included to ensure that
knotes are never attached to a vnode that refers to another vnode's klist
(they should always be attached to the bottom-most vnode).
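
Concretely, the new helper used in the hunks below, vshareklist(), is just the
klist analogue of vshareilock(). A rough sketch of the idea only; the actual
helper in the full patch has more assertions and commentary than this:

void
vshareklist(vnode_t *vp, vnode_t *fromvp)
{

	/*
	 * Sketch only: redirect vp's klist pointer at fromvp's klist
	 * storage.  The two vnodes are expected to already share an
	 * interlock; the callers below set that up first.
	 */
	KASSERT(vp->v_interlock == fromvp->v_interlock);
	vp->v_klist = fromvp->v_klist;
}
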
(After applying the patch, vnode_if.c must be rebuilt.)
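
With that in place, the delivery side in the common code just follows the
pointer, so a note posted against the upper vnode lands on the klist the
knotes were actually attached to. Roughly (illustrative helper, not the
literal vnode_if.c change):

static inline void
vn_post_kevent(struct vnode *vp, long hint)
{

	/*
	 * v_klist normally points at the vnode's own storage; for a
	 * layered vnode it points at the bottom vnode's, so the knotes
	 * attached at the bottom are found.
	 */
	KNOTE(&vp->v_klist->vk_klist, hint);
}
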
Index: sys/fs/union/union_subr.c
===================================================================
RCS file: /cvsroot/src/sys/fs/union/union_subr.c,v
retrieving revision 1.81
diff -u -p -r1.81 union_subr.c
--- sys/fs/union/union_subr.c 19 Mar 2022 13:53:32 -0000 1.81
+++ sys/fs/union/union_subr.c 12 Jul 2022 00:17:28 -0000
@@ -232,10 +232,11 @@ union_newupper(struct union_node *un, st
unlock_ap.a_desc = VDESC(vop_unlock);
unlock_ap.a_vp = UNIONTOV(un);
genfs_unlock(&unlock_ap);
- /* Update union vnode interlock & vmobjlock. */
+ /* Update union vnode interlock, vmobjlock, & klist. */
vshareilock(UNIONTOV(un), uppervp);
rw_obj_hold(uppervp->v_uobj.vmobjlock);
uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
+ vshareklist(UNIONTOV(un), uppervp);
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
@@ -577,6 +578,7 @@ union_loadvnode(struct mount *mp, struct
vshareilock(vp, svp);
rw_obj_hold(svp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
+ vshareklist(vp, svp);
/* detect the root vnode (and aliases) */
if ((un->un_uppervp == um->um_uppervp) &&
Index: sys/miscfs/genfs/layer_vfsops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/layer_vfsops.c,v
retrieving revision 1.54
diff -u -p -r1.54 layer_vfsops.c
--- sys/miscfs/genfs/layer_vfsops.c 23 Feb 2020 15:46:41 -0000 1.54
+++ sys/miscfs/genfs/layer_vfsops.c 12 Jul 2022 00:17:28 -0000
@@ -205,10 +205,11 @@ layerfs_loadvnode(struct mount *mp, stru
xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
- /* Share the interlock and vmobjlock with the lower node. */
+ /* Share the interlock, vmobjlock, and klist with the lower node. */
vshareilock(vp, lowervp);
rw_obj_hold(lowervp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
+ vshareklist(vp, lowervp);
vp->v_tag = lmp->layerm_tag;
vp->v_type = lowervp->v_type;
Index: sys/kern/vfs_vnode.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnode.c,v
retrieving revision 1.143
diff -u -p -r1.143 vfs_vnode.c
--- sys/kern/vfs_vnode.c 9 Apr 2022 23:45:45 -0000 1.143
+++ sys/kern/vfs_vnode.c 12 Jul 2022 00:17:28 -0000
@@ -457,7 +457,8 @@ vnalloc_marker(struct mount *mp)
vp->v_mount = mp;
vp->v_type = VBAD;
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
- klist_init(&vp->v_klist);
+ klist_init(&vip->vi_klist.vk_klist);
+ vp->v_klist = &vip->vi_klist;
vip->vi_state = VS_MARKER;
return vp;
@@ -475,7 +476,7 @@ vnfree_marker(vnode_t *vp)
KASSERT(vip->vi_state == VS_MARKER);
mutex_obj_free(vp->v_interlock);
uvm_obj_destroy(&vp->v_uobj, true);
- klist_fini(&vp->v_klist);
+ klist_fini(&vip->vi_klist.vk_klist);
pool_cache_put(vcache_pool, vip);
}
@@ -1391,7 +1392,8 @@ vcache_alloc(void)
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
- klist_init(&vp->v_klist);
+ klist_init(&vip->vi_klist.vk_klist);
+ vp->v_klist = &vip->vi_klist;
cv_init(&vp->v_cv, "vnode");
cache_vnode_init(vp);
@@ -1453,7 +1455,9 @@ vcache_free(vnode_impl_t *vip)
mutex_obj_free(vp->v_interlock);
rw_destroy(&vip->vi_lock);
uvm_obj_destroy(&vp->v_uobj, true);
- klist_fini(&vp->v_klist);
+ KASSERT(vp->v_klist == &vip->vi_klist ||
+     SLIST_EMPTY(&vip->vi_klist.vk_klist));
+ klist_fini(&vip->vi_klist.vk_klist);