Module Name:	src
Committed By:	ad
Date:		Fri Jan 24 16:05:23 UTC 2020
Modified Files:
	src/sys/kern [ad-namecache]: vfs_subr.c vfs_vnode.c
	src/sys/miscfs/genfs [ad-namecache]: genfs_vnops.c
	src/sys/sys [ad-namecache]: vnode.h vnode_impl.h

Log Message:
vnodes:

- Have their own v_usecount again; don't share the uvm_object's refcount.
- Cluster the members of vnode_t and vnode_impl_t in a cache-conscious way.
- Go back to having vi_lock directly in vnode_impl_t.
- Go back to having v_usecount adjusted with atomics.
- Start adjusting v_holdcnt with atomics, too.
- Put all the namecache stuff back into vnode_impl_t.


To generate a diff of this commit:
cvs rdiff -u -r1.478.2.1 -r1.478.2.2 src/sys/kern/vfs_subr.c
cvs rdiff -u -r1.105.2.4 -r1.105.2.5 src/sys/kern/vfs_vnode.c
cvs rdiff -u -r1.200.2.2 -r1.200.2.3 src/sys/miscfs/genfs/genfs_vnops.c
cvs rdiff -u -r1.286 -r1.286.2.1 src/sys/sys/vnode.h
cvs rdiff -u -r1.19.2.4 -r1.19.2.5 src/sys/sys/vnode_impl.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
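The v_usecount changes in the diff below all follow one protocol, spelled out
in the new comment added to vfs_vnode.c: a change between two non-zero values
may be done lock-free with compare-and-swap, while any transition to or from
zero must happen with v_interlock held.  What follows is a minimal user-space
sketch of the release side of that protocol, using C11 atomics and a pthread
mutex in place of the kernel's atomic_cas_uint(9) and v_interlock; the names
(struct obj, obj_tryrele, obj_rele) are illustrative only, not NetBSD API:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		atomic_uint	refcnt;		/* plays the role of v_usecount */
		pthread_mutex_t	interlock;	/* plays the role of v_interlock */
	};

	/*
	 * Drop a reference with a CAS loop.  Fails only when this would
	 * be the 1 -> 0 transition, which needs the interlock.
	 */
	static bool
	obj_tryrele(struct obj *o)
	{
		unsigned use = atomic_load(&o->refcnt);

		while (use > 1) {
			/* On CAS failure, 'use' is reloaded and we retry. */
			if (atomic_compare_exchange_weak(&o->refcnt, &use,
			    use - 1))
				return true;
		}
		return false;
	}

	static void
	obj_rele(struct obj *o)
	{
		if (obj_tryrele(o))
			return;

		/*
		 * Possibly the last reference.  Take the lock and decrement;
		 * another thread may have gained a reference in the meantime,
		 * in which case the old count is > 1 and nothing more happens.
		 */
		pthread_mutex_lock(&o->interlock);
		if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
			/* Last reference: here vrelel() would reclaim. */
		}
		pthread_mutex_unlock(&o->interlock);
	}

vtryrele() in the diff is exactly this shape, and vput() adds one more
wrinkle: it peeks at v_usecount before deciding whether it may unlock the
vnode first, accepts that the unlocked check can be stale, and lets vrelel()
repair the rare misprediction.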
Modified files:

Index: src/sys/kern/vfs_subr.c
diff -u src/sys/kern/vfs_subr.c:1.478.2.1 src/sys/kern/vfs_subr.c:1.478.2.2
--- src/sys/kern/vfs_subr.c:1.478.2.1	Fri Jan 17 21:47:35 2020
+++ src/sys/kern/vfs_subr.c	Fri Jan 24 16:05:22 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_subr.c,v 1.478.2.1 2020/01/17 21:47:35 ad Exp $ */
+/* $NetBSD: vfs_subr.c,v 1.478.2.2 2020/01/24 16:05:22 ad Exp $ */
 
 /*-
  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.478.2.1 2020/01/17 21:47:35 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.478.2.2 2020/01/24 16:05:22 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ddb.h"
@@ -1111,7 +1111,7 @@ vprint_common(struct vnode *vp, const ch
 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
 	(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
 	    prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
-	(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, vip->vi_lock);
+	(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
 
 	(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
 	    vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
@@ -1544,7 +1544,7 @@ vfs_vnode_lock_print(void *vlock, int fu
 
 	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
 		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
-			if (vip->vi_lock == vlock ||
+			if (&vip->vi_lock == vlock ||
 			    VIMPL_TO_VNODE(vip)->v_interlock == vlock)
 				vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
 		}

Index: src/sys/kern/vfs_vnode.c
diff -u src/sys/kern/vfs_vnode.c:1.105.2.4 src/sys/kern/vfs_vnode.c:1.105.2.5
--- src/sys/kern/vfs_vnode.c:1.105.2.4	Thu Jan 23 19:28:39 2020
+++ src/sys/kern/vfs_vnode.c	Fri Jan 24 16:05:22 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_vnode.c,v 1.105.2.4 2020/01/23 19:28:39 ad Exp $ */
+/* $NetBSD: vfs_vnode.c,v 1.105.2.5 2020/01/24 16:05:22 ad Exp $ */
 
 /*-
  * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc.
@@ -142,10 +142,19 @@
  *	as vput(9), routines.  Common points holding references are e.g.
  *	file openings, current working directory, mount points, etc.
  *
+ * Note on v_usecount & v_holdcnt and their locking
+ *
+ *	At nearly all points it is known that the counts could be zero,
+ *	the vnode_t::v_interlock will be held.  To change the counts away
+ *	from zero, the interlock must be held.  To change from a non-zero
+ *	value to zero, again the interlock must be held.
+ *
+ *	Changing the usecount from a non-zero value to a non-zero value can
+ *	safely be done using atomic operations, without the interlock held.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.105.2.4 2020/01/23 19:28:39 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.105.2.5 2020/01/24 16:05:22 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -669,6 +678,27 @@ vdrain_thread(void *cookie)
 }
 
 /*
+ * Try to drop reference on a vnode.  Abort if we are releasing the
+ * last reference.  Note: this _must_ succeed if not the last reference.
+ */
+static bool
+vtryrele(vnode_t *vp)
+{
+	u_int use, next;
+
+	for (use = vp->v_usecount;; use = next) {
+		if (__predict_false(use == 1)) {
+			return false;
+		}
+		KASSERT(use > 1);
+		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
+		if (__predict_true(next == use)) {
+			return true;
+		}
+	}
+}
+
+/*
  * vput: unlock and release the reference.
  */
 void
@@ -676,7 +706,20 @@ vput(vnode_t *vp)
 {
 	int lktype;
 
-	if ((vp->v_vflag & VV_LOCKSWORK) == 0) {
+	/*
+	 * Do an unlocked check of v_usecount.  If it looks like we're not
+	 * about to drop the last reference, then unlock the vnode and try
+	 * to drop the reference.  If it ends up being the last reference
+	 * after all, we dropped the lock when we shouldn't have.  vrelel()
+	 * can fix it all up.  Most of the time this will all go to plan.
+	 */
+	if (vp->v_usecount > 1) {
+		VOP_UNLOCK(vp);
+		if (vtryrele(vp)) {
+			return;
+		}
+		lktype = LK_NONE;
+	} else if ((vp->v_vflag & VV_LOCKSWORK) == 0) {
 		lktype = LK_EXCLUSIVE;
 	} else {
 		lktype = VOP_ISLOCKED(vp);
@@ -708,11 +751,10 @@ vrelel(vnode_t *vp, int flags, int lktyp
 	 * If not the last reference, just drop the reference count
 	 * and unlock.
 	 */
-	if (vp->v_usecount > 1) {
+	if (vtryrele(vp)) {
 		if (lktype != LK_NONE) {
 			VOP_UNLOCK(vp);
 		}
-		vp->v_usecount--;
 		mutex_exit(vp->v_interlock);
 		return;
 	}
@@ -792,8 +834,7 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		mutex_enter(vp->v_interlock);
 		if (!recycle) {
 			VOP_UNLOCK(vp);
-			if (vp->v_usecount > 1) {
-				vp->v_usecount--;
+			if (vtryrele(vp)) {
 				mutex_exit(vp->v_interlock);
 				return;
 			}
@@ -820,8 +861,7 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		KASSERT(vp->v_usecount > 0);
 	}
 
-	vp->v_usecount--;
-	if (vp->v_usecount != 0) {
+	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
 		/* Gained another reference while being reclaimed. */
 		mutex_exit(vp->v_interlock);
 		return;
@@ -848,6 +888,9 @@ void
 vrele(vnode_t *vp)
 {
 
+	if (vtryrele(vp)) {
+		return;
+	}
 	mutex_enter(vp->v_interlock);
 	vrelel(vp, 0, LK_NONE);
 }
@@ -859,6 +902,9 @@ void
 vrele_async(vnode_t *vp)
 {
 
+	if (vtryrele(vp)) {
+		return;
+	}
 	mutex_enter(vp->v_interlock);
 	vrelel(vp, VRELEL_ASYNC, LK_NONE);
 }
@@ -873,9 +919,7 @@ vref(vnode_t *vp)
 
 	KASSERT(vp->v_usecount != 0);
 
-	mutex_enter(vp->v_interlock);
-	vp->v_usecount++;
-	mutex_exit(vp->v_interlock);
+	atomic_inc_uint(&vp->v_usecount);
 }
 
 /*
@@ -888,11 +932,34 @@ vholdl(vnode_t *vp)
 
 	KASSERT(mutex_owned(vp->v_interlock));
 
-	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0)
+	if (atomic_inc_uint_nv(&vp->v_holdcnt) == 1 && vp->v_usecount == 0)
 		lru_requeue(vp, lru_which(vp));
 }
 
 /*
+ * Page or buffer structure gets a reference.
+ */
+void
+vhold(vnode_t *vp)
+{
+	int hold, next;
+
+	for (hold = vp->v_holdcnt;; hold = next) {
+		if (__predict_false(hold == 0)) {
+			break;
+		}
+		next = atomic_cas_uint(&vp->v_holdcnt, hold, hold + 1);
+		if (__predict_true(next == hold)) {
+			return;
+		}
+	}
+
+	mutex_enter(vp->v_interlock);
+	vholdl(vp);
+	mutex_exit(vp->v_interlock);
+}
+
+/*
  * Page or buffer structure frees a reference.
  * Called with v_interlock held.
  */
@@ -906,12 +973,35 @@ holdrelel(vnode_t *vp)
 		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
 	}
 
-	vp->v_holdcnt--;
-	if (vp->v_holdcnt == 0 && vp->v_usecount == 0)
+	if (atomic_dec_uint_nv(&vp->v_holdcnt) == 0 && vp->v_usecount == 0)
 		lru_requeue(vp, lru_which(vp));
 }
 
 /*
+ * Page or buffer structure frees a reference.
+ */
+void
+holdrele(vnode_t *vp)
+{
+	int hold, next;
+
+	for (hold = vp->v_holdcnt;; hold = next) {
+		if (__predict_false(hold == 1)) {
+			break;
+		}
+		KASSERT(hold > 1);
+		next = atomic_cas_uint(&vp->v_holdcnt, hold, hold - 1);
+		if (__predict_true(next == hold)) {
+			return;
+		}
+	}
+
+	mutex_enter(vp->v_interlock);
+	holdrelel(vp);
+	mutex_exit(vp->v_interlock);
+}
+
+/*
  * Recycle an unused vnode if caller holds the last reference.
  */
 bool
@@ -1013,7 +1103,7 @@ vrevoke(vnode_t *vp)
 	if (VSTATE_GET(vp) == VS_RECLAIMED) {
 		mutex_exit(vp->v_interlock);
 	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
-		vp->v_usecount++;
+		atomic_inc_uint(&vp->v_usecount);
 		mutex_exit(vp->v_interlock);
 		vgone(vp);
 	} else {
@@ -1068,8 +1158,8 @@ static void
 vcache_init(void)
 {
 
-	vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
-	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
+	vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
+	    0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
 	KASSERT(vcache_pool != NULL);
 	mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
 	cv_init(&vcache_cv, "vcache");
@@ -1139,7 +1229,7 @@ vcache_alloc(void)
 
 	vip = pool_cache_get(vcache_pool, PR_WAITOK);
 	memset(vip, 0, sizeof(*vip));
-	vip->vi_lock = rw_obj_alloc();
+	rw_init(&vip->vi_lock);
 	vp = VIMPL_TO_VNODE(vip);
 
 	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
@@ -1201,7 +1291,7 @@ vcache_free(vnode_impl_t *vip)
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		spec_node_destroy(vp);
-	rw_obj_free(vip->vi_lock);
+	rw_destroy(&vip->vi_lock);
 	uvm_obj_destroy(&vp->v_uobj, true);
 	cv_destroy(&vp->v_cv);
 	cache_vnode_fini(vp);
@@ -1226,8 +1316,10 @@ vcache_tryvget(vnode_t *vp)
 		error = ENOENT;
 	else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
 		error = EBUSY;
+	else if (vp->v_usecount == 0)
+		vp->v_usecount = 1;
 	else
-		vp->v_usecount++;
+		atomic_inc_uint(&vp->v_usecount);
 
 	mutex_exit(vp->v_interlock);
 
@@ -1261,7 +1353,10 @@ vcache_vget(vnode_t *vp)
 			return ENOENT;
 	}
 	VSTATE_ASSERT(vp, VS_LOADED);
-	vp->v_usecount++;
+	if (vp->v_usecount == 0)
+		vp->v_usecount = 1;
+	else
+		atomic_inc_uint(&vp->v_usecount);
 	mutex_exit(vp->v_interlock);
 
 	return 0;

Index: src/sys/miscfs/genfs/genfs_vnops.c
diff -u src/sys/miscfs/genfs/genfs_vnops.c:1.200.2.2 src/sys/miscfs/genfs/genfs_vnops.c:1.200.2.3
--- src/sys/miscfs/genfs/genfs_vnops.c:1.200.2.2	Wed Jan 22 12:00:18 2020
+++ src/sys/miscfs/genfs/genfs_vnops.c	Fri Jan 24 16:05:22 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: genfs_vnops.c,v 1.200.2.2 2020/01/22 12:00:18 ad Exp $ */
+/* $NetBSD: genfs_vnops.c,v 1.200.2.3 2020/01/24 16:05:22 ad Exp $ */
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -57,7 +57,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.200.2.2 2020/01/22 12:00:18 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.200.2.3 2020/01/24 16:05:22 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -293,19 +293,19 @@ genfs_deadlock(void *v)
 		return ENOENT;
 
 	if (ISSET(flags, LK_DOWNGRADE)) {
-		rw_downgrade(vip->vi_lock);
+		rw_downgrade(&vip->vi_lock);
 	} else if (ISSET(flags, LK_UPGRADE)) {
 		KASSERT(ISSET(flags, LK_NOWAIT));
-		if (!rw_tryupgrade(vip->vi_lock)) {
+		if (!rw_tryupgrade(&vip->vi_lock)) {
 			return EBUSY;
 		}
 	} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
 		op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
 		if (ISSET(flags, LK_NOWAIT)) {
-			if (!rw_tryenter(vip->vi_lock, op))
+			if (!rw_tryenter(&vip->vi_lock, op))
 				return EBUSY;
 		} else {
-			rw_enter(vip->vi_lock, op);
+			rw_enter(&vip->vi_lock, op);
 		}
 	}
 	VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED);
@@ -324,7 +324,7 @@ genfs_deadunlock(void *v)
 	vnode_t *vp = ap->a_vp;
 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 
-	rw_exit(vip->vi_lock);
+	rw_exit(&vip->vi_lock);
 
 	return 0;
 }
@@ -345,19 +345,19 @@ genfs_lock(void *v)
 	krw_t op;
 
 	if (ISSET(flags, LK_DOWNGRADE)) {
-		rw_downgrade(vip->vi_lock);
+		rw_downgrade(&vip->vi_lock);
 	} else if (ISSET(flags, LK_UPGRADE)) {
 		KASSERT(ISSET(flags, LK_NOWAIT));
-		if (!rw_tryupgrade(vip->vi_lock)) {
+		if (!rw_tryupgrade(&vip->vi_lock)) {
 			return EBUSY;
 		}
 	} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
 		op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
 		if (ISSET(flags, LK_NOWAIT)) {
-			if (!rw_tryenter(vip->vi_lock, op))
+			if (!rw_tryenter(&vip->vi_lock, op))
 				return EBUSY;
 		} else {
-			rw_enter(vip->vi_lock, op);
+			rw_enter(&vip->vi_lock, op);
 		}
 	}
 	VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
@@ -376,7 +376,7 @@ genfs_unlock(void *v)
 	vnode_t *vp = ap->a_vp;
 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 
-	rw_exit(vip->vi_lock);
+	rw_exit(&vip->vi_lock);
 
 	return 0;
 }
@@ -393,10 +393,10 @@ genfs_islocked(void *v)
 	vnode_t *vp = ap->a_vp;
 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 
-	if (rw_write_held(vip->vi_lock))
+	if (rw_write_held(&vip->vi_lock))
 		return LK_EXCLUSIVE;
 
-	if (rw_read_held(vip->vi_lock))
+	if (rw_read_held(&vip->vi_lock))
 		return LK_SHARED;
 
 	return 0;

Index: src/sys/sys/vnode.h
diff -u src/sys/sys/vnode.h:1.286 src/sys/sys/vnode.h:1.286.2.1
--- src/sys/sys/vnode.h:1.286	Sun Dec 22 19:47:34 2019
+++ src/sys/sys/vnode.h	Fri Jan 24 16:05:23 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: vnode.h,v 1.286 2019/12/22 19:47:34 ad Exp $ */
+/* $NetBSD: vnode.h,v 1.286.2.1 2020/01/24 16:05:23 ad Exp $ */
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -134,20 +134,35 @@ LIST_HEAD(buflists, buf);
  * it from v_data.
  */
 struct vnode {
+	/*
+	 * VM system related items.
+	 */
 	struct uvm_object v_uobj;		/* i: the VM object */
-	kcondvar_t	v_cv;			/* i: synchronization */
 	voff_t		v_size;			/* i: size of file */
 	voff_t		v_writesize;		/* i: new size after write */
+
+	/*
+	 * Unstable items get their own cache line.
+	 * On _LP64 this fills the space nicely.
+	 */
+	kcondvar_t	v_cv			/* i: synchronization */
+	    __aligned(COHERENCY_UNIT);
 	int		v_iflag;		/* i: VI_* flags */
-	int		v_vflag;		/* v: VV_* flags */
 	int		v_uflag;		/* u: VU_* flags */
 	int		v_numoutput;		/* i: # of pending writes */
 	int		v_writecount;		/* i: ref count of writers */
+	int		v_usecount;		/* i: use count */
 	int		v_holdcnt;		/* i: page & buffer refs */
-	struct mount	*v_mount;		/* v: ptr to vfs we are in */
-	int		(**v_op)(void *);	/* :: vnode operations vector */
 	struct buflists	v_cleanblkhd;		/* x: clean blocklist head */
 	struct buflists	v_dirtyblkhd;		/* x: dirty blocklist head */
+
+	/*
+	 * The remaining items are largely stable.
+	 */
+	int		v_vflag			/* v: VV_* flags */
+	    __aligned(COHERENCY_UNIT);
+	struct mount	*v_mount;		/* v: ptr to vfs we are in */
+	int		(**v_op)(void *);	/* :: vnode operations vector */
 	union {
 		struct mount	*vu_mountedhere;/* v: ptr to vfs (VDIR) */
 		struct socket	*vu_socket;	/* v: unix ipc (VSOCK) */
@@ -160,7 +175,6 @@ struct vnode {
 	void		*v_data;		/* :: private data for fs */
 	struct klist	v_klist;		/* i: notes attached to vnode */
 };
-#define	v_usecount	v_uobj.uo_refs
 #define	v_interlock	v_uobj.vmobjlock
 #define	v_mountedhere	v_un.vu_mountedhere
 #define	v_socket	v_un.vu_socket
@@ -320,36 +334,11 @@ extern const int	vttoif_tab[];
 #define	VDEAD_NOWAIT	0x0001		/* vdead_check: do not sleep */
 
 void	holdrelel(struct vnode *);
+void	holdrele(struct vnode *);
 void	vholdl(struct vnode *);
+void	vhold(struct vnode *);
 void	vref(struct vnode *);
 
-static __inline void holdrele(struct vnode *) __unused;
-static __inline void vhold(struct vnode *) __unused;
-
-/*
- * decrease buf or page ref
- */
-static __inline void
-holdrele(struct vnode *vp)
-{
-
-	mutex_enter(vp->v_interlock);
-	holdrelel(vp);
-	mutex_exit(vp->v_interlock);
-}
-
-/*
- * increase buf or page ref
- */
-static __inline void
-vhold(struct vnode *vp)
-{
-
-	mutex_enter(vp->v_interlock);
-	vholdl(vp);
-	mutex_exit(vp->v_interlock);
-}
-
 #define	NULLVP	((struct vnode *)NULL)
 
 static __inline void

Index: src/sys/sys/vnode_impl.h
diff -u src/sys/sys/vnode_impl.h:1.19.2.4 src/sys/sys/vnode_impl.h:1.19.2.5
--- src/sys/sys/vnode_impl.h:1.19.2.4	Fri Jan 17 22:26:26 2020
+++ src/sys/sys/vnode_impl.h	Fri Jan 24 16:05:23 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: vnode_impl.h,v 1.19.2.4 2020/01/17 22:26:26 ad Exp $ */
+/* $NetBSD: vnode_impl.h,v 1.19.2.5 2020/01/24 16:05:23 ad Exp $ */
 
 /*-
  * Copyright (c) 2016, 2019, 2020 The NetBSD Foundation, Inc.
@@ -57,26 +57,58 @@ struct vcache_key {
  * Reading or writing any of these items requires holding the appropriate
 * lock.  Field markings and the corresponding locks:
  *
+ *	-	stable throughout the life of the vnode
 *	c	vcache_lock
 *	d	vdrain_lock
 *	i	v_interlock
+ *	l	vi_nc_listlock
 *	m	mnt_vnodelock
- *	n	managed by vfs_cache: see above
+ *	n	vi_nc_lock
+ *	n,l	vi_nc_lock + vi_nc_listlock to modify
 *	s	syncer_data_lock
  */
 struct vnode_impl {
 	struct vnode vi_vnode;
-	enum vnode_state vi_state;		/* i: current state */
-	struct vnodelst *vi_lrulisthd;		/* d: current lru list head */
-	TAILQ_ENTRY(vnode_impl) vi_lrulist;	/* d: lru list */
-	struct nchnode *vi_ncache;		/* n: namecache state */
-	int vi_synclist_slot;			/* s: synclist slot index */
-	int vi_lrulisttm;			/* i: time of lru enqueue */
-	TAILQ_ENTRY(vnode_impl) vi_synclist;	/* s: vnodes with dirty bufs */
-	TAILQ_ENTRY(vnode_impl) vi_mntvnodes;	/* m: vnodes for mount point */
-	SLIST_ENTRY(vnode_impl) vi_hash;	/* c: vnode cache list */
-	krwlock_t *vi_lock;			/* -: lock for this vnode */
-	struct vcache_key vi_key;		/* c: vnode cache key */
+
+	/*
+	 * Largely stable data.
+	 */
+	struct vcache_key vi_key;		/* c   vnode cache key */
+
+	/*
+	 * Namecache.  Give it a separate line so activity doesn't impinge
+	 * on the stable stuff.
+	 */
+	rb_tree_t	vi_nc_tree		/* n   namecache tree */
+	    __aligned(COHERENCY_UNIT);
+	TAILQ_HEAD(,namecache) vi_nc_list;	/* l   namecaches (parent) */
+	mode_t		vi_nc_mode;		/* n,l cached mode or VNOVAL */
+	uid_t		vi_nc_uid;		/* n,l cached UID or VNOVAL */
+	gid_t		vi_nc_gid;		/* n,l cached GID or VNOVAL */
+	uint32_t	vi_nc_spare;		/* -   spare (padding) */
+
+	/*
+	 * vnode cache, LRU and syncer.  This all changes with some
+	 * regularity so keep it together.
+	 */
+	struct vnodelst	*vi_lrulisthd		/* d   current lru list head */
+	    __aligned(COHERENCY_UNIT);
+	TAILQ_ENTRY(vnode_impl) vi_lrulist;	/* d   lru list */
+	int		vi_synclist_slot;	/* s   synclist slot index */
+	int		vi_lrulisttm;		/* i   time of lru enqueue */
+	TAILQ_ENTRY(vnode_impl) vi_synclist;	/* s   vnodes with dirty bufs */
+	SLIST_ENTRY(vnode_impl) vi_hash;	/* c   vnode cache list */
+	enum vnode_state vi_state;		/* i   current state */
+
+	/*
+	 * Locks and expensive to access items which can be expected to
+	 * generate a cache miss.
+	 */
+	krwlock_t	vi_lock			/* -   lock for this vnode */
+	    __aligned(COHERENCY_UNIT);
+	krwlock_t	vi_nc_lock;		/* -   lock on node */
+	krwlock_t	vi_nc_listlock;		/* -   lock on nn_list */
+	TAILQ_ENTRY(vnode_impl) vi_mntvnodes;	/* m   vnodes for mount point */
 };
 
 typedef struct vnode_impl vnode_impl_t;
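The new vhold()/holdrele() pair applies the same protocol to v_holdcnt.  The
acquire side is the interesting half: an increment between two non-zero
values is lock-free, but a 0 -> 1 transition must take the interlock, because
that is the point where vholdl() may have to move the vnode between LRU
lists.  A sketch under the same assumptions as the obj_rele() example above
(C11 atomics and a pthread mutex standing in for the kernel primitives;
struct held and held_hold are hypothetical names):

	#include <pthread.h>
	#include <stdatomic.h>

	struct held {
		atomic_uint	holdcnt;	/* plays the role of v_holdcnt */
		pthread_mutex_t	interlock;	/* plays the role of v_interlock */
	};

	static void
	held_hold(struct held *h)
	{
		unsigned hold = atomic_load(&h->holdcnt);

		/* Fast path: non-zero -> non-zero, no lock needed. */
		while (hold != 0) {
			if (atomic_compare_exchange_weak(&h->holdcnt, &hold,
			    hold + 1))
				return;
		}

		/*
		 * The count may be crossing zero.  Take the lock and
		 * re-check after the increment, as vholdl() does, because
		 * only the thread performing the 0 -> 1 transition requeues.
		 */
		pthread_mutex_lock(&h->interlock);
		if (atomic_fetch_add(&h->holdcnt, 1) == 0) {
			/* 0 -> 1 under the lock: lru_requeue() runs here. */
		}
		pthread_mutex_unlock(&h->interlock);
	}

holdrele() in the diff mirrors the release sketch earlier: it backs out of
the CAS loop when the count is 1 and completes the 1 -> 0 transition under
the interlock, so the LRU decision in holdrelel() stays consistent.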
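The structure reshuffling in vnode.h and vnode_impl.h attacks false sharing:
now that the counters are updated with atomics from many CPUs, fields are
grouped by how often they change, and each hot group starts on its own cache
line so that a write to a counter does not invalidate the line holding
read-mostly fields such as v_op and v_mount.  A compressed sketch of the
idiom, assuming a 64-byte line where the kernel writes
__aligned(COHERENCY_UNIT); the struct and field names here are made up for
illustration:

	#include <pthread.h>
	#include <stdatomic.h>

	#define CACHE_LINE	64	/* assumed; NetBSD uses COHERENCY_UNIT */

	struct clustered {
		/* Read-mostly items: cached cleanly by every CPU. */
		int		(**c_op)(void *);
		void		*c_data;

		/* Hot, frequently written items start on a fresh line. */
		_Alignas(CACHE_LINE) atomic_uint c_usecount;
		atomic_uint	c_holdcnt;
		int		c_flags;

		/*
		 * The lock gets yet another line, so contention on it does
		 * not write-invalidate either group above.
		 */
		_Alignas(CACHE_LINE) pthread_mutex_t c_lock;
	};

	/*
	 * Member alignment only helps if the allocator honours it, which is
	 * why vcache_init() above now passes coherency_unit to
	 * pool_cache_init() as the pool's alignment.
	 */
	_Static_assert(sizeof(struct clustered) % CACHE_LINE == 0,
	    "trailing padding keeps arrays of these line-aligned");

The same reasoning puts vi_lock, vi_nc_lock and vi_nc_listlock together on
their own line at the tail of vnode_impl_t, away from both the stable cache
key and the frequently modified LRU and syncer state.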