Module Name:    src
Committed By:   ad
Date:           Tue May 26 18:38:37 UTC 2020

Modified Files:
        src/sys/kern: vfs_cache.c vfs_lookup.c vfs_subr.c vfs_vnode.c

Log Message:
Make vcache_tryvget() lockless.  Reviewed by hannken@.
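
For orientation, here is a minimal userland sketch of the scheme the diffs below introduce, written with C11 atomics and hypothetical names (struct xnode, xnode_tryvget, XUSECOUNT_*): the high bit of the use count acts as a gate that is set only while the vnode is in the LOADED state, so taking an initial reference becomes a plain compare-and-swap loop that needs neither v_interlock nor any other lock.

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

#define	XUSECOUNT_MASK	0x7fffffffu	/* reference count proper */
#define	XUSECOUNT_GATE	0x80000000u	/* set only while "LOADED" */

struct xnode {				/* hypothetical stand-in for vnode_t */
	_Atomic unsigned	usecount;
};

/* Lockless try-get: succeed only while the gate is open, never block. */
static int
xnode_tryvget(struct xnode *xp)
{
	unsigned use;

	use = atomic_load_explicit(&xp->usecount, memory_order_relaxed);
	for (;;) {
		if ((use & XUSECOUNT_GATE) == 0)
			return EBUSY;
		/* The CAS preserves the gate bit and bumps the count in one go. */
		if (atomic_compare_exchange_weak(&xp->usecount, &use, use + 1))
			return 0;
		/* 'use' holds the freshly observed value after a failed CAS; retry. */
	}
}

/* Readers of the count mask off the gate bit, as vrefcnt() now does. */
static unsigned
xnode_refcnt(struct xnode *xp)
{
	return atomic_load_explicit(&xp->usecount, memory_order_relaxed) &
	    XUSECOUNT_MASK;
}

int
main(void)
{
	struct xnode xn = { .usecount = XUSECOUNT_GATE };	/* gate open, no refs */

	printf("tryvget: %d, refcnt: %u\n", xnode_tryvget(&xn), xnode_refcnt(&xn));
	return 0;
}

An EBUSY return simply sends callers to their existing locked slow paths.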


To generate a diff of this commit:
cvs rdiff -u -r1.143 -r1.144 src/sys/kern/vfs_cache.c
cvs rdiff -u -r1.219 -r1.220 src/sys/kern/vfs_lookup.c
cvs rdiff -u -r1.487 -r1.488 src/sys/kern/vfs_subr.c
cvs rdiff -u -r1.122 -r1.123 src/sys/kern/vfs_vnode.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/vfs_cache.c
diff -u src/sys/kern/vfs_cache.c:1.143 src/sys/kern/vfs_cache.c:1.144
--- src/sys/kern/vfs_cache.c:1.143	Sat May 16 18:31:50 2020
+++ src/sys/kern/vfs_cache.c	Tue May 26 18:38:37 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_cache.c,v 1.143 2020/05/16 18:31:50 christos Exp $	*/
+/*	$NetBSD: vfs_cache.c,v 1.144 2020/05/26 18:38:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
@@ -172,7 +172,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.143 2020/05/16 18:31:50 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.144 2020/05/26 18:38:37 ad Exp $");
 
 #define __NAMECACHE_PRIVATE
 #ifdef _KERNEL_OPT
@@ -582,13 +582,8 @@ cache_lookup(struct vnode *dvp, const ch
 		return hit;
 	}
 	vp = ncp->nc_vp;
-	mutex_enter(vp->v_interlock);
-	rw_exit(&dvi->vi_nc_lock);
-
-	/*
-	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
-	 */
 	error = vcache_tryvget(vp);
+	rw_exit(&dvi->vi_nc_lock);
 	if (error) {
 		KASSERT(error == EBUSY);
 		/*
@@ -821,9 +816,8 @@ cache_revlookup(struct vnode *vp, struct
 		}
 
 		dvp = ncp->nc_dvp;
-		mutex_enter(dvp->v_interlock);
-		rw_exit(&vi->vi_nc_listlock);
 		error = vcache_tryvget(dvp);
+		rw_exit(&vi->vi_nc_listlock);
 		if (error) {
 			KASSERT(error == EBUSY);
 			if (bufp)

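The simplification in the two hunks above can be pictured with a hedged userland analogy (pthreads plus the same hypothetical gate scheme as in the earlier sketch): because the try-get never blocks and no longer consumes a per-vnode interlock, the reference can be taken while the namecache rwlock is still held and the rwlock released afterwards, instead of the old "enter the interlock, drop the list lock, then try-get" sequence.

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

#define	XUSECOUNT_GATE	0x80000000u

struct xnode {				/* hypothetical cached object */
	const char		*name;
	_Atomic unsigned	usecount;
};

static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct xnode cache[4];		/* toy "namecache" */

static int
xnode_tryvget(struct xnode *xp)
{
	unsigned use = atomic_load_explicit(&xp->usecount, memory_order_relaxed);

	for (;;) {
		if ((use & XUSECOUNT_GATE) == 0)
			return EBUSY;
		if (atomic_compare_exchange_weak(&xp->usecount, &use, use + 1))
			return 0;
	}
}

/*
 * Lookup pattern after the change: find the entry under the reader lock,
 * take the reference with the non-blocking try-get, then drop the lock.
 */
struct xnode *
cache_lookup(const char *name)
{
	struct xnode *found = NULL;
	size_t i;

	pthread_rwlock_rdlock(&cache_lock);
	for (i = 0; i < sizeof(cache) / sizeof(cache[0]); i++) {
		if (cache[i].name != NULL && strcmp(cache[i].name, name) == 0) {
			if (xnode_tryvget(&cache[i]) == 0)
				found = &cache[i];
			break;
		}
	}
	pthread_rwlock_unlock(&cache_lock);
	return found;		/* NULL on a miss or a closed gate */
}

When the gate is closed the real code still falls back to the locked lookup path, as the KASSERT(error == EBUSY) branches above show.
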
Index: src/sys/kern/vfs_lookup.c
diff -u src/sys/kern/vfs_lookup.c:1.219 src/sys/kern/vfs_lookup.c:1.220
--- src/sys/kern/vfs_lookup.c:1.219	Wed Apr 22 21:35:52 2020
+++ src/sys/kern/vfs_lookup.c	Tue May 26 18:38:37 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_lookup.c,v 1.219 2020/04/22 21:35:52 ad Exp $	*/
+/*	$NetBSD: vfs_lookup.c,v 1.220 2020/05/26 18:38:37 ad Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.219 2020/04/22 21:35:52 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.220 2020/05/26 18:38:37 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_magiclinks.h"
@@ -1354,9 +1354,7 @@ lookup_fastforward(struct namei_state *s
 		    foundobj->v_type != VDIR ||
 		    (foundobj->v_type == VDIR &&
 		    foundobj->v_mountedhere != NULL)) {
-		    	mutex_enter(foundobj->v_interlock);
 			error = vcache_tryvget(foundobj);
-			/* v_interlock now unheld */
 			if (error != 0) {
 				foundobj = NULL;
 				error = EOPNOTSUPP;
@@ -1381,9 +1379,7 @@ lookup_fastforward(struct namei_state *s
 	 * let lookup_once() take care of it.
 	 */
 	if (searchdir != *searchdir_ret) {
-		mutex_enter(searchdir->v_interlock);
 		error2 = vcache_tryvget(searchdir);
-		/* v_interlock now unheld */
 		KASSERT(plock != NULL);
 		rw_exit(plock);
 		if (__predict_true(error2 == 0)) {

Index: src/sys/kern/vfs_subr.c
diff -u src/sys/kern/vfs_subr.c:1.487 src/sys/kern/vfs_subr.c:1.488
--- src/sys/kern/vfs_subr.c:1.487	Sat May 16 18:31:50 2020
+++ src/sys/kern/vfs_subr.c	Tue May 26 18:38:37 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_subr.c,v 1.487 2020/05/16 18:31:50 christos Exp $	*/
+/*	$NetBSD: vfs_subr.c,v 1.488 2020/05/26 18:38:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.487 2020/05/16 18:31:50 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.488 2020/05/26 18:38:37 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ddb.h"
@@ -730,18 +730,15 @@ lazy_sync_vnode(struct vnode *vp)
 	KASSERT(mutex_owned(&syncer_data_lock));
 
 	synced = false;
-	/* We are locking in the wrong direction. */
-	if (mutex_tryenter(vp->v_interlock)) {
+	if (vcache_tryvget(vp) == 0) {
 		mutex_exit(&syncer_data_lock);
-		if (vcache_tryvget(vp) == 0) {
-			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
-				synced = true;
-				(void) VOP_FSYNC(vp, curlwp->l_cred,
-				    FSYNC_LAZY, 0, 0);
-				vput(vp);
-			} else
-				vrele(vp);
-		}
+		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+			synced = true;
+			(void) VOP_FSYNC(vp, curlwp->l_cred,
+			    FSYNC_LAZY, 0, 0);
+			vput(vp);
+		} else
+			vrele(vp);
 		mutex_enter(&syncer_data_lock);
 	}
 	return synced;

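A hedged reading of the lazy_sync_vnode() hunk above, again with hypothetical userland names: previously the interlock could only be taken opportunistically with mutex_tryenter(), since grabbing it while holding syncer_data_lock goes against the usual lock order (the removed comment's "locking in the wrong direction"); with the lockless try-get there is no per-vnode lock to take at all, so the reference is acquired directly and syncer_data_lock is dropped only once the reference is secured.

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define	XUSECOUNT_GATE	0x80000000u

struct xnode {
	_Atomic unsigned	usecount;
};

static pthread_mutex_t syncer_lock = PTHREAD_MUTEX_INITIALIZER;

static int
xnode_tryvget(struct xnode *xp)
{
	unsigned use = atomic_load_explicit(&xp->usecount, memory_order_relaxed);

	for (;;) {
		if ((use & XUSECOUNT_GATE) == 0)
			return EBUSY;
		if (atomic_compare_exchange_weak(&xp->usecount, &use, use + 1))
			return 0;
	}
}

static void
xnode_rele(struct xnode *xp)
{
	atomic_fetch_sub(&xp->usecount, 1);	/* sketch only: no reclaim path */
}

static bool
do_lazy_fsync(struct xnode *xp)
{
	(void)xp;				/* stand-in for VOP_FSYNC() */
	return true;
}

/* syncer_lock is held on entry and on exit, as in lazy_sync_vnode(). */
bool
lazy_sync(struct xnode *xp)
{
	bool synced = false;

	/* No trylock against the lock order: the non-blocking try-get suffices. */
	if (xnode_tryvget(xp) == 0) {
		pthread_mutex_unlock(&syncer_lock);
		synced = do_lazy_fsync(xp);	/* may block; the lock is dropped */
		xnode_rele(xp);
		pthread_mutex_lock(&syncer_lock);
	}
	return synced;
}
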
Index: src/sys/kern/vfs_vnode.c
diff -u src/sys/kern/vfs_vnode.c:1.122 src/sys/kern/vfs_vnode.c:1.123
--- src/sys/kern/vfs_vnode.c:1.122	Mon May 18 08:27:54 2020
+++ src/sys/kern/vfs_vnode.c	Tue May 26 18:38:37 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_vnode.c,v 1.122 2020/05/18 08:27:54 hannken Exp $	*/
+/*	$NetBSD: vfs_vnode.c,v 1.123 2020/05/26 18:38:37 ad Exp $	*/
 
 /*-
  * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
@@ -112,7 +112,7 @@
  *	LOADING -> LOADED
  *			Vnode has been initialised in vcache_get() or
  *			vcache_new() and is ready to use.
- *	LOADED -> RECLAIMING
+ *	BLOCKED -> RECLAIMING
  *			Vnode starts disassociation from underlying file
  *			system in vcache_reclaim().
  *	RECLAIMING -> RECLAIMED
@@ -143,19 +143,12 @@
  *	as vput(9), routines.  Common points holding references are e.g.
  *	file openings, current working directory, mount points, etc.  
  *
- * Note on v_usecount and its locking
- *
- *	At nearly all points it is known that v_usecount could be zero,
- *	the vnode_t::v_interlock will be held.  To change the count away
- *	from zero, the interlock must be held.  To change from a non-zero
- *	value to zero, again the interlock must be held.
- *
- *	Changing the usecount from a non-zero value to a non-zero value can
- *	safely be done using atomic operations, without the interlock held.
+ *	v_usecount is adjusted with atomic operations, however to change
+ *	from a non-zero value to zero the interlock must also be held.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.122 2020/05/18 08:27:54 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.123 2020/05/26 18:38:37 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_pax.h"
@@ -237,13 +230,20 @@ extern int		(**spec_vnodeop_p)(void *);
 extern struct vfsops	dead_vfsops;
 
 /*
+ * The high bit of v_usecount is a gate for vcache_tryvget().  It's set
+ * only when the vnode state is LOADED.
+ */
+#define	VUSECOUNT_MASK	0x7fffffff
+#define	VUSECOUNT_GATE	0x80000000
+
+/*
  * Return the current usecount of a vnode.
  */
 inline int
 vrefcnt(struct vnode *vp)
 {
 
-	return atomic_load_relaxed(&vp->v_usecount);
+	return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
 }
 
 /* Vnode state operations and diagnostics. */
@@ -329,8 +329,8 @@ static void
 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
     const char *func, int line)
 {
+	bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
-	int refcnt = vrefcnt(vp);
 
 	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
 	if (from == VS_LOADING)
@@ -345,10 +345,15 @@ vstate_assert_change(vnode_t *vp, enum v
 	if (vip->vi_state != from)
 		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
 		    vstate_name(vip->vi_state), vstate_name(from), func, line);
-	if ((from == VS_BLOCKED || to == VS_BLOCKED) && refcnt != 1)
-		vnpanic(vp, "%s to %s with usecount %d at %s:%d",
-		    vstate_name(from), vstate_name(to), refcnt,
-		    func, line);
+	if ((from == VS_LOADED) != gated)
+		vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
+		    vstate_name(vip->vi_state), gated, func, line);
+
+	/* Open/close the gate for vcache_tryvget(). */	
+	if (to == VS_LOADED)
+		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
+	else
+		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
 
 	vip->vi_state = to;
 	if (from == VS_LOADING)
@@ -386,6 +391,12 @@ vstate_change(vnode_t *vp, enum vnode_st
 {
 	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
 
+	/* Open/close the gate for vcache_tryvget(). */	
+	if (to == VS_LOADED)
+		atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
+	else
+		atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
+
 	vip->vi_state = to;
 	if (from == VS_LOADING)
 		cv_broadcast(&vcache_cv);
@@ -712,10 +723,10 @@ vtryrele(vnode_t *vp)
 	u_int use, next;
 
 	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
-		if (__predict_false(use == 1)) {
+		if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
 			return false;
 		}
-		KASSERT(use > 1);
+		KASSERT((use & VUSECOUNT_MASK) > 1);
 		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
 		if (__predict_true(next == use)) {
 			return true;
@@ -846,10 +857,8 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		VOP_UNLOCK(vp);
 	} else {
 		/*
-		 * The vnode must not gain another reference while being
-		 * deactivated.  If VOP_INACTIVE() indicates that
-		 * the described file has been deleted, then recycle
-		 * the vnode.
+		 * If VOP_INACTIVE() indicates that the file has been
+		 * deleted, then recycle the vnode.
 		 *
 		 * Note that VOP_INACTIVE() will not drop the vnode lock.
 		 */
@@ -858,12 +867,34 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		VOP_INACTIVE(vp, &recycle);
 		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 		mutex_enter(vp->v_interlock);
-		if (vtryrele(vp)) {
-			VOP_UNLOCK(vp);
-			mutex_exit(vp->v_interlock);
-			rw_exit(vp->v_uobj.vmobjlock);
-			return;
-		}
+
+		for (;;) {
+			/*
+			 * If no longer the last reference, try to shed it. 
+			 * On success, drop the interlock last thereby
+			 * preventing the vnode being freed behind us.
+			 */
+			if (vtryrele(vp)) {
+				VOP_UNLOCK(vp);
+				rw_exit(vp->v_uobj.vmobjlock);
+				mutex_exit(vp->v_interlock);
+				return;
+			}
+			/*
+			 * Block new references then check again to see if a
+			 * new reference was acquired in the meantime.  If
+			 * it was, restore the vnode state and try again.
+			 */
+			if (recycle) {
+				VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
+				if (vrefcnt(vp) != 1) {
+					VSTATE_CHANGE(vp, VS_BLOCKED,
+					    VS_LOADED);
+					continue;
+				}
+			}
+			break;
+ 		}
 
 		/* Take care of space accounting. */
 		if ((vp->v_iflag & VI_EXECMAP) != 0 &&
@@ -880,7 +911,7 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		 * otherwise just free it.
 		 */
 		if (recycle) {
-			VSTATE_ASSERT(vp, VS_LOADED);
+			VSTATE_ASSERT(vp, VS_BLOCKED);
 			/* vcache_reclaim drops the lock. */
 			vcache_reclaim(vp);
 		} else {
@@ -889,7 +920,7 @@ vrelel(vnode_t *vp, int flags, int lktyp
 		KASSERT(vrefcnt(vp) > 0);
 	}
 
-	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
+	if ((atomic_dec_uint_nv(&vp->v_usecount) & VUSECOUNT_MASK) != 0) {
 		/* Gained another reference while being reclaimed. */
 		mutex_exit(vp->v_interlock);
 		return;
@@ -941,7 +972,7 @@ vrele_async(vnode_t *vp)
  * Vnode reference, where a reference is already held by some other
  * object (for example, a file structure).
  *
- * NB: we have lockless code sequences that rely on this not blocking.
+ * NB: lockless code sequences may rely on this not blocking.
  */
 void
 vref(vnode_t *vp)
@@ -1019,14 +1050,8 @@ vrecycle(vnode_t *vp)
 
 	mutex_enter(vp->v_interlock);
 
-	/* Make sure we hold the last reference. */
-	VSTATE_WAIT_STABLE(vp);
-	if (vrefcnt(vp) != 1) {
-		mutex_exit(vp->v_interlock);
-		return false;
-	}
-
 	/* If the vnode is already clean we're done. */
+	VSTATE_WAIT_STABLE(vp);
 	if (VSTATE_GET(vp) != VS_LOADED) {
 		VSTATE_ASSERT(vp, VS_RECLAIMED);
 		vrelel(vp, 0, LK_NONE);
@@ -1035,6 +1060,14 @@ vrecycle(vnode_t *vp)
 
 	/* Prevent further references until the vnode is locked. */
 	VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
+
+	/* Make sure we hold the last reference. */
+	if (vrefcnt(vp) != 1) {
+		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
+		mutex_exit(vp->v_interlock);
+		return false;
+	}
+
 	mutex_exit(vp->v_interlock);
 
 	/*
@@ -1046,9 +1079,8 @@ vrecycle(vnode_t *vp)
 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
 
 	mutex_enter(vp->v_interlock);
-	VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
-
 	if (error) {
+		VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
 		mutex_exit(vp->v_interlock);
 		return false;
 	}
@@ -1143,6 +1175,7 @@ vgone(vnode_t *vp)
 	mutex_enter(vp->v_interlock);
 	VSTATE_WAIT_STABLE(vp);
 	if (VSTATE_GET(vp) == VS_LOADED) { 
+		VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
 		vcache_reclaim(vp);
 		lktype = LK_NONE;
 	}
@@ -1310,30 +1343,24 @@ vcache_free(vnode_impl_t *vip)
 
 /*
  * Try to get an initial reference on this cached vnode.
- * Returns zero on success,  ENOENT if the vnode has been reclaimed and
- * EBUSY if the vnode state is unstable.
+ * Returns zero on success or EBUSY if the vnode state is not LOADED.
  *
- * v_interlock locked on entry and unlocked on exit.
+ * NB: lockless code sequences may rely on this not blocking.
  */
 int
 vcache_tryvget(vnode_t *vp)
 {
-	int error = 0;
-
-	KASSERT(mutex_owned(vp->v_interlock));
-
-	if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED))
-		error = ENOENT;
-	else if (__predict_false(VSTATE_GET(vp) != VS_LOADED))
-		error = EBUSY;
-	else if (vp->v_usecount == 0)
-		vp->v_usecount = 1;
-	else
-		atomic_inc_uint(&vp->v_usecount);
-
-	mutex_exit(vp->v_interlock);
+	u_int use, next;
 
-	return error;
+	for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
+		if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
+			return EBUSY;
+		}
+		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
+		if (__predict_true(next == use)) {
+			return 0;
+		}
+	}
 }
 
 /*
@@ -1363,10 +1390,7 @@ vcache_vget(vnode_t *vp)
 		return ENOENT;
 	}
 	VSTATE_ASSERT(vp, VS_LOADED);
-	if (vp->v_usecount == 0)
-		vp->v_usecount = 1;
-	else
-		atomic_inc_uint(&vp->v_usecount);
+	atomic_inc_uint(&vp->v_usecount);
 	mutex_exit(vp->v_interlock);
 
 	return 0;
@@ -1679,7 +1703,7 @@ vcache_reclaim(vnode_t *vp)
 	 * Prevent the vnode from being recycled or brought into use
 	 * while we clean it out.
 	 */
-	VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING);
+	VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
 	mutex_exit(vp->v_interlock);
 
 	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);

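Finally, a hedged sketch of how the gate follows the vnode state machine in the vfs_vnode.c changes above, with hypothetical userland names: the gate bit is set only on a transition to LOADED and cleared on any other transition, and the "close the gate, then re-check the count" step seen in vrelel() and vrecycle() is what catches a reference that slipped in just before the gate closed.

#include <stdatomic.h>
#include <stdbool.h>

#define	XUSECOUNT_MASK	0x7fffffffu
#define	XUSECOUNT_GATE	0x80000000u

enum xstate { X_LOADED, X_BLOCKED, X_RECLAIMING, X_RECLAIMED };

struct xnode {
	_Atomic unsigned	usecount;
	enum xstate		state;	/* changed under the interlock in the
					   real code; single-threaded here */
};

/* Mirror of vstate_change(): the gate is open only while LOADED. */
static void
xnode_set_state(struct xnode *xp, enum xstate to)
{
	if (to == X_LOADED)
		atomic_fetch_or(&xp->usecount, XUSECOUNT_GATE);
	else
		atomic_fetch_and(&xp->usecount, ~XUSECOUNT_GATE);
	xp->state = to;
}

static unsigned
xnode_refcnt(struct xnode *xp)
{
	return atomic_load_explicit(&xp->usecount, memory_order_relaxed) &
	    XUSECOUNT_MASK;
}

/*
 * Mirror of the new drain step: close the gate, then re-check whether another
 * reference was taken through the still-open gate just before it closed.  If
 * one was, reopen the gate and report failure so the caller can retry or bail.
 */
bool
xnode_try_block_for_reclaim(struct xnode *xp)
{
	xnode_set_state(xp, X_BLOCKED);
	if (xnode_refcnt(xp) != 1) {
		xnode_set_state(xp, X_LOADED);
		return false;
	}
	return true;			/* sole holder; safe to reclaim */
}
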