Module Name:    src
Committed By:   riastradh
Date:           Mon Oct 15 23:08:20 UTC 2012

Modified Files:
        src/external/cddl/osnet/dist/uts/common/fs/zfs: zfs_vfsops.c
            zfs_vnops.c zfs_znode.c

Log Message:
Fix various issues in zfs life cycle, locking, and vop protocol.

- Restore some zfs locking and unlocking that got lost randomly.

- Enable use of the BSD vnode lock.  Lock order: all BSD vnode locks
are taken before all zfs internal locks.  There remains an issue with
O_EXCL, to be solved later (famous last words).  KASSERT the locking
scheme up the wazoo.

- Take our cruft out of zfs_lookup and move it to zfs_netbsd_lookup.
Restore much of the way zfs_lookup looked to make merging future
versions easier.  Disable use of the namecache for now because its
locking dance is too scary to contemplate.

- Implement BSD semantics for rename, to appease our tests.  This is
a provisional kludge; eventually we need VOP_RENAME to take a flag
specifying whether to use BSD semantics or POSIX semantics.

- Simplify zfs_netbsd_reclaim and make it work.  Now that getnewvnode
never tries to vclean anything itself, we need not worry about
recursion of ZFS_OBJ_MUTEX locks.

- Clarify and fix genfs node initialization and destruction.

zfs passes most of our atf vfs tests now, including the rename races.

Still to do:

- fix the impedance mismatch between our permissions model and zfs's;
- fix O_EXCL (nontrivial);
- throw dirconc at it and see how badly it explodes;
- find why zpool sometimes wedges itself during mkfs; and
- find why pool caches sometimes seem to get corrupted.


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 \
    src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c
cvs rdiff -u -r1.11 -r1.12 \
    src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c
cvs rdiff -u -r1.12 -r1.13 \
    src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c
diff -u src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c:1.7 src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c:1.8
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c:1.7	Sun Nov 20 02:54:25 2011
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c	Mon Oct 15 23:08:19 2012
@@ -1811,6 +1811,10 @@ zfs_root(vfs_t *vfsp, vnode_t **vpp)
 		*vpp = ZTOV(rootzp);
 	dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp);
 	ZFS_EXIT(zfsvfs);
+	if (error == 0)
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+	KASSERT((error != 0) || (*vpp != NULL));
+	KASSERT((error != 0) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
 	return (error);
 }
 

Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c
diff -u src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c:1.11 src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c:1.12
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c:1.11	Mon Oct  1 18:19:18 2012
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c	Mon Oct 15 23:08:19 2012
@@ -1197,13 +1197,12 @@ specvp_check(vnode_t **vpp, cred_t *cr)
  */
 /* ARGSUSED */
 static int
-zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
-    int nameiop, cred_t *cr, int flags)
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
+    int *direntflags, pathname_t *realpnp)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-	int *direntflags = NULL;
-	void *realpnp = NULL;
 	int	error = 0;
 
 	/* fast path */
@@ -1249,7 +1248,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 	ZFS_VERIFY_ZP(zdp);
 
 	*vpp = NULL;
-	dprintf("zfs_lookup called %s\n", nm);
+
 	if (flags & LOOKUP_XATTR) {
 #ifdef TODO
 		/*
@@ -1301,17 +1300,6 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 		return (error);
 	}
 
-	/*
-	 * Before tediously performing a linear scan of the directory,
-	 * check the name cache to see if the directory/name pair
-	 * we are looking for is known already.
-	 */
-
-	if ((error = cache_lookup(dvp, vpp, cnp)) >= 0) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
@@ -1323,56 +1311,6 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 		error = specvp_check(vpp, cr);
 
 	ZFS_EXIT(zfsvfs);
-	
-	/* Translate errors and add SAVENAME when needed. */
-	if (cnp->cn_flags & ISLASTCN) {
-		switch (nameiop) {
-		case CREATE:
-		case RENAME:
-			if (error == ENOENT) {
-				error = EJUSTRETURN;
-				break;
-			}
-			/* FALLTHROUGH */
-		case DELETE:
-			break;
-		}
-	}
-
-	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
-		int ltype = 0;
-
-		if (cnp->cn_flags & ISDOTDOT) {
-			ltype = VOP_ISLOCKED(dvp);
-			VOP_UNLOCK(dvp);
-		}
-		error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-		if (cnp->cn_flags & ISDOTDOT)
-			vn_lock(dvp, ltype | LK_RETRY);
-		if (error != 0) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-			return (error);
-		}
-	}
-
-	/*
-	 * Insert name into cache if appropriate.
-	 */
-	if ((cnp->cn_flags & MAKEENTRY) == 0){
-		return (error);
-	}
-	switch (error) {
-	case 0:
-		cache_enter(dvp, *vpp, cnp);
-		break;
-	case ENOENT:
-		if (nameiop != CREATE)
-			cache_enter(dvp, *vpp, cnp);
-		break;
-	default:
-		break;
-	}
 	return (error);
 }
 
@@ -2033,6 +1971,12 @@ top:
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	/*
+	 * Grab a lock on the directory to make sure that no one is
+	 * trying to add (or lookup) entries while we are removing it.
+	 */
+	rw_enter(&zp->z_name_lock, RW_WRITER);
+
+	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
@@ -3571,10 +3515,20 @@ top:
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
+#ifndef __NetBSD__
+		/*
+		 * But on NetBSD we have a different system call to do
+		 * this, posix_rename, which sorta kinda handles this
+		 * case (modulo races), and our tests expect BSD
+		 * semantics for rename, so we'll do that until we can
+		 * push the choice between BSD and POSIX semantics into
+		 * the VOP_RENAME protocol as a flag.
+		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
+#endif
 	}
 
 	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
@@ -3623,15 +3577,20 @@ top:
 		return (error);
 	}
 
-	if (tzp)	/* Attempt to remove the existing target */
+	if (tzp && (tzp->z_id != szp->z_id))
+		/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
 
 	if (error == 0) {
-		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+		if (!tzp || (tzp->z_id != szp->z_id))
+			error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
 
-			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+			error = zfs_link_destroy(sdl, szp, tx,
+			    /* Kludge for BSD rename semantics.  */
+			    ((tzp && (tzp->z_id == szp->z_id)) ?
+				zflg : ZRENAMING), NULL);
 			ASSERT(error == 0);
 
 			zfs_log_rename(zilog, tx,
@@ -4779,13 +4738,8 @@ static int
 zfs_netbsd_open(void *v)
 {
 	struct vop_open_args *ap = v;
-	vnode_t	*vp = ap->a_vp;
-	znode_t *zp = VTOZ(vp);
-	int error;
 
-	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
-
-	return (error);
+	return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL));
 }
 
 static int
@@ -4846,63 +4800,346 @@ zfs_netbsd_access(void *v)
 static int
 zfs_netbsd_lookup(void *v)
 {
-	struct vop_lookup_args *ap = v;
+	struct vop_lookup_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
-	int err;
-	
-	ASSERT(cnp->cn_namelen < sizeof(nm));
-	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
+	int error;
 
-	err = zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
-	    cnp->cn_cred, 0);
-	
-	return err;
+	KASSERT(dvp != NULL);
+	KASSERT(vpp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT(cnp->cn_namelen < sizeof nm);
+
+#if 0				/* Namecache too scary to contemplate.  */
+	*vpp = NULL;
+
+	/*
+	 * Do an access check before the cache lookup.  zfs_lookup does
+	 * an access check too, but it's too scary to contemplate
+	 * injecting our namecache stuff into zfs internals.
+	 *
+	 * XXX Is this the correct access check?
+	 */
+	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
+		goto out;
+
+	/*
+	 * Check the namecache before entering zfs_lookup.
+	 * cache_lookup does the locking dance for us.
+	 */
+	if ((error = cache_lookup(dvp, vpp, cnp)) >= 0)
+		goto out;
+#endif
+
+	/*
+	 * zfs_lookup wants a null-terminated component name, but namei
+	 * gives us a pointer into the full pathname.
+	 */
+	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
+
+	error = zfs_lookup(dvp, nm, vpp, NULL, 0, NULL, cnp->cn_cred, NULL,
+	    NULL, NULL);
+
+	/*
+	 * Translate errors to match our namei insanity.
+	 */
+	if (cnp->cn_flags & ISLASTCN) {
+		switch (cnp->cn_nameiop) {
+		case CREATE:
+		case RENAME:
+			if (error == ENOENT) {
+				error = EJUSTRETURN;
+				break;
+			}
+			/* FALLTHROUGH */
+		case DELETE:
+			break;
+		}
+	}
+
+	if (error) {
+		KASSERT(*vpp == NULL);
+		goto out;
+	}
+	KASSERT(*vpp != NULL);	/* XXX Correct?  */
+
+	/*
+	 * Do a locking dance in conformance to the VOP_LOOKUP protocol.
+	 */
+	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
+		KASSERT(!(cnp->cn_flags & ISDOTDOT));
+		KASSERT(dvp == *vpp);
+	} else if ((cnp->cn_namelen == 2) &&
+	    (cnp->cn_nameptr[0] == '.') &&
+	    (cnp->cn_nameptr[1] == '.')) {
+		KASSERT(cnp->cn_flags & ISDOTDOT);
+		VOP_UNLOCK(dvp);
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	} else {
+		KASSERT(!(cnp->cn_flags & ISDOTDOT));
+		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+	}
+
+out:
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT((*vpp == NULL) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
+
+#if 0				/* Namecache too scary to contemplate.  */
+	/*
+	 * Insert name into cache if appropriate.
+	 *
+	 * XXX This seems like a lot of code.  Is it all necessary?
+	 * Can we just do a single cache_enter call?
+	 */
+#if 0
+	cache_enter(dvp, *vpp, cnp);
+#endif
+
+	if ((cnp->cn_flags & MAKEENTRY) == 0)
+		return (error);
+
+	switch (error) {
+	case 0:
+		cache_enter(dvp, *vpp, cnp);
+		break;
+	case ENOENT:
+		KASSERT(*vpp == NULL);
+		if (cnp->cn_nameiop != CREATE)
+			cache_enter(dvp, NULL, cnp);
+		break;
+	default:
+		break;
+	}
+#endif
+
+	return (error);
 }
 
 static int
 zfs_netbsd_create(void *v)
 {
-	struct vop_create_args *ap = v;
+	struct vop_create_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+		struct vattr *a_vap;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
-	vattr_t *vap = ap->a_vap;
+	struct vattr *vap = ap->a_vap;
 	int mode;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vpp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(vap != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 
-	return (zfs_create(ap->a_dvp, (char *)cnp->cn_nameptr, vap, !EXCL, mode,
-		ap->a_vpp, cnp->cn_cred));
+	/* XXX !EXCL is wrong here...  */
+	error = zfs_create(dvp, __UNCONST(cnp->cn_nameptr), vap, !EXCL, mode,
+	    vpp, cnp->cn_cred);
+	if (error) {
+		KASSERT(*vpp == NULL);
+		goto out;
+	}
+	KASSERT(*vpp != NULL);
+
+	/*
+	 * Lock *vpp in conformance to the VOP_CREATE protocol.
+	 */
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+
+out:
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT((*vpp == NULL) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
+
+	/*
+	 * Unlock and release dvp because the VOP_CREATE protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+
+	return (error);
 }
 
 static int
 zfs_netbsd_remove(void *v)
 {
-	struct vop_remove_args *ap = v;
+	struct vop_remove_args /* {
+		struct vnode *a_dvp;
+		struct vnode *a_vp;
+		struct componentname *a_cnp;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *vp = ap->a_vp;
+	struct componentname *cnp = ap->a_cnp;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
+
+	/*
+	 * zfs_remove will look up the entry again itself, so discard vp.
+	 */
+	VOP_UNLOCK(vp);
+	VN_RELE(vp);
+
+	error = zfs_remove(dvp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred, NULL,
+	    0);
+
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
-	return (zfs_remove(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr,
-	    ap->a_cnp->cn_cred, NULL, 0));
+	/*
+	 * Unlock and release dvp because the VOP_REMOVE protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+
+	return (error);
 }
 
 static int
 zfs_netbsd_mkdir(void *v)
 {
-	struct vop_mkdir_args *ap = v;
-	vattr_t *vap = ap->a_vap;
+	struct vop_mkdir_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+		struct vattr *a_vap;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	struct vattr *vap = ap->a_vap;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vpp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(vap != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
 
 	vattr_init_mask(vap);
 
-	return (zfs_mkdir(ap->a_dvp, (char *)ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
-	    ap->a_cnp->cn_cred, NULL, 0, NULL));
+	error = zfs_mkdir(dvp, __UNCONST(cnp->cn_nameptr), vap, vpp,
+	    cnp->cn_cred, NULL, 0, NULL);
+	if (error) {
+		KASSERT(*vpp == NULL);
+		goto out;
+	}
+	KASSERT(*vpp != NULL);
+
+	/*
+	 * Lock *vpp in conformance to the VOP_MKDIR protocol.
+	 */
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+
+out:
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT((*vpp == NULL) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
+
+	/*
+	 * Unlock and release dvp because the VOP_MKDIR protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+
+	return (error);
 }
 
 static int
 zfs_netbsd_rmdir(void *v)
 {
-	struct vop_rmdir_args *ap = v;
+	struct vop_rmdir_args /* {
+		struct vnode *a_dvp;
+		struct vnode *a_vp;
+		struct componentname *a_cnp;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
+
+	/*
+	 * zfs_rmdir will look up the entry again itself, so discard vp.
+	 */
+	VOP_UNLOCK(vp);
+	VN_RELE(vp);
 
-	return (zfs_rmdir(ap->a_dvp, (char *)cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+	error = zfs_rmdir(dvp, __UNCONST(cnp->cn_nameptr), NULL, cnp->cn_cred,
+	    NULL, 0);
+
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+
+	/*
+	 * Unlock and release dvp because the VOP_RMDIR protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+	return error;
 }
 
 static int
@@ -5053,21 +5290,63 @@ zfs_netbsd_rename(void *v)
 	} */ *ap = v;
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
+	struct componentname *fcnp = ap->a_fcnp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
+	struct componentname *tcnp = ap->a_tcnp;
+	kauth_cred_t cred;
 	int error;
 
-	error = zfs_rename(fdvp, (char *)ap->a_fcnp->cn_nameptr, tdvp,
-	    (char *)ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
+	KASSERT(fdvp != NULL);
+	KASSERT(fvp != NULL);
+	KASSERT(fcnp != NULL);
+	KASSERT(fcnp->cn_nameptr != NULL);
+	KASSERT(tdvp != NULL);
+	KASSERT(tcnp != NULL);
+	KASSERT(fcnp->cn_nameptr != NULL);
+	/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
+	/* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
+	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
+	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
+	KASSERT(fdvp->v_type == VDIR);
+	KASSERT(tdvp->v_type == VDIR);
 
-	if (tdvp == tvp)
-		VN_RELE(tdvp);
-	else
-		VN_URELE(tdvp);
-	if (tvp)
-		VN_URELE(tvp);
-	VN_RELE(fdvp);
+	cred = fcnp->cn_cred;
+
+	/*
+	 * XXX Want a better equality test.  `tcnp->cn_cred == cred'
+	 * hoses p2k because puffs transmits the creds separately and
+	 * allocates distinct but equivalent structures for them.
+	 */
+	KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
+
+	/*
+	 * Drop the insane locks.
+	 */
+	VOP_UNLOCK(tdvp);
+	if ((tvp != NULL) && (tvp != tdvp))
+		VOP_UNLOCK(tvp);
+
+	/*
+	 * Release the source and target nodes; zfs_rename will look
+	 * them up again once the locking situation is sane.
+	 */
 	VN_RELE(fvp);
+	if (tvp != NULL)
+		VN_RELE(tvp);
+
+	/*
+	 * Do the rename ZFSly.
+	 */
+	error = zfs_rename(fdvp, __UNCONST(fcnp->cn_nameptr), tdvp,
+	    __UNCONST(tcnp->cn_nameptr), cred, NULL, 0);
+
+	/*
+	 * Release the directories now too, because the VOP_RENAME
+	 * protocol is insane.
+	 */
+	VN_RELE(fdvp);
+	VN_RELE(tdvp);
 
 	return (error);
 }
@@ -5075,15 +5354,65 @@ zfs_netbsd_rename(void *v)
 static int
 zfs_netbsd_symlink(void *v)
 {
-	struct vop_symlink_args *ap = v;
+	struct vop_symlink_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+		struct vattr *a_vap;
+		char *a_target;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
-	vattr_t *vap = ap->a_vap;
+	struct vattr *vap = ap->a_vap;
+	char *target = ap->a_target;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vpp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(vap != NULL);
+	KASSERT(target != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
 
 	vap->va_type = VLNK;	/* Netbsd: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
-	return (zfs_symlink(ap->a_dvp, ap->a_vpp, (char *)cnp->cn_nameptr, vap,
-		ap->a_target, cnp->cn_cred, 0));
+	error = zfs_symlink(dvp, vpp, __UNCONST(cnp->cn_nameptr), vap, target,
+	    cnp->cn_cred, 0);
+	if (error) {
+		KASSERT(*vpp == NULL);
+		goto out;
+	}
+	KASSERT(*vpp != NULL);
+
+
+	/*
+	 * Lock *vpp in conformance to the VOP_SYMLINK protocol.
+	 */
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+
+out:
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	KASSERT((*vpp == NULL) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
+
+	/*
+	 * Unlock and release dvp because the VOP_SYMLINK protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+
+	return (error);
 }
 
 #ifdef PORT_SOLARIS
@@ -5263,10 +5592,41 @@ zfs_netbsd_readlink(void *v)
 static int
 zfs_netbsd_link(void *v)
 {
-	struct vop_link_args *ap = v;
+	struct vop_link_args /* {
+		struct vnode *a_dvp;
+		struct vnode *a_vp;
+		struct componentname *a_cnp;
+	} */ *ap = v;
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
+	int error;
+
+	KASSERT(dvp != NULL);
+	KASSERT(vp != NULL);
+	KASSERT(cnp != NULL);
+	KASSERT(cnp->cn_nameptr != NULL);
+	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
-	return (zfs_link(ap->a_dvp, ap->a_vp, (char *)cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
+#if 0
+	/*
+	 * ZFS doesn't require dvp to be BSD-locked, and the caller
+	 * expects us to drop this lock anyway, so we might as well
+	 * drop it early to encourage concurrency.
+	 */
+	VOP_UNLOCK(dvp);
+#endif
+
+	error = zfs_link(dvp, vp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred,
+	    NULL, 0);
+
+	/*
+	 * Unlock and release dvp because the VOP_LINK protocol is insane.
+	 */
+	VOP_UNLOCK(dvp);
+	VN_RELE(dvp);
+
+	return (error);
 }
 
 static int
@@ -5286,91 +5646,67 @@ zfs_netbsd_inactive(void *v)
 	return (0);
 }
 
-/*
- * Destroy znode from taskq thread without ZFS_OBJ_MUTEX held.
- */
-static void
-zfs_reclaim_deferred(void *arg)
-{
-	znode_t *zp = arg;
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	uint64_t z_id = zp->z_id;
-
-	/*
-	 * Don't allow a zfs_zget() while were trying to release this znode
-	 */
-	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
-
-	/* Don't need to call ZFS_OBJ_HOLD_EXIT zfs_inactive did thatfor us. */
-	zfs_zinactive(zp);
-	
-}
-
 static int
 zfs_netbsd_reclaim(void *v)
 {
-	struct vop_reclaim_args *ap = v;
-	vnode_t	*vp = ap->a_vp;
-	znode_t	*zp = VTOZ(vp);
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap = v;
+	struct vnode *vp = ap->a_vp;
+	znode_t	*zp;
 	zfsvfs_t *zfsvfs;
-	int locked;
-
-	locked = 0;
-	
-	ASSERT(zp != NULL);
-	KASSERT(!vn_has_cached_data(vp));
+	int error;
 
+	KASSERT(vp != NULL);
+	zp = VTOZ(vp);
+	KASSERT(zp != NULL);
 	zfsvfs = zp->z_zfsvfs;
-	
-	mutex_enter(&zp->z_lock);
-	ASSERT(zp->z_phys);
+	KASSERT(zfsvfs != NULL);
 
-//	dprintf("destroying znode %p -- vnode %p -- zp->z_buf = %p\n", zp, ZTOV(zp), zp->z_dbuf);
-//	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	genfs_node_destroy(vp);
-	cache_purge(vp);
+	/* Not until we get uvm and zfs talking to one another.  */
+	KASSERT(!vn_has_cached_data(vp));
 
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_dbuf == NULL) {
 		/*
 		 * The fs has been unmounted, or we did a
 		 * suspend/resume and this file no longer exists.
 		 */
+		if (vn_has_cached_data(vp))
+			/* zfs and uvm are hosed.  Should not happen.  */
+			panic("zfs vnode has cached data (0): %p", vp);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		mutex_exit(&zp->z_lock);
 		zfs_znode_free(zp);
 		return (0);
 	}
-	mutex_exit(&zp->z_lock);
 
-	mutex_enter(&zp->z_lock);
-	if (!zp->z_unlinked) {
-		/*
-		 * XXX Hack because ZFS_OBJ_MUTEX is held we can't call zfs_zinactive
-		 * now. I need to defer zfs_zinactive to another thread which doesn't hold this mutex.
-		 */
-		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
-		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
-		if (locked == 0) {
-			/*
-			 * Lock can't be obtained due to deadlock possibility,
-			 * so defer znode destruction.
-			 */
-			taskq_dispatch(system_taskq, zfs_reclaim_deferred, zp, 0);
+	/*
+	 * Attempt to push any data in the page cache.  If this fails
+	 * we will get kicked out later in zfs_zinactive().
+	 */
+	if (vn_has_cached_data(vp))
+		/* zfs and uvm are hosed.  Should not happen.  */
+		panic("zfs vnode has cached data (1): %p", vp);
+
+	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
 		} else {
-			zfs_znode_dmu_fini(zp);
-			/* Our LWP is holding ZFS_OBJ_HELD mutex but it was locked before
-			   zfs_zinactive was called therefore we can't release it. */
-			if (locked == 1)
-				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
-			zfs_znode_free(zp);
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			mutex_enter(&zp->z_lock);
+			zp->z_atime_dirty = 0;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
 		}
-	} else
-		mutex_exit(&zp->z_lock);
+	}
 
-	ZTOV(zp) = NULL;
-	vp->v_data = NULL; /* v_data must be NULL for a cleaned vnode. */
-	
-	return (0);
+	zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	return 0;
 }
 
 static int
@@ -5414,9 +5750,6 @@ zfs_netbsd_pathconf(void *v)
 		case _PC_CHOWN_RESTRICTED:
 			*ap->a_retval = 1;
 			return (0);
-		case _PC_NO_TRUNC:
-			*ap->a_retval = 1;
-			return (0);
 		case _PC_VDISABLE:
 			*ap->a_retval = _POSIX_VDISABLE;
 			return (0);
@@ -5424,10 +5757,15 @@ zfs_netbsd_pathconf(void *v)
 			return (EINVAL);
 		}
 		/* NOTREACHED */
-	}	
+	}
 	return (error);
 }
 
+#if 1
+#  define	zfs_netbsd_lock		genfs_lock
+#  define	zfs_netbsd_unlock	genfs_unlock
+#  define	zfs_netbsd_islocked	genfs_islocked
+#else
 int
 zfs_netbsd_lock(void *v)
 {
@@ -5449,6 +5787,7 @@ zfs_netbsd_islocked(void *v)
 
 	return LK_EXCLUSIVE;
 }
+#endif
 
 /*
 int

Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c
diff -u src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c:1.12 src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c:1.13
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c:1.12	Sun Nov 20 02:54:25 2011
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c	Mon Oct 15 23:08:19 2012
@@ -618,8 +618,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
 	int error;
 
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
-	for (;;) {
 
+	for (;;) {
 		error = getnewvnode(VT_ZFS, zfsvfs->z_parent->z_vfs,
 		    zfs_vnodeop_p, NULL, &zp->z_vnode);
 		if (__predict_true(error == 0))
@@ -628,7 +628,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
 		    "error=%d\n", error);
 		(void)kpause("zfsnewvn", false, hz, NULL);
 	}
-	
+
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_dbuf == NULL);
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
@@ -656,6 +656,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
 	vp->v_data = zp;
+	genfs_node_init(vp, &zfs_genfsops);
 	switch (vp->v_type) {
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
@@ -686,6 +687,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_bu
 	zp->z_zfsvfs = zfsvfs;
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
+	VFS_HOLD(zfsvfs->z_vfs);
 	return (zp);
 }
 
@@ -998,11 +1000,6 @@ again:
 	if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
 		zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
 		*zpp = zp;
-
-		vp = ZTOV(zp);
-		genfs_node_init(vp, &zfs_genfsops);
-		VOP_UNLOCK(vp);
-
 		err = 0;
 	} else {
 		dmu_buf_rele(db, NULL);
@@ -1076,17 +1073,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *
 	zfs_znode_free(zp);
 }
 
-/*
- * zfs_zinactive must be called with ZFS_OBJ_HOLD_ENTER held. And this lock
- * will be released in zfs_zinactive.
- */
 void
 zfs_zinactive(znode_t *zp)
 {
 	vnode_t	*vp = ZTOV(zp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;
-	
+
 	ASSERT(zp->z_dbuf && zp->z_phys);
 
 	/*
@@ -1107,7 +1100,7 @@ zfs_zinactive(znode_t *zp)
 	}
 
 	mutex_exit(&zp->z_lock);
-	/* XXX why disabled zfs_znode_dmu_fini(zp); */
+	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);
 }
@@ -1116,7 +1109,14 @@ void
 zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	ASSERT(ZTOV(zp) == NULL);
+	struct vnode *vp = ZTOV(zp);
+
+	/* XXX Not all callers are from VOP_RECLAIM.  What to do?  */
+	KASSERT(vp != NULL);
+	mutex_enter(vp->v_interlock); /* XXX Necessary?  */
+	genfs_node_destroy(vp);
+	vp->v_data = NULL;
+	mutex_exit(vp->v_interlock);
 
 	dprintf("destroying znode %p\n", zp);
 	//cpu_Debugger();
@@ -1403,6 +1403,8 @@ top:
 
 	dmu_tx_commit(tx);
 
+	zfs_range_unlock(rl);
+
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of

Reply via email to