Hi again! Another day, another patch. :)
Testing the same setup as in the earlier "Godfather ZIO lock congestion"
thread (small strided reads from 256 threads concurrently on a 40-core
machine), but with ARC size sufficient to fit all test data, I hit
another lock congestion, this time on the RRW lock
zfsvfs->z_teardown_lock. Earlier I've seen the same congestion even on a
smaller machine while profiling the SPEC NFS benchmark.
To avoid the congestion, the attached patch replaces the single
teardown RRW lock per struct zfsvfs with a bunch (17) of them. Read
acquisitions are randomly distributed among them based on the curthread
pointer to avoid any measurable congestion in the hot path. Write
acquisitions go to all of the locks, but they should be rare enough
not to matter.
As a result, performance on this test setup increased from ~475K IOPS
to ~1.3M IOPS.
Any comments?
--
Alexander Motin
Index: sys/zfs_vfsops.h
===================================================================
--- sys/zfs_vfsops.h (revision 267049)
+++ sys/zfs_vfsops.h (working copy)
@@ -64,7 +64,8 @@ struct zfsvfs {
int z_norm; /* normalization flags */
boolean_t z_atime; /* enable atimes mount option */
boolean_t z_unmounted; /* unmounted */
- rrwlock_t z_teardown_lock;
+#define ZFS_TD_LOCKS 17 /* number of locks (prime number) */
+ rrwlock_t z_teardown_lock[ZFS_TD_LOCKS];
krwlock_t z_teardown_inactive_lock;
list_t z_all_znodes; /* all vnodes in the fs */
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
Index: sys/zfs_znode.h
===================================================================
--- sys/zfs_znode.h (revision 267049)
+++ sys/zfs_znode.h (working copy)
@@ -253,10 +253,12 @@ VTOZ(vnode_t *vp)
#define VTOZ(VP) ((znode_t *)(VP)->v_data)
#endif
+#define ZFS_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % ZFS_TD_LOCKS)
+
/* Called on entry to each ZFS vnode and vfs operation */
#define ZFS_ENTER(zfsvfs) \
{ \
- rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+ rrw_enter_read(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], FTAG); \
if ((zfsvfs)->z_unmounted) { \
ZFS_EXIT(zfsvfs); \
return (EIO); \
@@ -265,11 +267,27 @@ VTOZ(vnode_t *vp)
/* Called on entry to each ZFS vnode and vfs operation that can not return EIO */
#define ZFS_ENTER_NOERROR(zfsvfs) \
- rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
+ rrw_enter(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], RW_READER, FTAG)
/* Must be called before exiting the vop */
-#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+#define ZFS_EXIT(zfsvfs) \
+ rrw_exit(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], FTAG)
+#define ZFS_ENTERW(zfsvfs) \
+ { \
+ int i; \
+ \
+ for (i = 0; i < ZFS_TD_LOCKS; i++) \
+ rrw_enter(&(zfsvfs)->z_teardown_lock[i], RW_WRITER, FTAG); \
+ }
+#define ZFS_EXITW(zfsvfs) \
+ { \
+ int i; \
+ \
+ for (i = 0; i < ZFS_TD_LOCKS; i++) \
+ rrw_exit(&(zfsvfs)->z_teardown_lock[i], FTAG); \
+ }
+
/* Verifies the znode is valid */
#define ZFS_VERIFY_ZP(zp) \
if ((zp)->z_sa_hdl == NULL) { \
Index: zfs_ioctl.c
===================================================================
--- zfs_ioctl.c (revision 267049)
+++ zfs_ioctl.c (working copy)
@@ -1463,27 +1463,39 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t
if (getzfsvfs(name, zfvp) != 0)
error = zfsvfs_create(name, zfvp);
- if (error == 0) {
- rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
- RW_READER, tag);
+ if (error != 0)
+ return (error);
+
+ /*
+ * XXX we could probably try again, since the unmounting
+ * thread should be just about to disassociate the
+ * objset from the zfsvfs.
+ */
+ if (writer) {
+ ZFS_ENTERW(*zfvp);
if ((*zfvp)->z_unmounted) {
- /*
- * XXX we could probably try again, since the unmounting
- * thread should be just about to disassociate the
- * objset from the zfsvfs.
- */
- rrw_exit(&(*zfvp)->z_teardown_lock, tag);
+ ZFS_EXITW(*zfvp);
return (SET_ERROR(EBUSY));
}
+ } else {
+ ZFS_ENTER_NOERROR(*zfvp);
+ if ((*zfvp)->z_unmounted) {
+ ZFS_EXIT(*zfvp);
+ return (SET_ERROR(EBUSY));
+ }
}
return (error);
}
static void
-zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag, boolean_t writer)
{
- rrw_exit(&zfsvfs->z_teardown_lock, tag);
+ if (writer) {
+ ZFS_EXITW(zfsvfs);
+ } else
+ ZFS_EXIT(zfsvfs);
+
if (zfsvfs->z_vfs) {
VFS_RELE(zfsvfs->z_vfs);
} else {
@@ -2411,7 +2423,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_
err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
if (err == 0) {
err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
- zfsvfs_rele(zfsvfs, FTAG);
+ zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
}
return (err);
@@ -2492,7 +2504,7 @@ zfs_prop_set_special(const char *dsname, zprop_sou
break;
err = zfs_set_version(zfsvfs, intval);
- zfsvfs_rele(zfsvfs, FTAG);
+ zfsvfs_rele(zfsvfs, FTAG, B_TRUE);
if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
zfs_cmd_t *zc;
@@ -4705,7 +4717,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
error = zfs_userspace_one(zfsvfs,
zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
- zfsvfs_rele(zfsvfs, FTAG);
+ zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
return (error);
}
@@ -4745,7 +4757,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
zc->zc_nvlist_dst_size, zc->zc_iflags);
}
kmem_free(buf, bufsize);
- zfsvfs_rele(zfsvfs, FTAG);
+ zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
return (error);
}
Index: zfs_vfsops.c
===================================================================
--- zfs_vfsops.c (revision 267049)
+++ zfs_vfsops.c (working copy)
@@ -988,7 +988,8 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
- rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
+ for (i = 0; i < ZFS_TD_LOCKS; i++)
+ rrw_init(&zfsvfs->z_teardown_lock[i], B_FALSE);
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1104,7 +1105,8 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
- rrw_destroy(&zfsvfs->z_teardown_lock);
+ for (i = 0; i < ZFS_TD_LOCKS; i++)
+ rrw_destroy(&zfsvfs->z_teardown_lock[i]);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1833,7 +1835,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
{
znode_t *zp;
- rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ ZFS_ENTERW(zfsvfs);
if (!unmounting) {
/*
@@ -1866,7 +1868,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
*/
if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ ZFS_EXITW(zfsvfs);
return (SET_ERROR(EIO));
}
@@ -1893,8 +1895,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
*/
if (unmounting) {
zfsvfs->z_unmounted = B_TRUE;
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_EXITW(zfsvfs);
}
/*
@@ -1970,9 +1972,9 @@ zfs_umount(vfs_t *vfsp, int fflag)
* vflush(FORCECLOSE). This way we ensure no future vnops
* will be called and risk operating on DOOMED vnodes.
*/
- rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ ZFS_ENTERW(zfsvfs);
zfsvfs->z_unmounted = B_TRUE;
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ ZFS_EXITW(zfsvfs);
}
/*
@@ -2240,7 +2242,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
znode_t *zp;
uint64_t sa_obj = 0;
- ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock[0]));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
/*
@@ -2296,7 +2298,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ ZFS_EXITW(zfsvfs);
if (err) {
/*
Index: zfs_znode.c
===================================================================
--- zfs_znode.c (revision 267049)
+++ zfs_znode.c (working copy)
@@ -276,7 +276,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t siz
* can safely ensure that the filesystem is not and will not be
* unmounted. The next statement is equivalent to ZFS_ENTER().
*/
- rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+ ZFS_ENTER_NOERROR(zfsvfs);
if (zfsvfs->z_unmounted) {
ZFS_EXIT(zfsvfs);
rw_exit(&zfsvfs_lock);
_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer