Hi again! Another day, another patch. :)

Testing the same setup as in the earlier "Godfather ZIO lock congestion" thread (small strided reads from 256 threads concurrently on a 40-core machine), but with an ARC size sufficient to fit all test data, I hit another lock congestion, this time on the RRW lock zfsvfs->z_teardown_lock. Earlier I've seen the same congestion even on a smaller machine while profiling the SPEC NFS benchmark.

To avoid the congestion, the attached patch replaces the single teardown RRW lock per struct zfsvfs with a bunch (17) of them. Read acquisitions are randomly distributed among them based on the curthread pointer, to avoid any measurable congestion in the hot path. Write acquisitions go to all of the locks, but they should be rare enough not to matter.

As a result, performance on this test setup increased from ~475K IOPS to ~1.3M IOPS.

Any comments?

--
Alexander Motin
Index: sys/zfs_vfsops.h
===================================================================
--- sys/zfs_vfsops.h    (revision 267049)
+++ sys/zfs_vfsops.h    (working copy)
@@ -64,7 +64,8 @@ struct zfsvfs {
        int             z_norm;         /* normalization flags */
        boolean_t       z_atime;        /* enable atimes mount option */
        boolean_t       z_unmounted;    /* unmounted */
-       rrwlock_t       z_teardown_lock;
+#define        ZFS_TD_LOCKS    17              /* number of locks (prime number) */
+       rrwlock_t       z_teardown_lock[ZFS_TD_LOCKS];
        krwlock_t       z_teardown_inactive_lock;
        list_t          z_all_znodes;   /* all vnodes in the fs */
        kmutex_t        z_znodes_lock;  /* lock for z_all_znodes */
Index: sys/zfs_znode.h
===================================================================
--- sys/zfs_znode.h     (revision 267049)
+++ sys/zfs_znode.h     (working copy)
@@ -253,10 +253,12 @@ VTOZ(vnode_t *vp)
 #define        VTOZ(VP)        ((znode_t *)(VP)->v_data)
 #endif
 
+#define        ZFS_TD_LOCK()   (((uint32_t)(uintptr_t)(curthread)) % ZFS_TD_LOCKS)
+
 /* Called on entry to each ZFS vnode and vfs operation  */
 #define        ZFS_ENTER(zfsvfs) \
        { \
-               rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+               rrw_enter_read(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], FTAG); \
                if ((zfsvfs)->z_unmounted) { \
                        ZFS_EXIT(zfsvfs); \
                        return (EIO); \
@@ -265,11 +267,27 @@ VTOZ(vnode_t *vp)
 
 /* Called on entry to each ZFS vnode and vfs operation that can not return EIO */
 #define        ZFS_ENTER_NOERROR(zfsvfs) \
-       rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
+       rrw_enter(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], RW_READER, FTAG)
 
 /* Must be called before exiting the vop */
-#define        ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+#define        ZFS_EXIT(zfsvfs) \
+       rrw_exit(&(zfsvfs)->z_teardown_lock[ZFS_TD_LOCK()], FTAG)
 
+#define        ZFS_ENTERW(zfsvfs) \
+       {                                                               \
+               int i;                                                  \
+                                                                       \
+               for (i = 0; i < ZFS_TD_LOCKS; i++)                      \
+                       rrw_enter(&(zfsvfs)->z_teardown_lock[i], RW_WRITER, FTAG); \
+       }
+#define        ZFS_EXITW(zfsvfs) \
+       {                                                               \
+               int i;                                                  \
+                                                                       \
+               for (i = 0; i < ZFS_TD_LOCKS; i++)                      \
+                       rrw_exit(&(zfsvfs)->z_teardown_lock[i], FTAG);  \
+       }
+
 /* Verifies the znode is valid */
 #define        ZFS_VERIFY_ZP(zp) \
        if ((zp)->z_sa_hdl == NULL) { \
Index: zfs_ioctl.c
===================================================================
--- zfs_ioctl.c (revision 267049)
+++ zfs_ioctl.c (working copy)
@@ -1463,27 +1463,39 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t
 
        if (getzfsvfs(name, zfvp) != 0)
                error = zfsvfs_create(name, zfvp);
-       if (error == 0) {
-               rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
-                   RW_READER, tag);
+       if (error != 0)
+               return (error);
+
+       /*
+        * XXX we could probably try again, since the unmounting
+        * thread should be just about to disassociate the
+        * objset from the zfsvfs.
+        */
+       if (writer) {
+               ZFS_ENTERW(*zfvp);
                if ((*zfvp)->z_unmounted) {
-                       /*
-                        * XXX we could probably try again, since the unmounting
-                        * thread should be just about to disassociate the
-                        * objset from the zfsvfs.
-                        */
-                       rrw_exit(&(*zfvp)->z_teardown_lock, tag);
+                       ZFS_EXITW(*zfvp);
                        return (SET_ERROR(EBUSY));
                }
+       } else {
+               ZFS_ENTER_NOERROR(*zfvp);
+               if ((*zfvp)->z_unmounted) {
+                       ZFS_EXIT(*zfvp);
+                       return (SET_ERROR(EBUSY));
+               }
        }
        return (error);
 }
 
 static void
-zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag, boolean_t writer)
 {
-       rrw_exit(&zfsvfs->z_teardown_lock, tag);
 
+       if (writer) {
+               ZFS_EXITW(zfsvfs);
+       } else
+               ZFS_EXIT(zfsvfs);
+
        if (zfsvfs->z_vfs) {
                VFS_RELE(zfsvfs->z_vfs);
        } else {
@@ -2411,7 +2423,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_
        err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
        if (err == 0) {
                err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
-               zfsvfs_rele(zfsvfs, FTAG);
+               zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
        }
 
        return (err);
@@ -2492,7 +2504,7 @@ zfs_prop_set_special(const char *dsname, zprop_sou
                        break;
 
                err = zfs_set_version(zfsvfs, intval);
-               zfsvfs_rele(zfsvfs, FTAG);
+               zfsvfs_rele(zfsvfs, FTAG, B_TRUE);
 
                if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
                        zfs_cmd_t *zc;
@@ -4705,7 +4717,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
 
        error = zfs_userspace_one(zfsvfs,
            zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
-       zfsvfs_rele(zfsvfs, FTAG);
+       zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
 
        return (error);
 }
@@ -4745,7 +4757,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
                    zc->zc_nvlist_dst_size, zc->zc_iflags);
        }
        kmem_free(buf, bufsize);
-       zfsvfs_rele(zfsvfs, FTAG);
+       zfsvfs_rele(zfsvfs, FTAG, B_FALSE);
 
        return (error);
 }
Index: zfs_vfsops.c
===================================================================
--- zfs_vfsops.c        (revision 267049)
+++ zfs_vfsops.c        (working copy)
@@ -988,7 +988,8 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
        mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
-       rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
+       for (i = 0; i < ZFS_TD_LOCKS; i++)
+               rrw_init(&zfsvfs->z_teardown_lock[i], B_FALSE);
        rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1104,7 +1105,8 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
        mutex_destroy(&zfsvfs->z_znodes_lock);
        mutex_destroy(&zfsvfs->z_lock);
        list_destroy(&zfsvfs->z_all_znodes);
-       rrw_destroy(&zfsvfs->z_teardown_lock);
+       for (i = 0; i < ZFS_TD_LOCKS; i++)
+               rrw_destroy(&zfsvfs->z_teardown_lock[i]);
        rw_destroy(&zfsvfs->z_teardown_inactive_lock);
        rw_destroy(&zfsvfs->z_fuid_lock);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1833,7 +1835,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
 {
        znode_t *zp;
 
-       rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+       ZFS_ENTERW(zfsvfs);
 
        if (!unmounting) {
                /*
@@ -1866,7 +1868,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
         */
        if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+               ZFS_EXITW(zfsvfs);
                return (SET_ERROR(EIO));
        }
 
@@ -1893,8 +1895,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
         */
        if (unmounting) {
                zfsvfs->z_unmounted = B_TRUE;
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
+               ZFS_EXITW(zfsvfs);
        }
 
        /*
@@ -1970,9 +1972,9 @@ zfs_umount(vfs_t *vfsp, int fflag)
                 * vflush(FORCECLOSE). This way we ensure no future vnops
                 * will be called and risk operating on DOOMED vnodes.
                 */
-               rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+               ZFS_ENTERW(zfsvfs);
                zfsvfs->z_unmounted = B_TRUE;
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+               ZFS_EXITW(zfsvfs);
        }
 
        /*
@@ -2240,7 +2242,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
        znode_t *zp;
        uint64_t sa_obj = 0;
 
-       ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+       ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock[0]));
        ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
        /*
@@ -2296,7 +2298,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
 bail:
        /* release the VOPs */
        rw_exit(&zfsvfs->z_teardown_inactive_lock);
-       rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+       ZFS_EXITW(zfsvfs);
 
        if (err) {
                /*
Index: zfs_znode.c
===================================================================
--- zfs_znode.c (revision 267049)
+++ zfs_znode.c (working copy)
@@ -276,7 +276,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t siz
         * can safely ensure that the filesystem is not and will not be
         * unmounted. The next statement is equivalent to ZFS_ENTER().
         */
-       rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+       ZFS_ENTER_NOERROR(zfsvfs);
        if (zfsvfs->z_unmounted) {
                ZFS_EXIT(zfsvfs);
                rw_exit(&zfsvfs_lock);
_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer

Reply via email to