On 07.06.2014 22:35, Matthew Ahrens wrote:
code review:

zfsvfs_hold/rele need to pass the correct tags through to rrw_enter().
The existing code will not work (if reference_tracking_enable), because
you will be holding with tag "zfsvfs_hold" and releasing with
"zfsvfs_rele". The existing code holds and releases with tag e.g.
"zfs_prop_set_userquota"; that needs to remain the case.

Indeed there was a bug, thank you.

I think it would be cleaner to encapsulate the multitude of locks within
the rrw code, but I can live with it as is.

I had such a thought initially, but then decided it would be too big a mess to do it inside. Also, it may not always be needed. But now that you've asked, I've made an intermediate solution: let me introduce Recursable Read-Mostly locks, as a wrapper on top of the existing Recursable Read-Write locks. :)

The attached patch implements rrmlock, and uses them for teardown lock. It indeed looks much better. Thanks for pushing me to do it. :) What do you think?

On Thu, Jun 5, 2014 at 12:34 PM, Alexander Motin <[email protected]
<mailto:[email protected]>> wrote:

    Hi again! Another day, another patch. :)

    Testing the same setup as in earlier "Godfather ZIO lock congestion"
    thread (small strided reads from 256 threads concurrently on 40-core
    machine) but with ARC size sufficient to fit all test data, I hit
    another lock congestion on RRW lock of zfsvfs->z_teardown_lock.
    Earlier I've seen the same congestion even on smaller machine while
    profiling SPEC NFS benchmark.

    To avoid the congestion with attached patch I've replaced single
    teardown RRW lock per struct vfszfs with bunch (17) of them. Read
    acquisitions are randomly distributed among them based on curthread
    pointer to avoid any measurable congestion in a hot path. Write
    acquisition are going to all the locks, but they should be rare
    enough to not bother.

    As result, performance on this test setup increased from ~475K IOPS
    to ~1.3M IOPS.

    Any comments?

    --
    Alexander Motin

    _______________________________________________
    developer mailing list
    [email protected] <mailto:[email protected]>
    http://lists.open-zfs.org/mailman/listinfo/developer




--
Alexander Motin
Index: rrwlock.c
===================================================================
--- rrwlock.c   (revision 267215)
+++ rrwlock.c   (working copy)
@@ -286,3 +286,73 @@ rrw_tsd_destroy(void *arg)
                    (void *)curthread, (void *)rn->rn_rrl);
        }
 }
+
+
+/*
+ * Initialize a read-mostly (rrm) lock: an array of RRM_NUM_LOCKS
+ * underlying rrwlocks.  track_all is passed straight through to each
+ * child lock (enables per-holder reference tracking for debugging).
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_init(&rrl->locks[i], track_all);
+}
+
+/*
+ * Destroy an rrm lock by destroying each of its child rrwlocks.
+ * Must not be called while any child lock is held.
+ */
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_destroy(&rrl->locks[i]);
+}
+
+/*
+ * Enter the rrm lock as reader or writer, dispatching on rw.
+ * The tag is only meaningful for readers (it is threaded through to
+ * rrw_enter_read() for reference tracking); writers take all child
+ * locks and need no tag.
+ */
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+       if (rw == RW_READER)
+               rrm_enter_read(rrl, tag);
+       else
+               rrm_enter_write(rrl);
+}
+
+/*
+ * Pick one of the RRM_NUM_LOCKS child locks based on the current
+ * thread pointer.  This spreads readers across the child locks so the
+ * hot read path does not contend on a single cache line.
+ * (NOTE: this line was wrapped by the mail client in the posted patch;
+ * it must be a single preprocessor line.)
+ */
+#define        RRM_TD_LOCK()   (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+/*
+ * A reader enters only its per-thread child lock.  The same thread
+ * must later exit via rrm_exit() so RRM_TD_LOCK() maps back to the
+ * same child lock.
+ */
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+
+       rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+/*
+ * A writer must exclude readers on every child lock, so it enters all
+ * RRM_NUM_LOCKS of them (in ascending index order, which keeps writer
+ * vs. writer acquisition deadlock-free).  Expected to be rare relative
+ * to reads.
+ */
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+       int i;
+
+       for (i = 0; i < RRM_NUM_LOCKS; i++)
+               rrw_enter_write(&rrl->locks[i]);
+}
+
+/*
+ * Exit the rrm lock.  A writer is detected by checking rr_writer on
+ * child lock 0 (a writer holds ALL child locks, see rrm_enter_write(),
+ * so checking one is sufficient) and must release every child lock.
+ * A reader holds only its per-thread child lock and releases just that
+ * one.  The posted patch had these two branches swapped: the writer
+ * path released a single lock (leaving 16 held forever) and the reader
+ * path tried to exit 16 locks it never entered.
+ */
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+       int i;
+
+       if (rrl->locks[0].rr_writer == curthread) {
+               for (i = 0; i < RRM_NUM_LOCKS; i++)
+                       rrw_exit(&rrl->locks[i], tag);
+       } else {
+               rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+       }
+}
+
+/*
+ * Query ownership of the rrm lock.  For RW_WRITER, checking child
+ * lock 0 suffices because a writer holds all child locks.  For
+ * RW_READER, a reader only ever enters its per-thread child lock, so
+ * that is the only one that needs checking — but note this is only
+ * valid when asked by the same thread that did the rrm_enter_read().
+ */
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+
+       if (rw == RW_WRITER) {
+               return (rrw_held(&rrl->locks[0], rw));
+       } else {
+               return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+       }
+}
Index: sys/rrwlock.h
===================================================================
--- sys/rrwlock.h       (revision 267215)
+++ sys/rrwlock.h       (working copy)
@@ -79,6 +79,24 @@ void rrw_tsd_destroy(void *arg);
 #define        RRW_LOCK_HELD(x) \
        (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
 
+/*
+ * Read-mostly (rrm) lock: a scalability wrapper around rrwlock_t.
+ * Readers are hashed by thread pointer onto one of RRM_NUM_LOCKS
+ * child locks; writers take all of them.  This removes reader-side
+ * contention on hot locks (e.g. zfsvfs_t z_teardown_lock) at the cost
+ * of more expensive (but rare) write acquisitions.  RRM_NUM_LOCKS is
+ * prime so the modulo spreads thread pointers evenly.
+ */
+#define        RRM_NUM_LOCKS           17
+typedef struct rrmlock {
+       rrwlock_t       locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+/* Convenience predicates mirroring the RRW_*_HELD macros above. */
+#define        RRM_READ_HELD(x)        rrm_held(x, RW_READER)
+#define        RRM_WRITE_HELD(x)       rrm_held(x, RW_WRITER)
+#define        RRM_LOCK_HELD(x) \
+       (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
+
 #ifdef __cplusplus
 }
 #endif
Index: sys/zfs_vfsops.h
===================================================================
--- sys/zfs_vfsops.h    (revision 267215)
+++ sys/zfs_vfsops.h    (working copy)
@@ -64,7 +64,7 @@ struct zfsvfs {
        int             z_norm;         /* normalization flags */
        boolean_t       z_atime;        /* enable atimes mount option */
        boolean_t       z_unmounted;    /* unmounted */
-       rrwlock_t       z_teardown_lock;
+       rrmlock_t       z_teardown_lock;
        krwlock_t       z_teardown_inactive_lock;
        list_t          z_all_znodes;   /* all vnodes in the fs */
        kmutex_t        z_znodes_lock;  /* lock for z_all_znodes */
Index: sys/zfs_znode.h
===================================================================
--- sys/zfs_znode.h     (revision 267215)
+++ sys/zfs_znode.h     (working copy)
@@ -256,7 +256,7 @@ VTOZ(vnode_t *vp)
 /* Called on entry to each ZFS vnode and vfs operation  */
 #define        ZFS_ENTER(zfsvfs) \
        { \
-               rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+               rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
                if ((zfsvfs)->z_unmounted) { \
                        ZFS_EXIT(zfsvfs); \
                        return (EIO); \
@@ -265,10 +265,10 @@ VTOZ(vnode_t *vp)
 
 /* Called on entry to each ZFS vnode and vfs operation that can not return EIO */
 #define        ZFS_ENTER_NOERROR(zfsvfs) \
-       rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
+       rrm_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
 
 /* Must be called before exiting the vop */
-#define        ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+#define        ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
 
 /* Verifies the znode is valid */
 #define        ZFS_VERIFY_ZP(zp) \
Index: zfs_ioctl.c
===================================================================
--- zfs_ioctl.c (revision 267215)
+++ zfs_ioctl.c (working copy)
@@ -1464,7 +1464,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t
        if (getzfsvfs(name, zfvp) != 0)
                error = zfsvfs_create(name, zfvp);
        if (error == 0) {
-               rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+               rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
                    RW_READER, tag);
                if ((*zfvp)->z_unmounted) {
                        /*
@@ -1472,7 +1472,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t
                         * thread should be just about to disassociate the
                         * objset from the zfsvfs.
                         */
-                       rrw_exit(&(*zfvp)->z_teardown_lock, tag);
+                       rrm_exit(&(*zfvp)->z_teardown_lock, tag);
                        return (SET_ERROR(EBUSY));
                }
        }
@@ -1482,7 +1482,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t
 static void
 zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
 {
-       rrw_exit(&zfsvfs->z_teardown_lock, tag);
+       rrm_exit(&zfsvfs->z_teardown_lock, tag);
 
        if (zfsvfs->z_vfs) {
                VFS_RELE(zfsvfs->z_vfs);
Index: zfs_vfsops.c
===================================================================
--- zfs_vfsops.c        (revision 267215)
+++ zfs_vfsops.c        (working copy)
@@ -988,7 +988,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
        mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));
-       rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
+       rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
        rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1104,7 +1104,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
        mutex_destroy(&zfsvfs->z_znodes_lock);
        mutex_destroy(&zfsvfs->z_lock);
        list_destroy(&zfsvfs->z_all_znodes);
-       rrw_destroy(&zfsvfs->z_teardown_lock);
+       rrm_destroy(&zfsvfs->z_teardown_lock);
        rw_destroy(&zfsvfs->z_teardown_inactive_lock);
        rw_destroy(&zfsvfs->z_fuid_lock);
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1833,7 +1833,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
 {
        znode_t *zp;
 
-       rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+       rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
        if (!unmounting) {
                /*
@@ -1866,7 +1866,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
         */
        if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+               rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
                return (SET_ERROR(EIO));
        }
 
@@ -1893,7 +1893,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmoun
         */
        if (unmounting) {
                zfsvfs->z_unmounted = B_TRUE;
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+               rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
                rw_exit(&zfsvfs->z_teardown_inactive_lock);
        }
 
@@ -1970,9 +1970,9 @@ zfs_umount(vfs_t *vfsp, int fflag)
                 * vflush(FORCECLOSE). This way we ensure no future vnops
                 * will be called and risk operating on DOOMED vnodes.
                 */
-               rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+               rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
                zfsvfs->z_unmounted = B_TRUE;
-               rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+               rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
        }
 
        /*
@@ -2240,7 +2240,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
        znode_t *zp;
        uint64_t sa_obj = 0;
 
-       ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+       ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
        ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
        /*
@@ -2296,7 +2296,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname
 bail:
        /* release the VOPs */
        rw_exit(&zfsvfs->z_teardown_inactive_lock);
-       rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+       rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
        if (err) {
                /*
Index: zfs_znode.c
===================================================================
--- zfs_znode.c (revision 267215)
+++ zfs_znode.c (working copy)
@@ -276,7 +276,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t siz
         * can safely ensure that the filesystem is not and will not be
         * unmounted. The next statement is equivalent to ZFS_ENTER().
         */
-       rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+       rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
        if (zfsvfs->z_unmounted) {
                ZFS_EXIT(zfsvfs);
                rw_exit(&zfsvfs_lock);
_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer

Reply via email to