Hi.

While doing high-IOPS ZFS benchmarking and profiling on a 40-core FreeBSD system with a bunch of SSDs, I noticed significant lock contention while spinning on the spa->spa_async_zio_root ZIO lock. As I found, it was caused by multiple concurrently starting and completing prefetch ZIOs fighting for the same lock. To mitigate the problem I've replaced the single async root ZIO with 16, as you may see in the attached patch.

As a result, on a concurrent strided read with 4K blocks and 4K recordsize from 256 threads (iozone -i 5 -w -r 4k -t 256 -s 256M), I've practically doubled the test result, rising from 150K IOPS originally to 300K IOPS with the patch, while total CPU load dropped from 100% to ~60%.

So I would like to hear people's opinions about this patch. Won't having multiple ZIOs and calling multiple zio_wait()'s cause any problems on pool export?

Thank you.

--
Alexander Motin
Index: spa.c
===================================================================
--- spa.c       (revision 267049)
+++ spa.c       (working copy)
@@ -1129,7 +1129,7 @@ spa_deactivate(spa_t *spa)
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
-       ASSERT(spa->spa_async_zio_root == NULL);
+       ASSERT(spa->spa_async_zio_root[0] == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
        /*
@@ -1272,9 +1272,11 @@ spa_unload(spa_t *spa)
        /*
         * Wait for any outstanding async I/O to complete.
         */
-       if (spa->spa_async_zio_root != NULL) {
-               (void) zio_wait(spa->spa_async_zio_root);
-               spa->spa_async_zio_root = NULL;
+       for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+               if (spa->spa_async_zio_root[i] != NULL) {
+                       (void) zio_wait(spa->spa_async_zio_root[i]);
+                       spa->spa_async_zio_root[i] = NULL;
+               }
        }
 
        bpobj_close(&spa->spa_deferred_bpobj);
@@ -2139,7 +2141,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvli
        uberblock_t *ub = &spa->spa_uberblock;
        uint64_t children, config_cache_txg = spa->spa_config_txg;
        int orig_mode = spa->spa_mode;
-       int parse;
+       int i, parse;
        uint64_t obj;
        boolean_t missing_feat_write = B_FALSE;
 
@@ -2163,8 +2165,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvli
        /*
         * Create "The Godfather" zio to hold all async IOs
         */
-       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+       for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+               spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+                   ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+                   ZIO_FLAG_GODFATHER);
+       }
 
        /*
         * Parse the configuration into a vdev tree.  We explicitly set the
@@ -3464,7 +3469,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvl
        vdev_t *rvd;
        dsl_pool_t *dp;
        dmu_tx_t *tx;
-       int error = 0;
+       int error = 0, i;
        uint64_t txg = TXG_INITIAL;
        nvlist_t **spares, **l2cache;
        uint_t nspares, nl2cache;
@@ -3516,8 +3521,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvl
        /*
         * Create "The Godfather" zio to hold all async IOs
         */
-       spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+       for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+               spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+                   ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+                   ZIO_FLAG_GODFATHER);
+       }
 
        /*
         * Create the root vdev.
Index: sys/spa_impl.h
===================================================================
--- sys/spa_impl.h      (revision 267049)
+++ sys/spa_impl.h      (working copy)
@@ -114,6 +114,8 @@ typedef struct spa_taskqs {
        taskq_t **stqs_taskq;
 } spa_taskqs_t;
 
+#define SPA_ASYNC_ROOTS        16
+
 struct spa {
        /*
         * Fields protected by spa_namespace_lock.
@@ -205,7 +207,9 @@ struct spa {
        uint64_t        spa_failmode;           /* failure mode for the pool */
        uint64_t        spa_delegation;         /* delegation on/off */
        list_t          spa_config_list;        /* previous cache file(s) */
-       zio_t           *spa_async_zio_root;    /* root of all async I/O */
+       int             spa_async_zio_root_last; /* last used async root */
+       zio_t           *spa_async_zio_root[SPA_ASYNC_ROOTS];
+                                               /* roots of all async I/O */
        zio_t           *spa_suspend_zio_root;  /* root of all suspended I/O */
        kmutex_t        spa_suspend_lock;       /* protects suspend_zio_root */
        kcondvar_t      spa_suspend_cv;         /* notification of resume */
Index: zio.c
===================================================================
--- zio.c       (revision 267049)
+++ zio.c       (working copy)
@@ -1404,8 +1404,11 @@ zio_nowait(zio_t *zio)
                 * will ensure they complete prior to unloading the pool.
                 */
                spa_t *spa = zio->io_spa;
+               int last;
 
-               zio_add_child(spa->spa_async_zio_root, zio);
+               last = atomic_add_int_nv(&spa->spa_async_zio_root_last, 1);
+               last &= (SPA_ASYNC_ROOTS - 1);
+               zio_add_child(spa->spa_async_zio_root[last], zio);
        }
 
        zio_execute(zio);
_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer

Reply via email to