Hi.
While doing high-IOPS ZFS benchmarking and profiling on 40-core FreeBSD
system with bunch of SSDs I noticed significant lock congestion spinning
on spa->spa_async_zio_root ZIO lock. As I found, it was caused by
multiple concurrently starting and completing prefetch ZIOs, fighting
for the same lock. To mitigate the problem I've replaced the single
async root ZIO with 16, as you may see in attached patch.
As result, on concurrent strided read with 4K block and 4K recordsize
from 256 threads (iozone -i 5 -w -r 4k -t 256 -s 256M) I've practically
doubled the test result, rising from 150K IOPS originally to 300K IOPS
with the patch, while total CPU load reduced from 100% to ~60%.
So I would like to know people opinion about this patch. Won't having
multiple ZIOs and calling multiple zio_wait()'s cause any problems on
pool export?
Thank you.
--
Alexander Motin
Index: spa.c
===================================================================
--- spa.c (revision 267049)
+++ spa.c (working copy)
@@ -1129,7 +1129,7 @@ spa_deactivate(spa_t *spa)
ASSERT(spa->spa_sync_on == B_FALSE);
ASSERT(spa->spa_dsl_pool == NULL);
ASSERT(spa->spa_root_vdev == NULL);
- ASSERT(spa->spa_async_zio_root == NULL);
+ ASSERT(spa->spa_async_zio_root[0] == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
/*
@@ -1272,9 +1272,11 @@ spa_unload(spa_t *spa)
/*
* Wait for any outstanding async I/O to complete.
*/
- if (spa->spa_async_zio_root != NULL) {
- (void) zio_wait(spa->spa_async_zio_root);
- spa->spa_async_zio_root = NULL;
+ for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+ if (spa->spa_async_zio_root[i] != NULL) {
+ (void) zio_wait(spa->spa_async_zio_root[i]);
+ spa->spa_async_zio_root[i] = NULL;
+ }
}
bpobj_close(&spa->spa_deferred_bpobj);
@@ -2139,7 +2141,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvli
uberblock_t *ub = &spa->spa_uberblock;
uint64_t children, config_cache_txg = spa->spa_config_txg;
int orig_mode = spa->spa_mode;
- int parse;
+ int i, parse;
uint64_t obj;
boolean_t missing_feat_write = B_FALSE;
@@ -2163,8 +2165,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvli
/*
* Create "The Godfather" zio to hold all async IOs
*/
- spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
/*
* Parse the configuration into a vdev tree. We explicitly set the
@@ -3464,7 +3469,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvl
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
- int error = 0;
+ int error = 0, i;
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
@@ -3516,8 +3521,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvl
/*
* Create "The Godfather" zio to hold all async IOs
*/
- spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ for (i = 0; i < SPA_ASYNC_ROOTS; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
/*
* Create the root vdev.
Index: sys/spa_impl.h
===================================================================
--- sys/spa_impl.h (revision 267049)
+++ sys/spa_impl.h (working copy)
@@ -114,6 +114,8 @@ typedef struct spa_taskqs {
taskq_t **stqs_taskq;
} spa_taskqs_t;
+#define SPA_ASYNC_ROOTS 16
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -205,7 +207,9 @@ struct spa {
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
- zio_t *spa_async_zio_root; /* root of all async I/O */
+ int spa_async_zio_root_last; /* last used async root */
+ zio_t *spa_async_zio_root[SPA_ASYNC_ROOTS];
+ /* roots of all async I/O */
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
Index: zio.c
===================================================================
--- zio.c (revision 267049)
+++ zio.c (working copy)
@@ -1404,8 +1404,11 @@ zio_nowait(zio_t *zio)
* will ensure they complete prior to unloading the pool.
*/
spa_t *spa = zio->io_spa;
+ int last;
- zio_add_child(spa->spa_async_zio_root, zio);
+ last = atomic_add_int_nv(&spa->spa_async_zio_root_last, 1);
+ last &= (SPA_ASYNC_ROOTS - 1);
+ zio_add_child(spa->spa_async_zio_root[last], zio);
}
zio_execute(zio);
_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer