On 05/01/2015 03:53 PM, Arne Jansen wrote:
> 
> So how do we continue? Can we ping some FreeBSD folks?
> Or just mark the patch as potentially dangerous in the
> commit message?
> I can of course build the refcnt/cv mechanism, should
> we choose this path.

As no one stepped up to tell me to implement the refcnt, I chose the
easier path and put up a big warning sign in both the commit message
and the code.

Attached is the revised patch and the diff to the previous one. I also
updated the webrev at

http://cr.illumos.org/~webrev/sensille/5269_zpool_import_slow/

Dan, George: Thanks for the additional reviews! Is it ok to list you
as reviewers?

-Arne
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 0d51da9..20ff685 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -1629,7 +1629,7 @@ dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
-	int err;
+	int err = 0;
 
 	/* don't process if there already was an error */
 	if (*dcp->dc_error)
@@ -1637,7 +1637,7 @@ dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 
 	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
 	if (err != 0)
-		goto fail;
+		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
@@ -1709,19 +1709,19 @@ dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0)
-		goto fail;
+		goto out;
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 	if (err != 0)
-		goto fail;
+		goto out;
 	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
 
-fail:
-	if (err) {
+out:
+	if (err != 0) {
 		mutex_enter(dcp->dc_error_lock);
 		/* only keep first error */
 		if (*dcp->dc_error == 0)
@@ -1729,7 +1729,6 @@ fail:
 		mutex_exit(dcp->dc_error_lock);
 	}
 
-out:
 	kmem_free(dcp, sizeof(*dcp));
 }
 
@@ -1803,6 +1802,15 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
 	/* dcp will be freed by task */
 	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
 
+	/*
+	 * PORTING: this code relies on the property of taskq_wait to wait
+	 * until no more tasks are queued and no more tasks are active. As
+	 * we always queue new tasks from within other tasks, task_wait
+	 * reliably waits for the full recursion to finish, even though we
+	 * enqueue new tasks after taskq_wait has been called.
+	 * On platforms other than illumos, taskq_wait may not have this
+	 * property.
+	 */
 	taskq_wait(tq);
 	taskq_destroy(tq);
 	mutex_destroy(&err_lock);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index 985ab27..9e98350 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -142,8 +142,8 @@ struct objset {
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
-int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
-    boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+    dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 33a152f..d3fe6e9 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -32,13 +32,14 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+struct dsl_pool;
+struct dsl_dataset;
+
 /*
  * Intent log format:
  *
@@ -406,9 +407,10 @@ extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
-extern int	zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg);
-extern int 	zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds,
-    void *tx);
+extern int	zil_claim(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *txarg);
+extern int 	zil_check_log_chain(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 7ef66bf..5099038 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -181,10 +181,9 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
-	vdev_t *mvd;
 	int n = 0;
 
-	if (vd->vdev_children == 0)
+	if (vd->vdev_ops->vdev_op_leaf)
 		return (1);
 
 	for (int c = 0; c < vd->vdev_children; c++)
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index b500e32..38b57e0 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -692,6 +692,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
+/* ARGSUSED */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
>From 657681ab7b9db667e14a61adf80e3e16068c36d0 Mon Sep 17 00:00:00 2001
From: Arne Jansen <[email protected]>
Date: Tue, 28 Oct 2014 09:37:04 +0100
Subject: [PATCH] 5269 zfs: zpool import slow

While importing a pool all objsets are enumerated twice, once to check
the zil log chains and once to claim them. On pools with many datasets
this process might take a substantial amount of time.
Speed up the process by parallelizing it utilizing a taskq. The number
of parallel tasks is limited to 4 times the number of leaf vdevs.

PORTING: this code relies on the property of taskq_wait to wait
until no more tasks are queued and no more tasks are active. As
we always queue new tasks from within other tasks, task_wait
reliably waits for the full recursion to finish, even though we
enqueue new tasks after taskq_wait has been called.
On platforms other than illumos, taskq_wait may not have this
property.

Reviewed by: Matthew Ahrens <[email protected]>
---
 usr/src/uts/common/fs/zfs/dmu_objset.c     |  211 ++++++++++++++++++++++-----
 usr/src/uts/common/fs/zfs/dsl_pool.c       |   10 +-
 usr/src/uts/common/fs/zfs/spa.c            |   11 +-
 usr/src/uts/common/fs/zfs/sys/dmu.h        |    1 +
 usr/src/uts/common/fs/zfs/sys/dmu_objset.h |    2 +
 usr/src/uts/common/fs/zfs/sys/dsl_pool.h   |    1 +
 usr/src/uts/common/fs/zfs/sys/vdev.h       |    1 +
 usr/src/uts/common/fs/zfs/sys/zil.h        |    9 +-
 usr/src/uts/common/fs/zfs/vdev.c           |   20 +++
 usr/src/uts/common/fs/zfs/zil.c            |   23 ++--
 10 files changed, 229 insertions(+), 60 deletions(-)

diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 88edcda..20ff685 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -49,6 +50,7 @@
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
@@ -56,6 +58,8 @@
  */
 krwlock_t os_lock;
 
+static void dmu_objset_find_dp_cb(void *arg);
+
 void
 dmu_objset_init(void)
 {
@@ -504,6 +508,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 	return (err);
 }
 
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	int err;
+
+	err = dmu_objset_from_ds(ds, osp);
+	if (err != 0) {
+		dsl_dataset_disown(ds, tag);
+	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EINVAL));
+	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EROFS));
+	}
+	return (err);
+}
+
 /*
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
@@ -525,21 +548,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
-
-	err = dmu_objset_from_ds(ds, osp);
+	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
 	dsl_pool_rele(dp, FTAG);
-	if (err != 0) {
-		dsl_dataset_disown(ds, tag);
-	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-		dsl_dataset_disown(ds, tag);
-		return (SET_ERROR(EINVAL));
-	} else if (!readonly && ds->ds_is_snapshot) {
-		dsl_dataset_disown(ds, tag);
-		return (SET_ERROR(EROFS));
-	}
+
 	return (err);
 }
 
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+	if (err != 0)
+		return (err);
+
+	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
@@ -1580,30 +1608,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
 	return (0);
 }
 
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
-    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {
+	taskq_t		*dc_tq;
+	dsl_pool_t	*dc_dp;
+	uint64_t	dc_ddobj;
+	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+	void		*dc_arg;
+	int		dc_flags;
+	kmutex_t	*dc_error_lock;
+	int		*dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
+	dsl_pool_t *dp = dcp->dc_dp;
+	dmu_objset_find_ctx_t *child_dcp;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
-	int err;
+	int err = 0;
 
-	ASSERT(dsl_pool_config_held(dp));
+	/* don't process if there already was an error */
+	if (*dcp->dc_error)
+		goto out;
 
-	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
 	if (err != 0)
-		return (err);
+		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
-		return (0);
+		goto out;
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
@@ -1612,7 +1651,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
 	/*
 	 * Iterate over all children.
 	 */
-	if (flags & DS_FIND_CHILDREN) {
+	if (dcp->dc_flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
@@ -1621,24 +1660,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
-			err = dmu_objset_find_dp(dp, attr->za_first_integer,
-			    func, arg, flags);
-			if (err != 0)
-				break;
+			child_dcp = kmem_alloc(sizeof(*child_dcp), KM_SLEEP);
+			*child_dcp = *dcp;
+			child_dcp->dc_ddobj = attr->za_first_integer;
+			if (dcp->dc_tq)
+				(void) taskq_dispatch(dcp->dc_tq,
+				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+			else
+				dmu_objset_find_dp_impl(child_dcp);
 		}
 		zap_cursor_fini(&zc);
-
-		if (err != 0) {
-			dsl_dir_rele(dd, FTAG);
-			kmem_free(attr, sizeof (zap_attribute_t));
-			return (err);
-		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
-	if (flags & DS_FIND_SNAPSHOTS) {
+	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
 		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
@@ -1659,7 +1696,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
-				err = func(dp, ds, arg);
+				err = dcp->dc_func(dp, ds, dcp->dc_arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
@@ -1672,17 +1709,113 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0)
-		return (err);
+		goto out;
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 	if (err != 0)
-		return (err);
-	err = func(dp, ds, arg);
+		goto out;
+	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
-	return (err);
+
+out:
+	if (err != 0) {
+		mutex_enter(dcp->dc_error_lock);
+		/* only keep first error */
+		if (*dcp->dc_error == 0)
+			*dcp->dc_error = err;
+		mutex_exit(dcp->dc_error_lock);
+	}
+
+	kmem_free(dcp, sizeof(*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+	dmu_objset_find_ctx_t *dcp = arg;
+	dsl_pool_t *dp = dcp->dc_dp;
+
+	dsl_pool_config_enter(dp, FTAG);
+
+	dmu_objset_find_dp_impl(dcp);
+
+	dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+	int error = 0;
+	taskq_t *tq = NULL;
+	int ntasks;
+	dmu_objset_find_ctx_t *dcp;
+	kmutex_t err_lock;
+
+	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+	dcp = kmem_alloc(sizeof(*dcp), KM_SLEEP);
+	dcp->dc_tq = NULL;
+	dcp->dc_dp = dp;
+	dcp->dc_ddobj = ddobj;
+	dcp->dc_func = func;
+	dcp->dc_arg = arg;
+	dcp->dc_flags = flags;
+	dcp->dc_error_lock = &err_lock;
+	dcp->dc_error = &error;
+
+	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+		/*
+		 * In case a write lock is held we can't make use of
+		 * parallelism, as down the stack of the worker threads
+		 * the lock is asserted via dsl_pool_config_held.
+		 * In case of a read lock this is solved by getting a read
+		 * lock in each worker thread, which isn't possible in case
+		 * of a writer lock. So we fall back to the synchronous path
+		 * here.
+		 * In the future it might be possible to get some magic into
+		 * dsl_pool_config_held in a way that it returns true for
+		 * the worker threads so that a single lock held from this
+		 * thread suffices. For now, stay single threaded.
+		 */
+		dmu_objset_find_dp_impl(dcp);
+
+		return (error);
+	}
+
+	ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+	    INT_MAX, 0);
+	if (!tq) {
+		kmem_free(dcp, sizeof(*dcp));
+		return (SET_ERROR(ENOMEM));
+	}
+	dcp->dc_tq = tq;
+
+	/* dcp will be freed by task */
+	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+	/*
+	 * PORTING: this code relies on the property of taskq_wait to wait
+	 * until no more tasks are queued and no more tasks are active. As
+	 * we always queue new tasks from within other tasks, task_wait
+	 * reliably waits for the full recursion to finish, even though we
+	 * enqueue new tasks after taskq_wait has been called.
+	 * On platforms other than illumos, taskq_wait may not have this
+	 * property.
+	 */
+	taskq_wait(tq);
+	taskq_destroy(tq);
+	mutex_destroy(&err_lock);
+
+	return (error);
 }
 
 /*
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 2e7f798..8c5d820 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -758,7 +758,7 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
-	    tx, DS_FIND_CHILDREN));
+	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /* ARGSUSED */
@@ -812,7 +812,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 void
@@ -1056,3 +1056,9 @@ dsl_pool_config_held(dsl_pool_t *dp)
 {
 	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
 }
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index d9fc3e8..26cbc91 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -1738,13 +1738,14 @@ static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	boolean_t rv = B_FALSE;
+	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	switch (spa->spa_log_state) {
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
-		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
-		    NULL, DS_FIND_CHILDREN) != 0);
+		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 		if (rv)
 			spa_set_log_state(spa, SPA_LOG_MISSING);
 		break;
@@ -2715,6 +2716,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
+		dsl_pool_t *dp = spa_get_dsl(spa);
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
@@ -2727,9 +2729,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		 */
 		spa->spa_claiming = B_TRUE;
 
-		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
-		    spa_first_txg(spa));
-		(void) dmu_objset_find(spa_name(spa),
+		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index b26061b..9864129 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -243,6 +243,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
+#define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index e00847e..9e98350 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -142,6 +142,8 @@ struct objset {
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+    dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index a6fb201..0f6d175 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -154,6 +154,7 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
 void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
 boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
 boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 043ba6d..08ce54b 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -60,6 +60,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index 895d632..d3fe6e9 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -37,6 +37,9 @@
 extern "C" {
 #endif
 
+struct dsl_pool;
+struct dsl_dataset;
+
 /*
  * Intent log format:
  *
@@ -404,8 +407,10 @@ extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
-extern int	zil_claim(const char *osname, void *txarg);
-extern int	zil_check_log_chain(const char *osname, void *txarg);
+extern int	zil_claim(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *txarg);
+extern int 	zil_check_log_chain(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 9bc46ce..5099038 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -178,6 +178,26 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 	return (NULL);
 }
 
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+	int n = 0;
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+	return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+	return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index f48828d..38b57e0 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -627,7 +627,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 }
 
 int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -636,15 +636,16 @@ zil_claim(const char *osname, void *txarg)
 	objset_t *os;
 	int error;
 
-	error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+	error = dmu_objset_own_obj(dp, ds->ds_object,
+	    DMU_OST_ANY, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
-			cmn_err(CE_WARN, "can't open objset for %s, error %u",
-			    osname, error);
+			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+			    (unsigned long long)ds->ds_object, error);
 		}
 		return (0);
 	}
@@ -691,8 +692,9 @@ zil_claim(const char *osname, void *txarg)
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
+/* ARGSUSED */
 int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
@@ -701,9 +703,10 @@ zil_check_log_chain(const char *osname, void *tx)
 
 	ASSERT(tx == NULL);
 
-	error = dmu_objset_hold(osname, FTAG, &os);
+	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
-		cmn_err(CE_WARN, "can't open objset for %s", osname);
+		cmn_err(CE_WARN, "can't open objset %llu, error %d",
+		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
@@ -726,10 +729,8 @@ zil_check_log_chain(const char *osname, void *tx)
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
-		if (!valid) {
-			dmu_objset_rele(os, FTAG);
+		if (!valid)
 			return (0);
-		}
 	}
 
 	/*
@@ -742,8 +743,6 @@ zil_check_log_chain(const char *osname, void *tx)
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
 
-	dmu_objset_rele(os, FTAG);
-
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
-- 
1.7.7

_______________________________________________
developer mailing list
[email protected]
http://lists.open-zfs.org/mailman/listinfo/developer

Reply via email to