Author: mav
Date: Thu Feb 22 03:22:27 2018
New Revision: 329799
URL: https://svnweb.freebsd.org/changeset/base/329799

Log:
  9079 race condition in starting and ending condesing thread for indirect vdevs
  
  illumos/illumos-gate@667ec66f1b4f491d5e839644e0912cad1c9e7122
  
  The timeline of the race condition is the following:
  [1] Thread A is about to finish condesing the first vdev in 
spa_condense_indirect_thread(),
  so it calls the spa_condense_indirect_complete_sync() sync task which sets the
  spa_condensing_indirect field to NULL. Waiting for the sync task to finish, 
thread A
  sleeps until the txg is done. When this happens, thread A will acquire 
spa_async_lock
  and set spa_condense_thread to NULL.
  [2] While thread A waits for the txg to finish, thread B which is running 
spa_sync() checks
  whether it should condense the second vdev in vdev_indirect_should_condense() 
by checking
  the spa_condensing_indirect field which was set to NULL by 
spa_condense_indirect_thread()
  from thread A. So it goes on and tries to spawn a new condensing thread in
  spa_condense_indirect_start_sync() and the aforementioned assertions fails 
because thread A
  has not set spa_condense_thread to NULL (which is basically the last thing it 
does before
  returning).
  
  The main issue here is that we rely on both spa_condensing_indirect and 
spa_condense_thread to
  signify whether a condensing thread is running. Ideally we would only use one 
throughout the
  codebase. In addition, for managing spa_condense_thread we currently use 
spa_async_lock which
  basically tights condensing to scrubing when it comes to pausing and resuming 
those actions
  during spa export.
  
  Reviewed by: Matt Ahrens <mahr...@delphix.com>
  Reviewed by: Pavel Zakharov <pavel.zakha...@delphix.com>
  Approved by: Hans Rosenfeld <rosenf...@grumpf.hope-2000.org>
  Author: Serapheim Dimitropoulos <seraph...@delphix.com>

Modified:
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c

Modified: vendor-sys/illumos/dist/uts/common/Makefile.files
==============================================================================
--- vendor-sys/illumos/dist/uts/common/Makefile.files   Thu Feb 22 03:15:35 
2018        (r329798)
+++ vendor-sys/illumos/dist/uts/common/Makefile.files   Thu Feb 22 03:22:27 
2018        (r329799)
@@ -1421,7 +1421,8 @@ ZFS_COMMON_OBJS +=                \
        zio_compress.o          \
        zio_inject.o            \
        zle.o                   \
-       zrlock.o
+       zrlock.o                \
+       zthr.o
 
 ZFS_SHARED_OBJS +=             \
        zfeature_common.o       \

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c     Thu Feb 22 03:15:35 
2018        (r329798)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c     Thu Feb 22 03:22:27 
2018        (r329799)
@@ -1338,6 +1338,12 @@ spa_unload(spa_t *spa)
                spa->spa_vdev_removal = NULL;
        }
 
+       if (spa->spa_condense_zthr != NULL) {
+               ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
+               zthr_destroy(spa->spa_condense_zthr);
+               spa->spa_condense_zthr = NULL;
+       }
+
        spa_condense_fini(spa);
 
        bpobj_close(&spa->spa_deferred_bpobj);
@@ -2079,6 +2085,16 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
        return (SET_ERROR(err));
 }
 
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+       ASSERT(spa_writeable(spa));
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       spa_start_indirect_condensing_thread(spa);
+}
+
 /*
  * Fix up config after a partly-completed split.  This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
@@ -3486,18 +3502,6 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char
                ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
                /*
-                * We must check this before we start the sync thread, because
-                * we only want to start a condense thread for condense
-                * operations that were in progress when the pool was
-                * imported.  Once we start syncing, spa_sync() could
-                * initiate a condense (and start a thread for it).  In
-                * that case it would be wrong to start a second
-                * condense thread.
-                */
-               boolean_t condense_in_progress =
-                   (spa->spa_condensing_indirect != NULL);
-
-               /*
                 * Traverse the ZIL and claim all blocks.
                 */
                spa_ld_claim_log_blocks(spa);
@@ -3549,15 +3553,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char
                 */
                dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
-               /*
-                * Note: unlike condensing, we don't need an analogous
-                * "removal_in_progress" dance because no other thread
-                * can start a removal while we hold the spa_namespace_lock.
-                */
                spa_restart_removal(spa);
 
-               if (condense_in_progress)
-                       spa_condense_indirect_restart(spa);
+               spa_spawn_aux_threads(spa);
        }
 
        spa_load_note(spa, "LOADED");
@@ -4466,6 +4464,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_
         */
        txg_wait_synced(spa->spa_dsl_pool, txg);
 
+       spa_spawn_aux_threads(spa);
+
        spa_write_cachefile(spa, B_FALSE, B_TRUE);
        spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 
@@ -6405,12 +6405,15 @@ spa_async_suspend(spa_t *spa)
 {
        mutex_enter(&spa->spa_async_lock);
        spa->spa_async_suspended++;
-       while (spa->spa_async_thread != NULL ||
-           spa->spa_condense_thread != NULL)
+       while (spa->spa_async_thread != NULL)
                cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
        mutex_exit(&spa->spa_async_lock);
 
        spa_vdev_remove_suspend(spa);
+
+       zthr_t *condense_thread = spa->spa_condense_zthr;
+       if (condense_thread != NULL && zthr_isrunning(condense_thread))
+               VERIFY0(zthr_cancel(condense_thread));
 }
 
 void
@@ -6421,6 +6424,10 @@ spa_async_resume(spa_t *spa)
        spa->spa_async_suspended--;
        mutex_exit(&spa->spa_async_lock);
        spa_restart_removal(spa);
+
+       zthr_t *condense_thread = spa->spa_condense_zthr;
+       if (condense_thread != NULL && !zthr_isrunning(condense_thread))
+               zthr_resume(condense_thread);
 }
 
 static boolean_t

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h    Thu Feb 22 
03:15:35 2018        (r329798)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h    Thu Feb 22 
03:22:27 2018        (r329799)
@@ -43,6 +43,7 @@
 #include <sys/bplist.h>
 #include <sys/bpobj.h>
 #include <sys/zfeature.h>
+#include <sys/zthr.h>
 #include <zfeature_common.h>
 
 #ifdef __cplusplus
@@ -278,7 +279,7 @@ struct spa {
 
        spa_condensing_indirect_phys_t  spa_condensing_indirect_phys;
        spa_condensing_indirect_t       *spa_condensing_indirect;
-       kthread_t       *spa_condense_thread;   /* thread doing condense. */
+       zthr_t          *spa_condense_zthr;     /* zthr doing condense. */
 
        char            *spa_root;              /* alternate root directory */
        uint64_t        spa_ena;                /* spa-wide ereport ENA */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h        Thu Feb 
22 03:15:35 2018        (r329798)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_removal.h        Thu Feb 
22 03:22:27 2018        (r329799)
@@ -76,7 +76,7 @@ extern int spa_remove_init(spa_t *);
 extern void spa_restart_removal(spa_t *);
 extern int spa_condense_init(spa_t *);
 extern void spa_condense_fini(spa_t *);
-extern void spa_condense_indirect_restart(spa_t *);
+extern void spa_start_indirect_condensing_thread(spa_t *);
 extern void spa_vdev_condense_suspend(spa_t *);
 extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t);
 extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t, uint64_t);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c   Thu Feb 22 
03:15:35 2018        (r329798)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c   Thu Feb 22 
03:22:27 2018        (r329799)
@@ -30,6 +30,8 @@
 #include <sys/dmu_tx.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
+#include <sys/abd.h>
+#include <sys/zthr.h>
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed.  Since
@@ -475,7 +477,7 @@ spa_condense_indirect_commit_entry(spa_t *spa,
 
 static void
 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
-    uint32_t *obsolete_counts, uint64_t start_index)
+    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
 {
        spa_t *spa = vd->vdev_spa;
        uint64_t mapi = start_index;
@@ -490,7 +492,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
            (u_longlong_t)vd->vdev_id,
            (u_longlong_t)mapi);
 
-       while (mapi < old_num_entries && !spa_shutting_down(spa)) {
+       while (mapi < old_num_entries) {
+
+               if (zthr_iscancelled(zthr)) {
+                       zfs_dbgmsg("pausing condense of vdev %llu "
+                           "at index %llu", (u_longlong_t)vd->vdev_id,
+                           (u_longlong_t)mapi);
+                       break;
+               }
+
                vdev_indirect_mapping_entry_phys_t *entry =
                    &old_mapping->vim_entries[mapi];
                uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
@@ -508,18 +518,30 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
 
                mapi++;
        }
-       if (spa_shutting_down(spa)) {
-               zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
-                   (u_longlong_t)vd->vdev_id,
-                   (u_longlong_t)mapi);
-       }
 }
 
-static void
-spa_condense_indirect_thread(void *arg)
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
 {
-       vdev_t *vd = arg;
-       spa_t *spa = vd->vdev_spa;
+       spa_t *spa = arg;
+
+       return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static int
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+       spa_t *spa = arg;
+       vdev_t *vd;
+
+       ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+       vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+       ASSERT3P(vd, !=, NULL);
+       spa_config_exit(spa, SCL_VDEV, FTAG);
+
        spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
        spa_condensing_indirect_phys_t *scip =
            &spa->spa_condensing_indirect_phys;
@@ -593,25 +615,24 @@ spa_condense_indirect_thread(void *arg)
                }
        }
 
-       spa_condense_indirect_generate_new_mapping(vd, counts, start_index);
+       spa_condense_indirect_generate_new_mapping(vd, counts,
+           start_index, zthr);
 
        vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
 
        /*
-        * We may have bailed early from generate_new_mapping(), if
-        * the spa is shutting down.  In this case, do not complete
-        * the condense.
+        * If the zthr has received a cancellation signal while running
+        * in generate_new_mapping() or at any point after that, then bail
+        * early. We don't want to complete the condense if the spa is
+        * shutting down.
         */
-       if (!spa_shutting_down(spa)) {
-               VERIFY0(dsl_sync_task(spa_name(spa), NULL,
-                   spa_condense_indirect_complete_sync, sci, 0,
-                   ZFS_SPACE_CHECK_NONE));
-       }
+       if (zthr_iscancelled(zthr))
+               return (0);
 
-       mutex_enter(&spa->spa_async_lock);
-       spa->spa_condense_thread = NULL;
-       cv_broadcast(&spa->spa_async_cv);
-       mutex_exit(&spa->spa_async_lock);
+       VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+           spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));
+
+       return (0);
 }
 
 /*
@@ -664,9 +685,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t 
            (u_longlong_t)scip->scip_prev_obsolete_sm_object,
            (u_longlong_t)scip->scip_next_mapping_object);
 
-       ASSERT3P(spa->spa_condense_thread, ==, NULL);
-       spa->spa_condense_thread = thread_create(NULL, 0,
-           spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+       zthr_wakeup(spa->spa_condense_zthr);
 }
 
 /*
@@ -743,24 +762,12 @@ spa_condense_fini(spa_t *spa)
        }
 }
 
-/*
- * Restart the condense - called when the pool is opened.
- */
 void
-spa_condense_indirect_restart(spa_t *spa)
+spa_start_indirect_condensing_thread(spa_t *spa)
 {
-       vdev_t *vd;
-       ASSERT(spa->spa_condensing_indirect != NULL);
-       spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-       vd = vdev_lookup_top(spa,
-           spa->spa_condensing_indirect_phys.scip_vdev);
-       ASSERT(vd != NULL);
-       spa_config_exit(spa, SCL_VDEV, FTAG);
-
-       ASSERT3P(spa->spa_condense_thread, ==, NULL);
-       spa->spa_condense_thread = thread_create(NULL, 0,
-           spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
-           minclsyspri);
+       ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+       spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
+           spa_condense_indirect_thread, spa);
 }
 
 /*
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to