Author: avg
Date: Mon Jul  6 10:41:29 2015
New Revision: 285203
URL: https://svnweb.freebsd.org/changeset/base/285203

Log:
  MFC r284593: MFV r284412: 5911 ZFS "hangs" while deleting file
  
  illumos/illumos-gate@46e1baa6cf6d5432f5fd231bb588df8f9570c858
  https://www.illumos.org/issues/5911
  Sometimes ZFS appears to hang while deleting a file. It is actually
  making slow progress at the file deletion, but other operations
  (administrative and writes via the data path) "hang" until the file
  removal completes, which can take a long time if the file has many
  blocks. The deletion (or most of it) happens in a single txg, and the
  sync thread spends most of its time reading indirect blocks...
  
  Reviewed by: Bayard Bell <buffer.g.overf...@gmail.com>
  Reviewed by: Alek Pinchuk <a...@nexenta.com>
  Reviewed by: Simon Klinkert <simon.klink...@gmail.com>
  Reviewed by: Dan McDonald <dan...@omniti.com>
  Approved by: Richard Lowe <richl...@richlowe.net>
  Author: Matthew Ahrens <mahr...@delphix.com>
  
  PR:   199775

Modified:
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Mon Jul 
 6 10:40:51 2015        (r285202)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Mon Jul 
 6 10:41:29 2015        (r285203)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -1294,6 +1294,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
        dbuf_dirty_record_t *dr, **drp;
 
        ASSERT(txg != 0);
+
+       /*
+        * Due to our use of dn_nlevels below, this can only be called
+        * in open context, unless we are operating on the MOS.
+        * From syncing context, dn_nlevels may be different from the
+        * dn_nlevels used when dbuf was dirtied.
+        */
+       ASSERT(db->db_objset ==
+           dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+           txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
        ASSERT0(db->db_level);
        ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1316,11 +1326,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 
        ASSERT(db->db.db_size != 0);
 
-       /*
-        * Any space we accounted for in dp_dirty_* will be cleaned up by
-        * dsl_pool_sync().  This is relatively rare so the discrepancy
-        * is not a big deal.
-        */
+       dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+           dr->dr_accounted, txg);
 
        *drp = dr->dr_next;
 
@@ -1335,7 +1342,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
                list_remove(&dr->dr_parent->dt.di.dr_children, dr);
                mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
        } else if (db->db_blkid == DMU_SPILL_BLKID ||
-           db->db_level+1 == dn->dn_nlevels) {
+           db->db_level + 1 == dn->dn_nlevels) {
                ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
                mutex_enter(&dn->dn_mtx);
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1352,11 +1359,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
                        VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
        }
 
-       if (db->db_level != 0) {
-               mutex_destroy(&dr->dt.di.dr_mtx);
-               list_destroy(&dr->dt.di.dr_children);
-       }
-
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
        ASSERT(db->db_dirtycnt > 0);
@@ -2277,7 +2279,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
 
        zio = dr->dr_zio;
        mutex_enter(&dr->dt.di.dr_mtx);
-       dbuf_sync_list(&dr->dt.di.dr_children, tx);
+       dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
        ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
        mutex_exit(&dr->dt.di.dr_mtx);
        zio_nowait(zio);
@@ -2423,7 +2425,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
        dbuf_dirty_record_t *dr;
 
@@ -2440,6 +2442,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t
                            DMU_META_DNODE_OBJECT);
                        break;
                }
+               if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+                   dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+                       VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+               }
                list_remove(list, dr);
                if (dr->dr_dbuf->db_level > 0)
                        dbuf_sync_indirect(dr, tx);

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c    Mon Jul 
 6 10:40:51 2015        (r285202)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c    Mon Jul 
 6 10:41:29 2015        (r285203)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -652,7 +652,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
                        uint64_t ibyte = i << shift;
                        err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
                        i = ibyte >> shift;
-                       if (err == ESRCH)
+                       if (err == ESRCH || i > end)
                                break;
                        if (err) {
                                tx->tx_err = err;

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c     Mon Jul 
 6 10:40:51 2015        (r285202)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c     Mon Jul 
 6 10:41:29 2015        (r285203)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1455,6 +1455,16 @@ out:
                rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       if (db != NULL) {
+               dmu_buf_will_dirty(&db->db, tx);
+               dbuf_rele(db, FTAG);
+       }
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1575,27 +1585,67 @@ dnode_free_range(dnode_t *dn, uint64_t o
                nblks += 1;
 
        /*
-        * Dirty the first and last indirect blocks, as they (and/or their
-        * parents) will need to be written out if they were only
-        * partially freed.  Interior indirect blocks will be themselves freed,
-        * by free_children(), so they need not be dirtied.  Note that these
-        * interior blocks have already been prefetched by dmu_tx_hold_free().
+        * Dirty all the indirect blocks in this range.  Note that only
+        * the first and last indirect blocks can actually be written
+        * (if they were partially freed) -- they must be dirtied, even if
+        * they do not exist on disk yet.  The interior blocks will
+        * be freed by free_children(), so they will not actually be written.
+        * Even though these interior blocks will not be written, we
+        * dirty them for two reasons:
+        *
+        *  - It ensures that the indirect blocks remain in memory until
+        *    syncing context.  (They have already been prefetched by
+        *    dmu_tx_hold_free(), so we don't have to worry about reading
+        *    them serially here.)
+        *
+        *  - The dirty space accounting will put pressure on the txg sync
+        *    mechanism to begin syncing, and to delay transactions if there
+        *    is a large amount of freeing.  Even though these indirect
+        *    blocks will not be written, we could need to write the same
+        *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
                uint64_t first, last;
 
                first = blkid >> epbs;
-               if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
-               }
+               dnode_dirty_l1(dn, first, tx);
                if (trunc)
                        last = dn->dn_maxblkid >> epbs;
                else
                        last = (blkid + nblks - 1) >> epbs;
-               if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
+               if (last != first)
+                       dnode_dirty_l1(dn, last, tx);
+
+               int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+                   SPA_BLKPTRSHIFT;
+               for (uint64_t i = first + 1; i < last; i++) {
+                       /*
+                        * Set i to the blockid of the next non-hole
+                        * level-1 indirect block at or after i.  Note
+                        * that dnode_next_offset() operates in terms of
+                        * level-0-equivalent bytes.
+                        */
+                       uint64_t ibyte = i << shift;
+                       int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+                           &ibyte, 2, 1, 0);
+                       i = ibyte >> shift;
+                       if (i >= last)
+                               break;
+
+                       /*
+                        * Normally we should not see an error, either
+                        * from dnode_next_offset() or dbuf_hold_level()
+                        * (except for ESRCH from dnode_next_offset).
+                        * If there is an i/o error, then when we read
+                        * this block in syncing context, it will use
+                        * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+                        * to the "failmode" property.  dnode_next_offset()
+                        * doesn't have a flag to indicate MUSTSUCCEED.
+                        */
+                       if (err != 0)
+                               break;
+
+                       dnode_dirty_l1(dn, i, tx);
                }
        }
 

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c        
Mon Jul  6 10:40:51 2015        (r285202)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c        
Mon Jul  6 10:41:29 2015        (r285203)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -717,7 +717,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                mutex_exit(&dn->dn_mtx);
        }
 
-       dbuf_sync_list(list, tx);
+       dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
                ASSERT3P(list_head(list), ==, NULL);

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  Mon Jul 
 6 10:40:51 2015        (r285202)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  Mon Jul 
 6 10:41:29 2015        (r285203)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -280,7 +280,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to