Author: avg
Date: Thu Aug 15 14:57:27 2019
New Revision: 351074
URL: https://svnweb.freebsd.org/changeset/base/351074

Log:
  MFV r350898: 8423 8199 7432 Implement large_dnode pool feature
  
  8423 8199 7432 Implement large_dnode pool feature
  
  8423 Implement large_dnode pool feature
  8199 multi-threaded dmu_object_alloc()
  7432 Large dnode pool feature
  
  illumos/illumos-gate@54811da5ac6b517992fdc173df5d605e4e61fdc0
  
https://github.com/illumos/illumos-gate/commit/54811da5ac6b517992fdc173df5d605e4e61fdc0
  https://www.illumos.org/issues/8423
  https://www.illumos.org/issues/8199
  https://www.illumos.org/issues/7432
  
    ZoL issues:
    Improved dnode allocation #6564
    Clean up large dnode code #6262
    Fix dnode_hold() freeing dnode behavior #8172
    Fix dnode allocation race #6414, #6439
    Partial: Raw sends must be able to decrease nlevels #6821, #6864
    Remove unnecessary txg syncs from receive_object() Closes #7197
  
  This updates FreeBSD large_dnode code (that was imported from ZoL) to a
  version that was committed to illumos.  It has some cleanups, improvements
  and fixes compared to what we have in FreeBSD now.  I think that the most
  significant update is 8199 multi-threaded dmu_object_alloc().
  
  Obtained from:        illumos
  MFC after:    3 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
  head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
  head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/cddl/contrib/opensolaris/cmd/zdb/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Aug 15 14:54:18 2019        
(r351073)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Aug 15 14:57:27 2019        
(r351074)
@@ -2134,7 +2134,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES 
 };
 
 static void
-dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
+    uint64_t *dnode_slots_used)
 {
        dmu_buf_t *db = NULL;
        dmu_object_info_t doi;
@@ -2154,7 +2155,7 @@ dump_object(objset_t *os, uint64_t object, int verbosi
        CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
 
        if (*print_header) {
-               (void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s %5s  %6s  %s\n",
+               (void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
                    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
                    "lsize", "%full", "type");
                *print_header = 0;
@@ -2173,6 +2174,9 @@ dump_object(objset_t *os, uint64_t object, int verbosi
        }
        dmu_object_info_from_dnode(dn, &doi);
 
+       if (dnode_slots_used != NULL)
+               *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
+
        zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
        zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
        zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
@@ -2195,8 +2199,9 @@ dump_object(objset_t *os, uint64_t object, int verbosi
                    ZDB_COMPRESS_NAME(doi.doi_compress));
        }
 
-       (void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
-           (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+       (void) printf("%10" PRIu64
+           "  %3u  %5s  %5s  %5s  %5s  %5s  %6s  %s%s\n",
+           object, doi.doi_indirection, iblk, dblk,
            asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
 
        if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
@@ -2305,6 +2310,9 @@ dump_dir(objset_t *os)
        int print_header = 1;
        unsigned i;
        int error;
+       uint64_t total_slots_used = 0;
+       uint64_t max_slot_used = 0;
+       uint64_t dnode_slots;
 
        /* make sure nicenum has enough space */
        CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
@@ -2349,7 +2357,7 @@ dump_dir(objset_t *os)
        if (zopt_objects != 0) {
                for (i = 0; i < zopt_objects; i++)
                        dump_object(os, zopt_object[i], verbosity,
-                           &print_header);
+                           &print_header, NULL);
                (void) printf("\n");
                return;
        }
@@ -2374,22 +2382,37 @@ dump_dir(objset_t *os)
        if (BP_IS_HOLE(os->os_rootbp))
                return;
 
-       dump_object(os, 0, verbosity, &print_header);
+       dump_object(os, 0, verbosity, &print_header, NULL);
        object_count = 0;
        if (DMU_USERUSED_DNODE(os) != NULL &&
            DMU_USERUSED_DNODE(os)->dn_type != 0) {
-               dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
-               dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
+               dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
+                   NULL);
+               dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
+                   NULL);
        }
 
        object = 0;
        while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
-               dump_object(os, object, verbosity, &print_header);
+               dump_object(os, object, verbosity, &print_header, &dnode_slots);
                object_count++;
+               total_slots_used += dnode_slots;
+               max_slot_used = object + dnode_slots - 1;
        }
 
        (void) printf("\n");
 
+       (void) printf("    Dnode slots:\n");
+       (void) printf("\tTotal used:    %10llu\n",
+           (u_longlong_t)total_slots_used);
+       (void) printf("\tMax used:      %10llu\n",
+           (u_longlong_t)max_slot_used);
+       (void) printf("\tPercent empty: %10lf\n",
+           (double)(max_slot_used - total_slots_used)*100 /
+           (double)max_slot_used);
+
+       (void) printf("\n");
+
        if (error != ESRCH) {
                (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
                abort();
@@ -2581,7 +2604,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
                        return (dump_path_impl(os, child_obj, s + 1));
                /*FALLTHROUGH*/
        case DMU_OT_PLAIN_FILE_CONTENTS:
-               dump_object(os, child_obj, dump_opt['v'], &header);
+               dump_object(os, child_obj, dump_opt['v'], &header, NULL);
                return (0);
        default:
                (void) fprintf(stderr, "object %llu has non-file/directory "

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c      Thu Aug 15 14:54:18 
2019        (r351073)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c      Thu Aug 15 14:57:27 
2019        (r351074)
@@ -84,15 +84,15 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, void *a
        }
 
        (void) printf("%s%s", tab_prefix, ctime(&crtime));
-       (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", 
tab_prefix,
-           (u_longlong_t)lr->lr_doid, 
-           (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
-           (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
-           (longlong_t)lr->lr_mode);
-       (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
-           tab_prefix,
-           (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
-           (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
+       (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64
+           ", mode %" PRIo64 "\n",
+           tab_prefix, lr->lr_doid,
+           (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid),
+           (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid),
+           lr->lr_mode);
+       (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64
+           ", rdev %#" PRIx64 "\n",
+           tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev);
 }
 
 /* ARGSUSED */

Modified: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c Thu Aug 15 
14:54:18 2019        (r351073)
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c Thu Aug 15 
14:57:27 2019        (r351074)
@@ -416,13 +416,15 @@ main(int argc, char *argv[])
                                drro->drr_toguid = BSWAP_64(drro->drr_toguid);
                        }
                        if (verbose) {
-                               (void) printf("OBJECT object = %llu type = %u "
-                                   "bonustype = %u blksz = %u bonuslen = %u\n",
-                                   (u_longlong_t)drro->drr_object,
+                               (void) printf("OBJECT object = %" PRIu64
+                                   " type = %u bonustype = %u blksz = %u"
+                                   " bonuslen = %u dn_slots = %u\n",
+                                   drro->drr_object,
                                    drro->drr_type,
                                    drro->drr_bonustype,
                                    drro->drr_blksz,
-                                   drro->drr_bonuslen);
+                                   drro->drr_bonuslen,
+                                   drro->drr_dn_slots);
                        }
                        if (drro->drr_bonuslen > 0) {
                                (void) ssread(buf,

Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c     Thu Aug 15 14:54:18 
2019        (r351073)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c     Thu Aug 15 14:57:27 
2019        (r351074)
@@ -196,6 +196,7 @@ extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern boolean_t zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
 extern boolean_t zfs_force_some_double_word_sm_entries;
 
 static ztest_shared_opts_t *ztest_shared_opts;
@@ -322,6 +323,7 @@ static ztest_shared_callstate_t *ztest_shared_callstat
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
@@ -363,6 +365,7 @@ ztest_info_t ztest_info[] = {
        { ztest_dmu_read_write,                 1,      &zopt_always    },
        { ztest_dmu_write_parallel,             10,     &zopt_always    },
        { ztest_dmu_object_alloc_free,          1,      &zopt_always    },
+       { ztest_dmu_object_next_chunk,          1,      &zopt_sometimes },
        { ztest_dmu_commit_callbacks,           1,      &zopt_always    },
        { ztest_zap,                            30,     &zopt_always    },
        { ztest_zap_parallel,                   100,    &zopt_always    },
@@ -1366,7 +1369,7 @@ ztest_bt_bonus(dmu_buf_t *db)
  * it unique to the object, generation, and offset to verify that data
  * is not getting overwritten by data from other dnodes.
  */
-#define        ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
+#define        ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset)    \
        (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
 
 /*
@@ -1895,6 +1898,7 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t
        ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
            txg, crtxg);
        ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
+
        dmu_buf_rele(db, FTAG);
 
        (void) ztest_log_setattr(zd, tx, lr);
@@ -3815,8 +3819,10 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t i
        ztest_od_t od[4];
        int batchsize = sizeof (od) / sizeof (od[0]);
 
-       for (int b = 0; b < batchsize; b++)
-               ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 
0);
+       for (int b = 0; b < batchsize; b++) {
+               ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER,
+                   0, 0, 0);
+       }
 
        /*
         * Destroy the previous batch of objects, create a new batch,
@@ -3831,6 +3837,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t i
 }
 
 /*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+       int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+       uint64_t object;
+
+       /*
+        * Rewind the global allocator randomly back to a lower object number
+        * to force backfilling and reclamation of recently freed dnodes.
+        */
+       mutex_enter(&os->os_obj_lock);
+       object = ztest_random(os->os_obj_next_chunk);
+       os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+       mutex_exit(&os->os_obj_lock);
+}
+
+/*
  * Verify that dmu_{read,write} work as expected.
  */
 void
@@ -3876,8 +3902,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
        /*
         * Read the directory info.  If it's the first time, set things up.
         */
-       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 
chunksize);
-       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 
chunksize);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0,
+           chunksize);
+       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+           chunksize);
 
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
@@ -4146,8 +4174,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id
        /*
         * Read the directory info.  If it's the first time, set things up.
         */
-       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 
0);
-       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 
chunksize);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
+           0, 0);
+       ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+           chunksize);
 
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
@@ -4347,7 +4377,8 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
         * to verify that parallel writes to an object -- even to the
         * same blocks within the object -- doesn't cause any trouble.
         */
-       ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 
0);
+       ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER,
+           0, 0, 0);
 
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
@@ -4366,7 +4397,8 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
        uint64_t blocksize = ztest_random_blocksize();
        void *data;
 
-       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 
0);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
+           0, 0);
 
        if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
                return;
@@ -4590,7 +4622,8 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
        char name[20], string_value[20];
        void *data;
 
-       ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 
0);
+       ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER,
+           0, 0, 0);
 
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;
@@ -5411,7 +5444,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
        blocksize = ztest_random_blocksize();
        blocksize = MIN(blocksize, 2048);       /* because we write so many */
 
-       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 
0);
+       ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
+           0, 0);
 
        if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
                return;

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c     Thu Aug 15 
14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c     Thu Aug 15 
14:57:27 2019        (r351074)
@@ -292,10 +292,11 @@ zfs_prop_init(void)
            ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
            "default | geom | dev | none", "VOLMODE", volmode_table);
+ 
        zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
            ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
            "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
- 
+
        /* inherit index (boolean) properties */
        zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
            ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  Thu Aug 15 
14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  Thu Aug 15 
14:57:27 2019        (r351074)
@@ -3757,7 +3757,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb
                if (dn->dn_type == DMU_OT_DNODE) {
                        i = 0;
                        while (i < db->db.db_size) {
-                               dnode_phys_t *dnp = db->db.db_data + i;
+                               dnode_phys_t *dnp =
+                                   (void *)(((char *)db->db.db_data) + i);
 
                                i += DNODE_MIN_SIZE;
                                if (dnp->dn_type != DMU_OT_NONE) {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c    Thu Aug 
15 14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c    Thu Aug 
15 14:57:27 2019        (r351074)
@@ -32,6 +32,14 @@
 #include <sys/zfeature.h>
 #include <sys/dsl_dataset.h>
 
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
+ * grab 128 slots, which is 4 blocks worth.  This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
 
 static uint64_t
 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
@@ -44,6 +52,10 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t 
        dnode_t *dn = NULL;
        int dn_slots = dnodesize >> DNODE_SHIFT;
        boolean_t restarted = B_FALSE;
+       uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
+           os->os_obj_next_percpu_len];
+       int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+       int error;
 
        if (dn_slots == 0) {
                dn_slots = DNODE_MIN_SLOTS;
@@ -51,93 +63,145 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t 
                ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
                ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
        }
- 
-       mutex_enter(&os->os_obj_lock);
+
+       /*
+        * The "chunk" of dnodes that is assigned to a CPU-specific
+        * allocator needs to be at least one block's worth, to avoid
+        * lock contention on the dbuf.  It can be at most one L1 block's
+        * worth, so that the "rescan after polishing off a L1's worth"
+        * logic below will be sure to kick in.
+        */
+       if (dnodes_per_chunk < DNODES_PER_BLOCK)
+               dnodes_per_chunk = DNODES_PER_BLOCK;
+       if (dnodes_per_chunk > L1_dnode_count)
+               dnodes_per_chunk = L1_dnode_count;
+
+       object = *cpuobj;
+
        for (;;) {
-               object = os->os_obj_next;
                /*
-                * Each time we polish off a L1 bp worth of dnodes (2^12
-                * objects), move to another L1 bp that's still
-                * reasonably sparse (at most 1/4 full). Look from the
-                * beginning at most once per txg. If we still can't
-                * allocate from that L1 block, search for an empty L0
-                * block, which will quickly skip to the end of the
-                * metadnode if the no nearby L0 blocks are empty. This
-                * fallback avoids a pathology where full dnode blocks
-                * containing large dnodes appear sparse because they
-                * have a low blk_fill, leading to many failed
-                * allocation attempts. In the long term a better
-                * mechanism to search for sparse metadnode regions,
-                * such as spacemaps, could be implemented.
-                *
-                * os_scan_dnodes is set during txg sync if enough objects
-                * have been freed since the previous rescan to justify
-                * backfilling again.
-                *
-                * Note that dmu_traverse depends on the behavior that we use
-                * multiple blocks of the dnode object before going back to
-                * reuse objects.  Any change to this algorithm should preserve
-                * that property or find another solution to the issues
-                * described in traverse_visitbp.
+                * If we finished a chunk of dnodes, get a new one from
+                * the global allocator.
                 */
-               if (P2PHASE(object, L1_dnode_count) == 0) {
-                       uint64_t offset;
-                       uint64_t blkfill;
-                       int minlvl;
-                       int error;
-                       if (os->os_rescan_dnodes) {
-                               offset = 0;
-                               os->os_rescan_dnodes = B_FALSE;
-                       } else {
-                               offset = object << DNODE_SHIFT;
+               if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+                   (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+                   dn_slots)) {
+                       DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+                       mutex_enter(&os->os_obj_lock);
+                       ASSERT0(P2PHASE(os->os_obj_next_chunk,
+                           dnodes_per_chunk));
+                       object = os->os_obj_next_chunk;
+
+                       /*
+                        * Each time we polish off a L1 bp worth of dnodes
+                        * (2^12 objects), move to another L1 bp that's
+                        * still reasonably sparse (at most 1/4 full). Look
+                        * from the beginning at most once per txg. If we
+                        * still can't allocate from that L1 block, search
+                        * for an empty L0 block, which will quickly skip
+                        * to the end of the metadnode if the no nearby L0
+                        * blocks are empty. This fallback avoids a
+                        * pathology where full dnode blocks containing
+                        * large dnodes appear sparse because they have a
+                        * low blk_fill, leading to many failed allocation
+                        * attempts. In the long term a better mechanism to
+                        * search for sparse metadnode regions, such as
+                        * spacemaps, could be implemented.
+                        *
+                        * os_rescan_dnodes is set during txg sync if enough
+                        * objects have been freed since the previous
+                        * rescan to justify backfilling again.
+                        *
+                        * Note that dmu_traverse depends on the behavior
+                        * that we use multiple blocks of the dnode object
+                        * before going back to reuse objects. Any change
+                        * to this algorithm should preserve that property
+                        * or find another solution to the issues described
+                        * in traverse_visitbp.
+                        */
+                       if (P2PHASE(object, L1_dnode_count) == 0) {
+                               uint64_t offset;
+                               uint64_t blkfill;
+                               int minlvl;
+                               if (os->os_rescan_dnodes) {
+                                       offset = 0;
+                                       os->os_rescan_dnodes = B_FALSE;
+                               } else {
+                                       offset = object << DNODE_SHIFT;
+                               }
+                               blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+                               minlvl = restarted ? 1 : 2;
+                               restarted = B_TRUE;
+                               error = dnode_next_offset(DMU_META_DNODE(os),
+                                   DNODE_FIND_HOLE, &offset, minlvl,
+                                   blkfill, 0);
+                               if (error == 0) {
+                                       object = offset >> DNODE_SHIFT;
+                               }
                        }
-                       blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
-                       minlvl = restarted ? 1 : 2;
-                       restarted = B_TRUE;
-                       error = dnode_next_offset(DMU_META_DNODE(os),
-                           DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
-                       if (error == 0)
-                               object = offset >> DNODE_SHIFT;
+                       /*
+                        * Note: if "restarted", we may find a L0 that
+                        * is not suitably aligned.
+                        */
+                       os->os_obj_next_chunk =
+                           P2ALIGN(object, dnodes_per_chunk) +
+                           dnodes_per_chunk;
+                       (void) atomic_swap_64(cpuobj, object);
+                       mutex_exit(&os->os_obj_lock);
                }
-               os->os_obj_next = object + dn_slots;
 
                /*
+                * The value of (*cpuobj) before adding dn_slots is the object
+                * ID assigned to us.  The value afterwards is the object ID
+                * assigned to whoever wants to do an allocation next.
+                */
+               object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
+               /*
                 * XXX We should check for an i/o error here and return
                 * up to our caller.  Actually we should pre-read it in
                 * dmu_tx_assign(), but there is currently no mechanism
                 * to do so.
                 */
-               (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
-                   FTAG, &dn);
-               if (dn)
-                       break;
-
-               if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
-                       os->os_obj_next = object;
-               else
+               error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+                   dn_slots, FTAG, &dn);
+               if (error == 0) {
+                       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                        /*
-                        * Skip to next known valid starting point for a dnode.
+                        * Another thread could have allocated it; check
+                        * again now that we have the struct lock.
                         */
-                       os->os_obj_next = P2ROUNDUP(object + 1,
-                           DNODES_PER_BLOCK);
-       }
+                       if (dn->dn_type == DMU_OT_NONE) {
+                               dnode_allocate(dn, ot, blocksize, 0,
+                                   bonustype, bonuslen, dn_slots, tx);
+                               rw_exit(&dn->dn_struct_rwlock);
+                               dmu_tx_add_new_object(tx, dn);
+                               dnode_rele(dn, FTAG);
+                               return (object);
+                       }
+                       rw_exit(&dn->dn_struct_rwlock);
+                       dnode_rele(dn, FTAG);
+                       DNODE_STAT_BUMP(dnode_alloc_race);
+               }
 
-       dnode_allocate(dn, ot, blocksize, indirect_blockshift,
-                      bonustype, bonuslen, dn_slots, tx);
-       mutex_exit(&os->os_obj_lock);
-
-       dmu_tx_add_new_object(tx, dn);
-       dnode_rele(dn, FTAG);
-
-       return (object);
+               /*
+                * Skip to next known valid starting point on error. This
+                * is the start of the next block of dnodes.
+                */
+               if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+                       object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+                       DNODE_STAT_BUMP(dnode_alloc_next_block);
+               }
+               (void) atomic_swap_64(cpuobj, object);
+       }
 }
 
 uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-       return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
-           bonuslen, 0, tx);
+       return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+           bonuslen, 0, tx));
 }
 
 uint64_t
@@ -145,8 +209,8 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t o
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     dmu_tx_t *tx)
 {
-       return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
-           bonustype, bonuslen, 0, tx);
+       return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+           bonustype, bonuslen, 0, tx));
 }
 
 uint64_t
@@ -178,7 +242,7 @@ dmu_object_claim_dnsize(objset_t *os, uint64_t object,
                dn_slots = DNODE_MIN_SLOTS;
        ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
        ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
-       
+
        if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
                return (SET_ERROR(EBADF));
 
@@ -260,28 +324,52 @@ int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
        uint64_t offset;
-       dmu_object_info_t doi;
+       uint64_t start_obj;
        struct dsl_dataset *ds = os->os_dsl_dataset;
-       int dnodesize;
        int error;
 
-       /*
-        * Avoid expensive dnode hold if this dataset doesn't use large dnodes.
-        */
-       if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
-               error = dmu_object_info(os, *objectp, &doi);
-               if (error && !(error == EINVAL && *objectp == 0))
-                       return (SET_ERROR(error));
-               else
-                       dnodesize = doi.doi_dnodesize;
+       if (*objectp == 0) {
+               start_obj = 1;
+       } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+               uint64_t i = *objectp + 1;
+               uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+               dmu_object_info_t doi;
+
+               /*
+                * Scan through the remaining meta dnode block. The contents
+                * of each slot in the block are known so it can be quickly
+                * checked. If the block is exhausted without a match then
+                * hand off to dnode_next_offset() for further scanning.
+                */
+               while (i <= last_obj) {
+                       error = dmu_object_info(os, i, &doi);
+                       if (error == ENOENT) {
+                               if (hole) {
+                                       *objectp = i;
+                                       return (0);
+                               } else {
+                                       i++;
+                               }
+                       } else if (error == EEXIST) {
+                               i++;
+                       } else if (error == 0) {
+                               if (hole) {
+                                       i += doi.doi_dnodesize >> DNODE_SHIFT;
+                               } else {
+                                       *objectp = i;
+                                       return (0);
+                               }
+                       } else {
+                               return (error);
+                       }
+               }
+
+               start_obj = i;
        } else {
-               dnodesize = DNODE_MIN_SIZE;
+               start_obj = *objectp + 1;
        }
 
-       if (*objectp == 0)
-               offset = 1 << DNODE_SHIFT;
-       else
-               offset = (*objectp << DNODE_SHIFT) + dnodesize;
+       offset = start_obj << DNODE_SHIFT;
 
        error = dnode_next_offset(DMU_META_DNODE(os),
            (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c    Thu Aug 15 14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c    Thu Aug 15 14:57:27 2019        (r351074)
@@ -566,6 +566,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
        mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+       os->os_obj_next_percpu_len = boot_ncpus;
+       os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+           sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
 
        dnode_special_open(os, &os->os_phys->os_meta_dnode,
            DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
@@ -843,6 +846,9 @@ dmu_objset_evict_done(objset_t *os)
         */
        rw_enter(&os_lock, RW_READER);
        rw_exit(&os_lock);
+
+       kmem_free(os->os_obj_next_percpu,
+           os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
 
        mutex_destroy(&os->os_lock);
        mutex_destroy(&os->os_userused_lock);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c      Thu Aug 15 14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c      Thu Aug 15 14:57:27 2019        (r351074)
@@ -1437,17 +1437,12 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
        /*
         * The receiving code doesn't know how to translate large blocks
         * to smaller ones, so the pool must have the LARGE_BLOCKS
-        * feature enabled if the stream has LARGE_BLOCKS.
+        * feature enabled if the stream has LARGE_BLOCKS. Same with
+        * large dnodes.
         */
        if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
                return (SET_ERROR(ENOTSUP));
-
-       /*
-        * The receiving code doesn't know how to translate large dnodes
-        * to smaller ones, so the pool must have the LARGE_DNODE
-        * feature enabled if the stream has LARGE_DNODE.
-        */
        if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
                return (SET_ERROR(ENOTSUP));
@@ -1655,6 +1650,9 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
        dsl_dataset_t *ds;
        const char *tofs = drba->drba_cookie->drc_tofs;
 
+       /* 6 extra bytes for /%recv */
+       char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
        /* already checked */
        ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
        ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
@@ -1682,8 +1680,18 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));
 
-       /* 6 extra bytes for /%recv */
-       char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+       /*
+        * The receiving code doesn't know how to translate large blocks
+        * to smaller ones, so the pool must have the LARGE_BLOCKS
+        * feature enabled if the stream has LARGE_BLOCKS. Same with
+        * large dnodes.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+       if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+               return (SET_ERROR(ENOTSUP));
 
        (void) snprintf(recvname, sizeof (recvname), "%s/%s",
            tofs, recv_clone_name);
@@ -2155,15 +2163,16 @@ receive_object(struct receive_writer_arg *rwa, struct 
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
            drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
            drro->drr_bonuslen >
-           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) {
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+           drro->drr_dn_slots >
+           (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
                return (SET_ERROR(EINVAL));
        }
 
        err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
-       if (err != 0 && err != ENOENT)
+       if (err != 0 && err != ENOENT && err != EEXIST)
                return (SET_ERROR(EINVAL));
-       object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
        if (drro->drr_object > rwa->max_object)
                rwa->max_object = drro->drr_object;
@@ -2176,18 +2185,66 @@ receive_object(struct receive_writer_arg *rwa, struct 
        if (err == 0) {
                int nblkptr;
 
+               object = drro->drr_object;
+
                nblkptr = deduce_nblkptr(drro->drr_bonustype,
                    drro->drr_bonuslen);
 
                if (drro->drr_blksz != doi.doi_data_block_size ||
-                   nblkptr < doi.doi_nblkptr) {
+                   nblkptr < doi.doi_nblkptr ||
+                   drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
                        err = dmu_free_long_range(rwa->os, drro->drr_object,
                            0, DMU_OBJECT_END);
                        if (err != 0)
                                return (SET_ERROR(EINVAL));
                }
+       } else if (err == EEXIST) {
+               /*
+                * The object requested is currently an interior slot of a
+                * multi-slot dnode. This will be resolved when the next txg
+                * is synced out, since the send stream will have told us
+                * to free this slot when we freed the associated dnode
+                * earlier in the stream.
+                */
+               txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+               object = drro->drr_object;
+       } else {
+               /* object is free and we are about to allocate a new one */
+               object = DMU_NEW_OBJECT;
        }
 
+       /*
+        * If this is a multi-slot dnode there is a chance that this
+        * object will expand into a slot that is already used by
+        * another object from the previous snapshot. We must free
+        * these objects before we attempt to allocate the new dnode.
+        */
+       if (drro->drr_dn_slots > 1) {
+               boolean_t need_sync = B_FALSE;
+
+               for (uint64_t slot = drro->drr_object + 1;
+                   slot < drro->drr_object + drro->drr_dn_slots;
+                   slot++) {
+                       dmu_object_info_t slot_doi;
+
+                       err = dmu_object_info(rwa->os, slot, &slot_doi);
+                       if (err == ENOENT || err == EEXIST)
+                               continue;
+                       else if (err != 0)
+                               return (err);
+
+                       err = dmu_free_long_object(rwa->os, slot);
+
+                       if (err != 0)
+                               return (err);
+
+                       need_sync = B_TRUE;
+               }
+
+               if (need_sync)
+                       txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+       }
+
        tx = dmu_tx_create(rwa->os);
        dmu_tx_hold_bonus(tx, object);
        err = dmu_tx_assign(tx, TXG_WAIT);
@@ -2259,10 +2316,10 @@ receive_freeobjects(struct receive_writer_arg *rwa,
                dmu_object_info_t doi;
                int err;
 
-               err = dmu_object_info(rwa->os, obj, &doi);
+               err = dmu_object_info(rwa->os, obj, NULL);
                if (err == ENOENT) {
                        obj++;
-                       continue;
+                       continue;
                } else if (err != 0) {
                        return (err);
                }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c        Thu Aug 15 14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c        Thu Aug 15 14:57:27 2019        (r351074)
@@ -1252,11 +1252,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
 void
 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 {
-       dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
-           tx->tx_objset, object, THT_SPILL, 0, 0);
+       dmu_tx_hold_t *txh;
 
-       (void) refcount_add_many(&txh->txh_space_towrite,
-           SPA_OLD_MAXBLOCKSIZE, FTAG);
+       txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+           THT_SPILL, 0, 0);
+       if (txh != NULL)
+               (void) refcount_add_many(&txh->txh_space_towrite,
+                   SPA_OLD_MAXBLOCKSIZE, FTAG);
 }
 
 void

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Thu Aug 15 14:54:18 2019        (r351073)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Thu Aug 15 14:57:27 2019        (r351074)
@@ -40,21 +40,41 @@
 #include <sys/dmu_zfetch.h>
 #include <sys/range_tree.h>
 
+dnode_stats_t dnode_stats = {
+       { "dnode_hold_dbuf_hold",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_dbuf_read",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_hits",              KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_misses",            KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_interior",          KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_retry",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_misses",       KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_type_none",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_hits",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_misses",             KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_misses",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_retry",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
+       { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
+       { "dnode_allocate",                     KSTAT_DATA_UINT64 },
+       { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
+       { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_chunk",             KSTAT_DATA_UINT64 },
+       { "dnode_alloc_race",                   KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_block",             KSTAT_DATA_UINT64 },
+       { "dnode_move_invalid",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck1",                KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck2",                KSTAT_DATA_UINT64 },
+       { "dnode_move_special",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_handle",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_rwlock",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_active",                  KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
 static kmem_cache_t *dnode_cache;
-/*
- * Define DNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define        DNODE_STATS
-#endif /* DEBUG */
 
-#ifdef DNODE_STATS
-#define        DNODE_STAT_ADD(stat)                    ((stat)++)
-#else
-#define        DNODE_STAT_ADD(stat)                    /* nothing */
-#endif /* DNODE_STATS */
-
 static dnode_phys_t dnode_phys_zero;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
@@ -215,12 +235,25 @@ dnode_init(void)
            0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
 #ifdef _KERNEL
        kmem_cache_set_move(dnode_cache, dnode_move);
+
+       dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+       if (dnode_ksp != NULL) {
+               dnode_ksp->ks_data = &dnode_stats;
+               kstat_install(dnode_ksp);
+       }
 #endif /* _KERNEL */
 }
 
 void
 dnode_fini(void)
 {
+       if (dnode_ksp != NULL) {
+               kstat_delete(dnode_ksp);
+               dnode_ksp = NULL;
+       }
+
        kmem_cache_destroy(dnode_cache);
        dnode_cache = NULL;
 }
@@ -333,6 +366,7 @@ dnode_byteswap(dnode_phys_t *dnp)
        /* Swap SPILL block if we have one */
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
                byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
+
 }
 
 void
@@ -344,7 +378,7 @@ dnode_buf_byteswap(void *vbuf, size_t size)
        ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
        while (i < size) {
-               dnode_phys_t *dnp = vbuf + i;
+               dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
                dnode_byteswap(dnp);
 
                i += DNODE_MIN_SIZE;
@@ -448,14 +482,10 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+       ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+       ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
 
        mutex_enter(&os->os_lock);
-       if (dnh->dnh_dnode != NULL) {
-               /* Lost the allocation race. */
-               mutex_exit(&os->os_lock);
-               kmem_cache_free(dnode_cache, dn);
-               return (dnh->dnh_dnode);
-       }
 
        /*
         * Exclude special dnodes from os_dnodes so an empty os_dnodes
@@ -478,6 +508,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_
        mutex_exit(&os->os_lock);
 
        arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
        return (dn);
 }
 
@@ -503,7 +534,8 @@ dnode_destroy(dnode_t *dn)
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
-       zrl_remove(&dn->dn_handle->dnh_zrlock);
+       if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+               zrl_remove(&dn->dn_handle->dnh_zrlock);
 
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
@@ -559,8 +591,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int 
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to