Author: mav
Date: Fri Oct 14 07:14:35 2016
New Revision: 307266
URL: https://svnweb.freebsd.org/changeset/base/307266

Log:
  MFC r305323: MFV r302991: 6950 ARC should cache compressed data
  
  illumos/illumos-gate@dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
  
  https://github.com/illumos/illumos-gate/commit/dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
  
  https://www.illumos.org/issues/6950
    When reading compressed data from disk, the ARC should keep the compressed
    block cached and only decompress it when consumers access the block. The
    uncompressed data should be short-lived allowing the ARC to cache a much larger
    amount of data. The DMU would also maintain a smaller cache of uncompressed
    blocks to minimize the impact of decompressing frequently accessed blocks.
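
  For illustration only, here is a minimal self-contained sketch of the caching
  policy described above: retain the physical (possibly compressed) bytes in the
  cache and hand each consumer a short-lived uncompressed copy. All demo_* names
  are hypothetical; demo_decompress() merely stands in for the real zio
  decompression call and assumes psize == lsize for simplicity.

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical stand-in for the real zio decompression routine. */
    static void
    demo_decompress(const void *src, size_t psize, void *dst, size_t lsize)
    {
            (void) psize;
            memcpy(dst, src, lsize);  /* pretend ZIO_COMPRESS_OFF, psize == lsize */
    }

    typedef struct demo_hdr {
            void    *dh_pdata;      /* cached on-disk (possibly compressed) bytes */
            size_t  dh_psize;       /* physical size */
            size_t  dh_lsize;       /* logical (uncompressed) size */
    } demo_hdr_t;

    /*
     * The cache retains dh_pdata; each consumer gets a short-lived
     * uncompressed copy that is freed independently of the cached block.
     */
    static void *
    demo_read(const demo_hdr_t *hdr)
    {
            void *buf = malloc(hdr->dh_lsize);

            if (buf != NULL)
                    demo_decompress(hdr->dh_pdata, hdr->dh_psize,
                        buf, hdr->dh_lsize);
            return (buf);
    }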
  
  Reviewed by: Prakash Surya <prakash.su...@delphix.com>
  Reviewed by: Dan Kimmel <dan.kim...@delphix.com>
  Reviewed by: Matt Ahrens <mahr...@delphix.com>
  Reviewed by: Paul Dagnelie <p...@delphix.com>
  Reviewed by: Don Brady <don.br...@intel.com>
  Reviewed by: Richard Elling <richard.ell...@richardelling.com>
  Approved by: Richard Lowe <richl...@richlowe.net>
  Author: George Wilson <george.wil...@delphix.com>

Modified:
  stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c    Fri Oct 14 07:13:43 2016        (r307265)
+++ stable/10/cddl/contrib/opensolaris/cmd/zdb/zdb.c    Fri Oct 14 07:14:35 2016        (r307266)
@@ -1289,7 +1289,7 @@ visit_indirect(spa_t *spa, const dnode_p
                }
                if (!err)
                        ASSERT3U(fill, ==, BP_GET_FILL(bp));
-               (void) arc_buf_remove_ref(buf, &buf);
+               arc_buf_destroy(buf, &buf);
        }
 
        return (err);

Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c        Fri Oct 14 07:13:43 2016        (r307265)
+++ stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c        Fri Oct 14 07:14:35 2016        (r307266)
@@ -189,6 +189,7 @@ extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5355,6 +5356,12 @@ ztest_resume_thread(void *arg)
                if (spa_suspended(spa))
                        ztest_resume(spa);
                (void) poll(NULL, 0, 100);
+
+               /*
+                * Periodically change the zfs_compressed_arc_enabled setting.
+                */
+               if (ztest_random(10) == 0)
+                       zfs_compressed_arc_enabled = ztest_random(2);
        }
        return (NULL);
 }
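
The hunk above makes ztest flip the new tunable at random while I/O is in
flight, so both the compressed and uncompressed code paths see load. The same
stress pattern, reduced to a self-contained sketch with a hypothetical global
in place of zfs_compressed_arc_enabled:

    #include <stdlib.h>

    /* Hypothetical global standing in for zfs_compressed_arc_enabled. */
    static int compressed_cache_enabled = 1;

    /*
     * On each loop iteration, with probability 1/10, reset the tunable
     * to a random boolean (0 or 1), mirroring ztest_random(10) == 0
     * followed by ztest_random(2).
     */
    static void
    maybe_toggle(void)
    {
            if (rand() % 10 == 0)
                    compressed_cache_enabled = rand() % 2;
    }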

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Fri Oct 14 07:13:43 2016        (r307265)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Fri Oct 14 07:14:35 2016        (r307266)
@@ -120,9 +120,134 @@
  *     - ARC header release, as it removes from L2ARC buflists
  */
 
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer, and always contains uncompressed data. The ARC will provide
+ * references to this data and will keep it cached until it is no longer in
+ * use. Typically, the arc will try to cache only the L1ARC's physical data
+ * block and will aggressively evict any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                   (potentially) |      |             |               |
+ *                     compressed  |      |             |               |
+ *                        data     +------+             |               v
+ *                                                      +->+------+     +------+
+ *                                            uncompressed |      |     |      |
+ *                                                data     |      |     |      |
+ *                                                         +------+     +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t    (shared)
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                                 |      |             |               |
+ *                   uncompressed  |      |             |               |
+ *                        data     +------+             |               |
+ *                                    ^                 +->+------+     |
+ *                                    |       uncompressed |      |     |
+ *                                    |           data     |      |     |
+ *                                    |                    +------+     |
+ *                                    +---------------------------------+
+ *
+ * Writing to the arc requires that the ARC first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled that the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
+
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
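
The read-path behavior spelled out in the block comment above (copy from an
existing arc_buf_t, else decompress b_pdata, else share the uncompressed
block) reduces to three cases. A hedged, self-contained sketch follows, with
toy_* stand-ins for the real header and buffer types and a placeholder for
the zio decompression call:

    #include <stdlib.h>
    #include <string.h>

    typedef struct toy_buf {
            struct toy_buf  *tb_next;
            void            *tb_data;       /* always uncompressed */
    } toy_buf_t;

    typedef struct toy_hdr {
            toy_buf_t       *th_buf;        /* chain of consumer buffers */
            void            *th_pdata;      /* cached copy of the on-disk bytes */
            size_t          th_psize;       /* physical size of th_pdata */
            size_t          th_lsize;       /* logical (uncompressed) size */
            int             th_compressed;  /* is th_pdata compressed? */
    } toy_hdr_t;

    /* Placeholder for the real zio decompression call. */
    static void
    toy_decompress(const void *src, size_t psize, void *dst, size_t lsize)
    {
            (void) psize;
            memcpy(dst, src, lsize);        /* pretend ZIO_COMPRESS_OFF */
    }

    static toy_buf_t *
    toy_read(toy_hdr_t *hdr)
    {
            toy_buf_t *buf = calloc(1, sizeof (*buf));

            if (buf == NULL)
                    return (NULL);
            if (hdr->th_buf != NULL) {
                    /* Copy the uncompressed data of an existing buffer. */
                    buf->tb_data = malloc(hdr->th_lsize);
                    if (buf->tb_data != NULL)
                            memcpy(buf->tb_data, hdr->th_buf->tb_data,
                                hdr->th_lsize);
            } else if (hdr->th_compressed) {
                    /* First consumer: decompress the cached physical bytes. */
                    buf->tb_data = malloc(hdr->th_lsize);
                    if (buf->tb_data != NULL)
                            toy_decompress(hdr->th_pdata, hdr->th_psize,
                                buf->tb_data, hdr->th_lsize);
            } else {
                    /* Uncompressed header: share the cached block outright. */
                    buf->tb_data = hdr->th_pdata;
            }
            buf->tb_next = hdr->th_buf;
            hdr->th_buf = buf;
            return (buf);
    }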
@@ -154,10 +279,6 @@ static kcondvar_t  arc_reclaim_thread_cv;
 static boolean_t       arc_reclaim_thread_exit;
 static kcondvar_t      arc_reclaim_waiters_cv;
 
-static kmutex_t                arc_user_evicts_lock;
-static kcondvar_t      arc_user_evicts_cv;
-static boolean_t       arc_user_evicts_thread_exit;
-
 uint_t arc_reduce_dnlc_percent = 3;
 
 /*
@@ -229,13 +350,14 @@ uint64_t zfs_arc_meta_min = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 u_int zfs_arc_free_target = 0;
 
 /* Absolute min for arc min / max is 16MB. */
 static uint64_t arc_abs_min = 16 << 20;
 
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
 static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
 static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
 static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
@@ -268,6 +390,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_ave
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
 
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
@@ -349,7 +473,7 @@ typedef struct arc_state {
        /*
         * total amount of evictable data in this state
         */
-       uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+       refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
        /*
         * total amount of data in this state; this includes: evictable,
         * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
@@ -415,6 +539,26 @@ typedef struct arc_stats {
        kstat_named_t arcstat_c_max;
        kstat_named_t arcstat_size;
        /*
+        * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+        * Note that the compressed bytes may match the uncompressed bytes
+        * if the block is either not compressed or compressed arc is disabled.
+        */
+       kstat_named_t arcstat_compressed_size;
+       /*
+        * Uncompressed size of the data stored in b_pdata. If compressed
+        * arc is disabled then this value will be identical to the stat
+        * above.
+        */
+       kstat_named_t arcstat_uncompressed_size;
+       /*
+        * Number of bytes stored in all the arc_buf_t's. This is classified
+        * as "overhead" since this data is typically short-lived and will
+        * be evicted from the arc when it becomes unreferenced unless the
+        * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+        * values have been set (see comment in dbuf.c for more information).
+        */
+       kstat_named_t arcstat_overhead_size;
+       /*
         * Number of bytes consumed by internal ARC structures necessary
         * for tracking purposes; these structures are not actually
         * backed by ARC buffers. This includes arc_buf_hdr_t structures
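
The compressed_size/uncompressed_size pair makes it straightforward to compute
the effective ARC compression ratio. Assuming the kstats are exported under
the usual kstat.zfs.misc.arcstats sysctl names on FreeBSD (not something this
commit itself establishes), a userland reader can be as small as:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Print uncompressed_size / compressed_size as a ratio. */
    int
    main(void)
    {
            uint64_t csize, usize;
            size_t len = sizeof (uint64_t);

            if (sysctlbyname("kstat.zfs.misc.arcstats.compressed_size",
                &csize, &len, NULL, 0) != 0)
                    return (1);
            len = sizeof (uint64_t);
            if (sysctlbyname("kstat.zfs.misc.arcstats.uncompressed_size",
                &usize, &len, NULL, 0) != 0)
                    return (1);
            if (csize != 0)
                    printf("ARC compression ratio: %.2f\n",
                        (double)usize / (double)csize);
            return (0);
    }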
@@ -559,16 +703,12 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_evict_reading;
        kstat_named_t arcstat_l2_evict_l1cached;
        kstat_named_t arcstat_l2_free_on_write;
-       kstat_named_t arcstat_l2_cdata_free_on_write;
        kstat_named_t arcstat_l2_abort_lowmem;
        kstat_named_t arcstat_l2_cksum_bad;
        kstat_named_t arcstat_l2_io_error;
        kstat_named_t arcstat_l2_size;
        kstat_named_t arcstat_l2_asize;
        kstat_named_t arcstat_l2_hdr_size;
-       kstat_named_t arcstat_l2_compress_successes;
-       kstat_named_t arcstat_l2_compress_zeros;
-       kstat_named_t arcstat_l2_compress_failures;
        kstat_named_t arcstat_l2_padding_needed;
        kstat_named_t arcstat_l2_write_trylock_fail;
        kstat_named_t arcstat_l2_write_passed_headroom;
@@ -583,9 +723,6 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_write_buffer_list_iter;
        kstat_named_t arcstat_l2_write_buffer_list_null_iter;
        kstat_named_t arcstat_memory_throttle_count;
-       kstat_named_t arcstat_duplicate_buffers;
-       kstat_named_t arcstat_duplicate_buffers_size;
-       kstat_named_t arcstat_duplicate_reads;
        kstat_named_t arcstat_meta_used;
        kstat_named_t arcstat_meta_limit;
        kstat_named_t arcstat_meta_max;
@@ -628,6 +765,9 @@ static arc_stats_t arc_stats = {
        { "c_min",                      KSTAT_DATA_UINT64 },
        { "c_max",                      KSTAT_DATA_UINT64 },
        { "size",                       KSTAT_DATA_UINT64 },
+       { "compressed_size",            KSTAT_DATA_UINT64 },
+       { "uncompressed_size",          KSTAT_DATA_UINT64 },
+       { "overhead_size",              KSTAT_DATA_UINT64 },
        { "hdr_size",                   KSTAT_DATA_UINT64 },
        { "data_size",                  KSTAT_DATA_UINT64 },
        { "metadata_size",              KSTAT_DATA_UINT64 },
@@ -661,16 +801,12 @@ static arc_stats_t arc_stats = {
        { "l2_evict_reading",           KSTAT_DATA_UINT64 },
        { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
        { "l2_free_on_write",           KSTAT_DATA_UINT64 },
-       { "l2_cdata_free_on_write",     KSTAT_DATA_UINT64 },
        { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
        { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
        { "l2_io_error",                KSTAT_DATA_UINT64 },
        { "l2_size",                    KSTAT_DATA_UINT64 },
        { "l2_asize",                   KSTAT_DATA_UINT64 },
        { "l2_hdr_size",                KSTAT_DATA_UINT64 },
-       { "l2_compress_successes",      KSTAT_DATA_UINT64 },
-       { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
-       { "l2_compress_failures",       KSTAT_DATA_UINT64 },
        { "l2_padding_needed",          KSTAT_DATA_UINT64 },
        { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
        { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
@@ -685,9 +821,6 @@ static arc_stats_t arc_stats = {
        { "l2_write_buffer_list_iter",  KSTAT_DATA_UINT64 },
        { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
        { "memory_throttle_count",      KSTAT_DATA_UINT64 },
-       { "duplicate_buffers",          KSTAT_DATA_UINT64 },
-       { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
-       { "duplicate_reads",            KSTAT_DATA_UINT64 },
        { "arc_meta_used",              KSTAT_DATA_UINT64 },
        { "arc_meta_limit",             KSTAT_DATA_UINT64 },
        { "arc_meta_max",               KSTAT_DATA_UINT64 },
@@ -760,8 +893,12 @@ static arc_state_t *arc_l2c_only;
#define        arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
#define        arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 
-#define        L2ARC_IS_VALID_COMPRESS(_c_) \
-       ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+/* compressed size of entire arc */
+#define        arc_compressed_size     ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define        arc_uncompressed_size   ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define        arc_overhead_size       ARCSTAT(arcstat_overhead_size)
 
 static int             arc_no_grow;    /* Don't try to grow cache size */
 static uint64_t                arc_tempreserve;
@@ -821,6 +958,7 @@ struct arc_write_callback {
  */
 typedef struct l1arc_buf_hdr {
        kmutex_t                b_freeze_lock;
+       zio_cksum_t             *b_freeze_cksum;
 #ifdef ZFS_DEBUG
        /*
         * used for debugging wtih kmem_flags - by allocating and freeing
@@ -831,9 +969,10 @@ typedef struct l1arc_buf_hdr {
 #endif
 
        arc_buf_t               *b_buf;
-       uint32_t                b_datacnt;
+       uint32_t                b_bufcnt;
        /* for waiting on writes to complete */
        kcondvar_t              b_cv;
+       uint8_t                 b_byteswap;
 
        /* protected by arc state mutex */
        arc_state_t             *b_state;
@@ -846,8 +985,7 @@ typedef struct l1arc_buf_hdr {
        refcount_t              b_refcnt;
 
        arc_callback_t          *b_acb;
-       /* temporary buffer holder for in-flight compressed or padded data */
-       void                    *b_tmp_cdata;
+       void                    *b_pdata;
 } l1arc_buf_hdr_t;
 
 typedef struct l2arc_dev l2arc_dev_t;
@@ -856,9 +994,6 @@ typedef struct l2arc_buf_hdr {
        /* protected by arc_buf_hdr mutex */
        l2arc_dev_t             *b_dev;         /* L2ARC device */
        uint64_t                b_daddr;        /* disk address, offset byte */
-       /* real alloc'd buffer size depending on b_compress applied */
-       int32_t                 b_asize;
-       uint8_t                 b_compress;
 
        list_node_t             b_l2node;
 } l2arc_buf_hdr_t;
@@ -867,20 +1002,37 @@ struct arc_buf_hdr {
        /* protected by hash lock */
        dva_t                   b_dva;
        uint64_t                b_birth;
-       /*
-        * Even though this checksum is only set/verified when a buffer is in
-        * the L1 cache, it needs to be in the set of common fields because it
-        * must be preserved from the time before a buffer is written out to
-        * L2ARC until after it is read back in.
-        */
-       zio_cksum_t             *b_freeze_cksum;
 
+       arc_buf_contents_t      b_type;
        arc_buf_hdr_t           *b_hash_next;
        arc_flags_t             b_flags;
 
-       /* immutable */
-       int32_t                 b_size;
-       uint64_t                b_spa;
+       /*
+        * This field stores the size of the data buffer after
+        * compression, and is set in the arc's zio completion handlers.
+        * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+        *
+        * While the block pointers can store up to 32MB in their psize
+        * field, we can only store up to 32MB minus 512B. This is due
+        * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+        * a field of zeros represents 512B in the bp). We can't use a
+        * bias of 1 since we need to reserve a psize of zero, here, to
+        * represent holes and embedded blocks.
+        *
+        * This isn't a problem in practice, since the maximum size of a
+        * buffer is limited to 16MB, so we never need to store 32MB in
+        * this field. Even in the upstream illumos code base, the
+        * maximum size of a buffer is limited to 16MB.
+        */
+       uint16_t                b_psize;
+
+       /*
+        * This field stores the size of the data buffer before
+        * compression, and cannot change once set. It is in units
+        * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+        */
+       uint16_t                b_lsize;        /* immutable */
+       uint64_t                b_spa;          /* immutable */
 
        /* L2ARC fields. Undefined when not in L2ARC. */
        l2arc_buf_hdr_t         b_l2hdr;
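
Since b_psize and b_lsize are 16-bit counts of SPA_MINBLOCKSIZE (512-byte)
units with a bias of 0, the encoding tops out at 32MB minus 512B, exactly as
the comment above explains. A minimal sketch of the conversion (demo_* names
hypothetical):

    #include <stdint.h>
    #include <assert.h>

    #define DEMO_MINBLOCKSHIFT      9       /* SPA_MINBLOCKSHIFT: 512-byte units */

    /*
     * Encode/decode sizes the way b_psize/b_lsize store them: in 512-byte
     * units with a bias of 0, so a 16-bit field covers 0..(32MB - 512B).
     */
    static inline uint16_t
    demo_size_encode(uint64_t bytes)
    {
            assert((bytes & ((1ULL << DEMO_MINBLOCKSHIFT) - 1)) == 0);
            return ((uint16_t)(bytes >> DEMO_MINBLOCKSHIFT));
    }

    static inline uint64_t
    demo_size_decode(uint16_t units)
    {
            return ((uint64_t)units << DEMO_MINBLOCKSHIFT);
    }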
@@ -984,9 +1136,6 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_AR
 }
 #endif
 
-static arc_buf_t *arc_eviction_list;
-static arc_buf_hdr_t arc_eviction_hdr;
-
 #define        GHOST_STATE(state)      \
        ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
        (state) == arc_l2c_only)
@@ -995,25 +1144,35 @@ static arc_buf_hdr_t arc_eviction_hdr;
#define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
-#define        HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
-#define        HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
+#define        HDR_COMPRESSION_ENABLED(hdr)    \
+       ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define        HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
-#define        HDR_L2COMPRESS(hdr)     ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 #define        HDR_L2_READING(hdr)     \
-           (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&      \
-           ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+       (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&  \
+       ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define        HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define        HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define        HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define        HDR_SHARED_DATA(hdr)    ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define        HDR_ISTYPE_METADATA(hdr)        \
-           ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+       ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define        HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 
 #define        HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define        HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 
+/* For storing compression mode in b_flags */
+#define        HDR_COMPRESS_OFFSET     (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define        HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+       HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define        HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+       HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define        ARC_BUF_LAST(buf)       ((buf)->b_next == NULL)
+
 /*
  * Other sizes
  */
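
HDR_GET_COMPRESS/HDR_SET_COMPRESS above pack the zio_compress enum into a few
bits of b_flags via the BF32_GET/BF32_SET bitfield helpers. A plain-C rendering
of that pattern, with illustrative (not authoritative) offset and width values:

    #include <stdint.h>

    #define DEMO_COMPRESS_OFFSET    24      /* illustrative bit offset */
    #define DEMO_COMPRESS_BITS      7       /* field width, as SPA_COMPRESSBITS */

    /* Extract a 'bits'-wide field starting at bit 'off'. */
    static inline uint32_t
    demo_bf32_get(uint32_t word, unsigned off, unsigned bits)
    {
            return ((word >> off) & ((1U << bits) - 1));
    }

    /* Return 'word' with the field replaced by 'val'. */
    static inline uint32_t
    demo_bf32_set(uint32_t word, unsigned off, unsigned bits, uint32_t val)
    {
            uint32_t mask = ((1U << bits) - 1) << off;

            return ((word & ~mask) | ((val << off) & mask));
    }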
@@ -1066,16 +1225,6 @@ uint64_t zfs_crc64_table[256];
#define        L2ARC_FEED_SECS         1               /* caching interval secs */
#define        L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 
-/*
- * Used to distinguish headers that are being process by
- * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
- * address. This can happen when the header is added to the l2arc's list
- * of buffers to write in the first stage of l2arc_write_buffers(), but
- * has not yet been written out which happens in the second stage of
- * l2arc_write_buffers().
- */
-#define        L2ARC_ADDR_UNSET        ((uint64_t)(-1))
-
 #define        l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 #define        l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 
@@ -1110,41 +1259,47 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_nor
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
     &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
-    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
-    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of anonymous state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
     &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
-    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
-    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
     &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
-    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
     "size of metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
-    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
     "size of data in mru ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
     &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
-    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
-    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
     &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
     "size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
-    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
     "size of data in mfu ghost state");
 
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
@@ -1177,12 +1332,10 @@ static kmutex_t l2arc_free_on_write_mtx;
 static uint64_t l2arc_ndev;                    /* number of devices */
 
 typedef struct l2arc_read_callback {
-       arc_buf_t               *l2rcb_buf;             /* read buffer */
-       spa_t                   *l2rcb_spa;             /* spa */
+       arc_buf_hdr_t           *l2rcb_hdr;             /* read buffer */
        blkptr_t                l2rcb_bp;               /* original blkptr */
        zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
-       enum zio_compress       l2rcb_compress;         /* applied compress */
        void                    *l2rcb_data;            /* temporary buffer */
 } l2arc_read_callback_t;
 
@@ -1195,7 +1348,7 @@ typedef struct l2arc_data_free {
        /* protected by l2arc_free_on_write_mtx */
        void            *l2df_data;
        size_t          l2df_size;
-       void            (*l2df_func)(void *, size_t);
+       arc_buf_contents_t l2df_type;
        list_node_t     l2df_list_node;
 } l2arc_data_free_t;
 
@@ -1203,21 +1356,22 @@ static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
-static void arc_get_data_buf(arc_buf_t *);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 
-static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t);
-static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
-
 static void
 l2arc_trim(const arc_buf_hdr_t *hdr)
 {
@@ -1226,13 +1380,9 @@ l2arc_trim(const arc_buf_hdr_t *hdr)
        ASSERT(HDR_HAS_L2HDR(hdr));
        ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
 
-       if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET)
-               return;
-       if (hdr->b_l2hdr.b_asize != 0) {
+       if (HDR_GET_PSIZE(hdr) != 0) {
                trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
-                   hdr->b_l2hdr.b_asize, 0);
-       } else {
-               ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY);
+                   HDR_GET_PSIZE(hdr), 0);
        }
 }
 
@@ -1253,14 +1403,14 @@ buf_hash(uint64_t spa, const dva_t *dva,
        return (crc);
 }
 
-#define        BUF_EMPTY(buf)                                          \
-       ((buf)->b_dva.dva_word[0] == 0 &&                       \
-       (buf)->b_dva.dva_word[1] == 0)
-
-#define        BUF_EQUAL(spa, dva, birth, buf)                         \
-       ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
-       ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
-       ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define        HDR_EMPTY(hdr)                                          \
+       ((hdr)->b_dva.dva_word[0] == 0 &&                       \
+       (hdr)->b_dva.dva_word[1] == 0)
+
+#define        HDR_EQUAL(spa, dva, birth, hdr)                         \
+       ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&     \
+       ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&     \
+       ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
@@ -1282,7 +1432,7 @@ buf_hash_find(uint64_t spa, const blkptr
        mutex_enter(hash_lock);
        for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
            hdr = hdr->b_hash_next) {
-               if (BUF_EQUAL(spa, dva, birth, hdr)) {
+               if (HDR_EQUAL(spa, dva, birth, hdr)) {
                        *lockp = hash_lock;
                        return (hdr);
                }
@@ -1320,13 +1470,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut
 
        for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
            fhdr = fhdr->b_hash_next, i++) {
-               if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+               if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
                        return (fhdr);
        }
 
        hdr->b_hash_next = buf_hash_table.ht_table[idx];
        buf_hash_table.ht_table[idx] = hdr;
-       hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+       arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
        /* collect some hash table performance data */
        if (i > 0) {
@@ -1354,12 +1504,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
 
        hdrp = &buf_hash_table.ht_table[idx];
        while ((fhdr = *hdrp) != hdr) {
-               ASSERT(fhdr != NULL);
+               ASSERT3P(fhdr, !=, NULL);
                hdrp = &fhdr->b_hash_next;
        }
        *hdrp = hdr->b_hash_next;
        hdr->b_hash_next = NULL;
-       hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
+       arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
        /* collect some hash table performance data */
        ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -1445,7 +1595,7 @@ hdr_full_dest(void *vbuf, void *unused)
 {
        arc_buf_hdr_t *hdr = vbuf;
 
-       ASSERT(BUF_EMPTY(hdr));
+       ASSERT(HDR_EMPTY(hdr));
        cv_destroy(&hdr->b_l1hdr.b_cv);
        refcount_destroy(&hdr->b_l1hdr.b_refcnt);
        mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
@@ -1459,7 +1609,7 @@ hdr_l2only_dest(void *vbuf, void *unused
 {
        arc_buf_hdr_t *hdr = vbuf;
 
-       ASSERT(BUF_EMPTY(hdr));
+       ASSERT(HDR_EMPTY(hdr));
        arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
@@ -1532,166 +1682,138 @@ retry:
        }
 }
 
-/*
- * Transition between the two allocation states for the arc_buf_hdr struct.
- * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
- * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
- * version is used when a cache buffer is only in the L2ARC in order to reduce
- * memory usage.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
-{
-       ASSERT(HDR_HAS_L2HDR(hdr));
-
-       arc_buf_hdr_t *nhdr;
-       l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
-       ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
-           (old == hdr_l2only_cache && new == hdr_full_cache));
-
-       nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
-
-       ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
-       buf_hash_remove(hdr);
-
-       bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
-
-       if (new == hdr_full_cache) {
-               nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
-               /*
-                * arc_access and arc_change_state need to be aware that a
-                * header has just come out of L2ARC, so we set its state to
-                * l2c_only even though it's about to change.
-                */
-               nhdr->b_l1hdr.b_state = arc_l2c_only;
-
-               /* Verify previous threads set to NULL before freeing */
-               ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
-       } else {
-               ASSERT(hdr->b_l1hdr.b_buf == NULL);
-               ASSERT0(hdr->b_l1hdr.b_datacnt);
-
-               /*
-                * If we've reached here, We must have been called from
-                * arc_evict_hdr(), as such we should have already been
-                * removed from any ghost list we were previously on
-                * (which protects us from racing with arc_evict_state),
-                * thus no locking is needed during this check.
-                */
-               ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-
-               /*
-                * A buffer must not be moved into the arc_l2c_only
-                * state if it's not finished being written out to the
-                * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
-                * might try to be accessed, even though it was removed.
-                */
-               VERIFY(!HDR_L2_WRITING(hdr));
-               VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+#define        ARC_MINTIME     (hz>>4) /* 62 ms */
 
-#ifdef ZFS_DEBUG
-               if (hdr->b_l1hdr.b_thawed != NULL) {
-                       kmem_free(hdr->b_l1hdr.b_thawed, 1);
-                       hdr->b_l1hdr.b_thawed = NULL;
-               }
-#endif
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+       boolean_t shared = (buf->b_data != NULL &&
+           buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+       IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+       return (shared);
+}
 
-               nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+       ASSERT(HDR_HAS_L1HDR(hdr));
+       mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+       if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+               kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+               hdr->b_l1hdr.b_freeze_cksum = NULL;
        }
-       /*
-        * The header has been reallocated so we need to re-insert it into any
-        * lists it was on.
-        */
-       (void) buf_hash_insert(nhdr, NULL);
-
-       ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
-
-       mutex_enter(&dev->l2ad_mtx);
-
-       /*
-        * We must place the realloc'ed header back into the list at
-        * the same spot. Otherwise, if it's placed earlier in the list,
-        * l2arc_write_buffers() could find it during the function's
-        * write phase, and try to write it out to the l2arc.
-        */
-       list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
-       list_remove(&dev->l2ad_buflist, hdr);
-
-       mutex_exit(&dev->l2ad_mtx);
-
-       /*
-        * Since we're using the pointer address as the tag when
-        * incrementing and decrementing the l2ad_alloc refcount, we
-        * must remove the old pointer (that we're about to destroy) and
-        * add the new pointer to the refcount. Otherwise we'd remove
-        * the wrong pointer address when calling arc_hdr_destroy() later.
-        */
-
-       (void) refcount_remove_many(&dev->l2ad_alloc,
-           hdr->b_l2hdr.b_asize, hdr);
-
-       (void) refcount_add_many(&dev->l2ad_alloc,
-           nhdr->b_l2hdr.b_asize, nhdr);
-
-       buf_discard_identity(hdr);
-       hdr->b_freeze_cksum = NULL;
-       kmem_cache_free(old, hdr);
-
-       return (nhdr);
+       mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-
-#define        ARC_MINTIME     (hz>>4) /* 62 ms */
-
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
+       arc_buf_hdr_t *hdr = buf->b_hdr;
        zio_cksum_t zc;
 
        if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
-       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-       if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
-               mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       ASSERT(HDR_HAS_L1HDR(hdr));
+
+       mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+       if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+               mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
                return;
        }
-       fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
-       if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+       fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+       if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
                panic("buffer modified while frozen!");
-       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
-       zio_cksum_t zc;
-       int equal;
+       enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+       boolean_t valid_cksum;
 
-       mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-       fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
-       equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
-       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+       VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+       /*
+        * We rely on the blkptr's checksum to determine if the block
+        * is valid or not. When compressed arc is enabled, the l2arc
+        * writes the block to the l2arc just as it appears in the pool.
+        * This allows us to use the blkptr's checksum to validate the
+        * data that we just read off of the l2arc without having to store
+        * a separate checksum in the arc_buf_hdr_t. However, if compressed
+        * arc is disabled, then the data written to the l2arc is always
+        * uncompressed and won't match the block as it exists in the main
+        * pool. When this is the case, we must first compress it if it is
+        * compressed on the main pool before we can validate the checksum.
+        */
+       if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+               ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+               uint64_t lsize = HDR_GET_LSIZE(hdr);
+               uint64_t csize;
+
+               void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+               csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+               ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+               if (csize < HDR_GET_PSIZE(hdr)) {
+                       /*
+                        * Compressed blocks are always a multiple of the
+                        * smallest ashift in the pool. Ideally, we would
+                        * like to round up the csize to the next
+                        * spa_min_ashift but that value may have changed
+                        * since the block was last written. Instead,
+                        * we rely on the fact that the hdr's psize
+                        * was set to the psize of the block when it was
+                        * last written. We set the csize to that value
+                        * and zero out any part that should not contain
+                        * data.
+                        */
+                       bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+                       csize = HDR_GET_PSIZE(hdr);
+               }
+               zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+       }
 
-       return (equal);
+       /*
+        * Block pointers always store the checksum for the logical data.
+        * If the block pointer has the gang bit set, then the checksum
+        * it represents is for the reconstituted data and not for an
+        * individual gang member. The zio pipeline, however, must be able to
+        * determine the checksum of each of the gang constituents so it
+        * treats the checksum comparison differently than what we need
+        * for l2arc blocks. This prevents us from using the
+        * zio_checksum_error() interface directly. Instead we must call the
+        * zio_checksum_error_impl() so that we can ensure the checksum is
+        * generated using the correct checksum algorithm and accounts for the
+        * logical I/O size and not just a gang fragment.
+        */
+       valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+           BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+           zio->io_offset, NULL) == 0);
+       zio_pop_transforms(zio);
+       return (valid_cksum);
 }
 
 static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
 {
-       if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+       arc_buf_hdr_t *hdr = buf->b_hdr;
+
+       if (!(zfs_flags & ZFS_DEBUG_MODIFY))
                return;
 
+       ASSERT(HDR_HAS_L1HDR(hdr));
        mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-       if (buf->b_hdr->b_freeze_cksum != NULL) {
-               mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+               mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
                return;
        }
-       buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-       fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
-           NULL, buf->b_hdr->b_freeze_cksum);
-       mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+       hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+           KM_SLEEP);
+       fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+           hdr->b_l1hdr.b_freeze_cksum);
+       mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #ifdef illumos
        arc_buf_watch(buf);
 #endif
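
arc_cksum_compute()/arc_cksum_verify() in the hunk above implement a
debug-only "freeze checksum": record a fletcher-2 checksum of the logical
data when a buffer becomes read-only and panic if a frozen buffer is later
found modified. The same idea in a self-contained sketch, with a trivial
64-bit rolling sum standing in for fletcher_2_native():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Trivial checksum; a stand-in for fletcher_2_native(). */
    static uint64_t
    demo_cksum(const uint8_t *p, size_t n)
    {
            uint64_t sum = 0;

            for (size_t i = 0; i < n; i++)
                    sum = sum * 31 + p[i];
            return (sum);
    }

    typedef struct demo_frozen {
            uint8_t         *df_data;
            size_t          df_size;
            uint64_t        df_cksum;
            int             df_have_cksum;
    } demo_frozen_t;

    /* Record the checksum once the buffer is considered read-only. */
    static void
    demo_freeze(demo_frozen_t *df)
    {
            df->df_cksum = demo_cksum(df->df_data, df->df_size);
            df->df_have_cksum = 1;
    }

    /* Abort if a frozen buffer no longer matches its recorded checksum. */
    static void
    demo_verify(const demo_frozen_t *df)
    {
            if (df->df_have_cksum &&
                demo_cksum(df->df_data, df->df_size) != df->df_cksum) {
                    fprintf(stderr, "buffer modified while frozen!\n");
                    abort();
            }
    }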
@@ -1733,7 +1855,7 @@ arc_buf_watch(arc_buf_t *buf)
                procctl_t ctl;
                ctl.cmd = PCWATCH;
                ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
-               ctl.prwatch.pr_size = buf->b_hdr->b_size;
+               ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
                ctl.prwatch.pr_wflags = WA_WRITE;
                result = write(arc_procfd, &ctl, sizeof (ctl));
                ASSERT3U(result, ==, sizeof (ctl));
@@ -1745,11 +1867,14 @@ arc_buf_watch(arc_buf_t *buf)
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
+       arc_buf_contents_t type;
        if (HDR_ISTYPE_METADATA(hdr)) {
-               return (ARC_BUFC_METADATA);
+               type = ARC_BUFC_METADATA;
        } else {
-               return (ARC_BUFC_DATA);
+               type = ARC_BUFC_DATA;
        }
+       VERIFY3U(hdr->b_type, ==, type);
+       return (type);
 }
 
 static uint32_t
@@ -1771,29 +1896,29 @@ arc_bufc_to_flags(arc_buf_contents_t typ
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+       arc_buf_hdr_t *hdr = buf->b_hdr;
+
        if (zfs_flags & ZFS_DEBUG_MODIFY) {
-               if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
+               if (hdr->b_l1hdr.b_state != arc_anon)
                        panic("modifying non-anon buffer!");

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***