Author: avg
Date: Tue May 17 08:43:50 2016
New Revision: 300039
URL: https://svnweb.freebsd.org/changeset/base/300039

Log:
  MFC r297848: l2arc: make sure that all writes honor ashift of a cache device
  
  Note: no MFC stable/9 because it has become quite out of date with head,
  so the merge would be quite labourious and, thus, risky.

Modified:
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Tue May 
17 08:36:54 2016        (r300038)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c      Tue May 
17 08:43:50 2016        (r300039)
@@ -565,6 +565,7 @@ typedef struct arc_stats {
        kstat_named_t arcstat_l2_compress_successes;
        kstat_named_t arcstat_l2_compress_zeros;
        kstat_named_t arcstat_l2_compress_failures;
+       kstat_named_t arcstat_l2_padding_needed;
        kstat_named_t arcstat_l2_write_trylock_fail;
        kstat_named_t arcstat_l2_write_passed_headroom;
        kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -666,6 +667,7 @@ static arc_stats_t arc_stats = {
        { "l2_compress_successes",      KSTAT_DATA_UINT64 },
        { "l2_compress_zeros",          KSTAT_DATA_UINT64 },
        { "l2_compress_failures",       KSTAT_DATA_UINT64 },
+       { "l2_padding_needed",          KSTAT_DATA_UINT64 },
        { "l2_write_trylock_fail",      KSTAT_DATA_UINT64 },
        { "l2_write_passed_headroom",   KSTAT_DATA_UINT64 },
        { "l2_write_spa_mismatch",      KSTAT_DATA_UINT64 },
@@ -839,7 +841,7 @@ typedef struct l1arc_buf_hdr {
        refcount_t              b_refcnt;
 
        arc_callback_t          *b_acb;
-       /* temporary buffer holder for in-flight compressed data */
+       /* temporary buffer holder for in-flight compressed or padded data */
        void                    *b_tmp_cdata;
 } l1arc_buf_hdr_t;
 
@@ -1100,6 +1102,7 @@ typedef struct l2arc_read_callback {
        zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
        int                     l2rcb_flags;            /* original flags */
        enum zio_compress       l2rcb_compress;         /* applied compress */
+       void                    *l2rcb_data;            /* temporary buffer */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -1130,7 +1133,7 @@ static uint32_t arc_bufc_to_flags(arc_bu
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 
-static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
+static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t);
 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 
@@ -2217,6 +2220,8 @@ arc_buf_data_free(arc_buf_t *buf, void (
 static void
 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
 {
+       size_t align, asize, len;
+
        ASSERT(HDR_HAS_L2HDR(hdr));
        ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
 
@@ -2238,16 +2243,15 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr
        }
 
        /*
-        * The header does not have compression enabled. This can be due
-        * to the buffer not being compressible, or because we're
-        * freeing the buffer before the second phase of
-        * l2arc_write_buffer() has started (which does the compression
-        * step). In either case, b_tmp_cdata does not point to a
-        * separately compressed buffer, so there's nothing to free (it
-        * points to the same buffer as the arc_buf_t's b_data field).
-        */
-       if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
-               hdr->b_l1hdr.b_tmp_cdata = NULL;
+        * The bufer has been chosen for writing to L2ARC, but it's
+        * not being written just yet.  In other words,
+        * b_tmp_cdata points to exactly the same buffer as b_data,
+        * l2arc_transform_buf hasn't been called.
+        */
+       if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) {
+               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==,
+                   hdr->b_l1hdr.b_buf->b_data);
+               ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF);
                return;
        }
 
@@ -2260,12 +2264,18 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr
                return;
        }
 
-       ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
-
-       arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
-           hdr->b_size, zio_data_buf_free);
+       /*
+        * Nothing to do if the temporary buffer was not required.
+        */
+       if (hdr->b_l1hdr.b_tmp_cdata == NULL)
+               return;
 
        ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
+       len = hdr->b_size;
+       align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
+       asize = P2ROUNDUP(len, align);
+       arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize,
+           zio_data_buf_free);
        hdr->b_l1hdr.b_tmp_cdata = NULL;
 }
 
@@ -4528,6 +4538,7 @@ top:
                            !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
                            !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
                                l2arc_read_callback_t *cb;
+                               void* b_data;
 
                                DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
                                ARCSTAT_BUMP(arcstat_l2_hits);
@@ -4540,6 +4551,14 @@ top:
                                cb->l2rcb_zb = *zb;
                                cb->l2rcb_flags = zio_flags;
                                cb->l2rcb_compress = b_compress;
+                               if (b_asize > hdr->b_size) {
+                                       ASSERT3U(b_compress, ==,
+                                           ZIO_COMPRESS_OFF);
+                                       b_data = zio_data_buf_alloc(b_asize);
+                                       cb->l2rcb_data = b_data;
+                               } else {
+                                       b_data = buf->b_data;
+                               }
 
                                ASSERT(addr >= VDEV_LABEL_START_SIZE &&
                                    addr + size < vd->vdev_psize -
@@ -4552,6 +4571,7 @@ top:
                                 * was squashed to zero size by compression.
                                 */
                                if (b_compress == ZIO_COMPRESS_EMPTY) {
+                                       ASSERT3U(b_asize, ==, 0);
                                        rzio = zio_null(pio, spa, vd,
                                            l2arc_read_done, cb,
                                            zio_flags | ZIO_FLAG_DONT_CACHE |
@@ -4560,7 +4580,7 @@ top:
                                            ZIO_FLAG_DONT_RETRY);
                                } else {
                                        rzio = zio_read_phys(pio, vd, addr,
-                                           b_asize, buf->b_data,
+                                           b_asize, b_data,
                                            ZIO_CHECKSUM_OFF,
                                            l2arc_read_done, cb, priority,
                                            zio_flags | ZIO_FLAG_DONT_CACHE |
@@ -6045,6 +6065,32 @@ l2arc_read_done(zio_t *zio)
        ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
        /*
+        * If the data was read into a temporary buffer,
+        * move it and free the buffer.
+        */
+       if (cb->l2rcb_data != NULL) {
+               ASSERT3U(hdr->b_size, <, zio->io_size);
+               ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF);
+               if (zio->io_error == 0)
+                       bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size);
+
+               /*
+                * The following must be done regardless of whether
+                * there was an error:
+                * - free the temporary buffer
+                * - point zio to the real ARC buffer
+                * - set zio size accordingly
+                * These are required because zio is either re-used for
+                * an I/O of the block in the case of the error
+                * or the zio is passed to arc_read_done() and it
+                * needs real data.
+                */
+               zio_data_buf_free(cb->l2rcb_data, zio->io_size);
+               zio->io_size = zio->io_orig_size = hdr->b_size;
+               zio->io_data = zio->io_orig_data = buf->b_data;
+       }
+
+       /*
         * If the buffer was compressed, decompress it first.
         */
        if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
@@ -6328,6 +6374,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                        kmutex_t *hash_lock;
                        uint64_t buf_sz;
                        uint64_t buf_a_sz;
+                       size_t align;
 
                        if (arc_warm == B_FALSE)
                                hdr_prev = multilist_sublist_next(mls, hdr);
@@ -6365,7 +6412,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                         * disk block size.
                         */
                        buf_sz = hdr->b_size;
-                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+                       align = (size_t)1 << dev->l2ad_vdev->vdev_ashift;
+                       buf_a_sz = P2ROUNDUP(buf_sz, align);
 
                        if ((write_asize + buf_a_sz) > target_sz) {
                                full = B_TRUE;
@@ -6469,26 +6517,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
        mutex_enter(&dev->l2ad_mtx);
 
        /*
-        * Note that elsewhere in this file arcstat_l2_asize
-        * and the used space on l2ad_vdev are updated using b_asize,
-        * which is not necessarily rounded up to the device block size.
-        * Too keep accounting consistent we do the same here as well:
-        * stats_size accumulates the sum of b_asize of the written buffers,
-        * while write_asize accumulates the sum of b_asize rounded up
-        * to the device block size.
-        * The latter sum is used only to validate the corectness of the code.
-        */
-       uint64_t stats_size = 0;
-       write_asize = 0;
-
-       /*
         * Now start writing the buffers. We're starting at the write head
         * and work backwards, retracing the course of the buffer selector
         * loop above.
         */
+       write_asize = 0;
        for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
            hdr = list_prev(&dev->l2ad_buflist, hdr)) {
                uint64_t buf_sz;
+               boolean_t compress;
 
                /*
                 * We rely on the L1 portion of the header below, so
@@ -6507,22 +6544,26 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                 */
                hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 
-               if ((HDR_L2COMPRESS(hdr)) &&
-                   hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
-                       if (l2arc_compress_buf(hdr)) {
-                               /*
-                                * If compression succeeded, enable headroom
-                                * boost on the next scan cycle.
-                                */
-                               *headroom_boost = B_TRUE;
-                       }
+               /*
+                * Save a pointer to the original buffer data we had previously
+                * stashed away.
+                */
+               buf_data = hdr->b_l1hdr.b_tmp_cdata;
+
+               compress = HDR_L2COMPRESS(hdr) &&
+                   hdr->b_l2hdr.b_asize >= buf_compress_minsz;
+               if (l2arc_transform_buf(hdr, compress)) {
+                       /*
+                        * If compression succeeded, enable headroom
+                        * boost on the next scan cycle.
+                        */
+                       *headroom_boost = B_TRUE;
                }
 
                /*
-                * Pick up the buffer data we had previously stashed away
-                * (and now potentially also compressed).
+                * Get the new buffer size that accounts for compression
+                * and padding.
                 */
-               buf_data = hdr->b_l1hdr.b_tmp_cdata;
                buf_sz = hdr->b_l2hdr.b_asize;
 
                /*
@@ -6534,8 +6575,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 
                /* Compression may have squashed the buffer to zero length. */
                if (buf_sz != 0) {
-                       uint64_t buf_a_sz;
-
+                       /*
+                        * If the data was padded or compressed, then it
+                        * it is in a new buffer.
+                        */
+                       if (hdr->b_l1hdr.b_tmp_cdata != NULL)
+                               buf_data = hdr->b_l1hdr.b_tmp_cdata;
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
                            NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
@@ -6545,14 +6590,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
                            zio_t *, wzio);
                        (void) zio_nowait(wzio);
 
-                       stats_size += buf_sz;
-
-                       /*
-                        * Keep the clock hand suitably device-aligned.
-                        */
-                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-                       write_asize += buf_a_sz;
-                       dev->l2ad_hand += buf_a_sz;
+                       write_asize += buf_sz;
+                       dev->l2ad_hand += buf_sz;
                }
        }
 
@@ -6562,8 +6601,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       ARCSTAT_INCR(arcstat_l2_asize, stats_size);
-       vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
+       ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+       vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.
@@ -6582,12 +6621,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 }
 
 /*
- * Compresses an L2ARC buffer.
+ * Transforms, possibly compresses and pads, an L2ARC buffer.
  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
  * size in l2hdr->b_asize. This routine tries to compress the data and
  * depending on the compression result there are three possible outcomes:
- * *) The buffer was incompressible. The original l2hdr contents were left
- *    untouched and are ready for writing to an L2 device.
+ * *) The buffer was incompressible. The buffer size was already ashift 
aligned.
+ *    The original hdr contents were left untouched except for b_tmp_cdata,
+ *    which is reset to NULL. The caller must keep a pointer to the original
+ *    data.
+ * *) The buffer was incompressible. The buffer size was not ashift aligned.
+ *    b_tmp_cdata was replaced with a temporary data buffer which holds a 
padded
+ *    (aligned) copy of the data. Once writing is done, invoke
+ *    l2arc_release_cdata_buf on this hdr to free the temporary buffer.
  * *) The buffer was all-zeros, so there is no need to write it to an L2
  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
@@ -6601,10 +6646,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
  * buffer was incompressible).
  */
 static boolean_t
-l2arc_compress_buf(arc_buf_hdr_t *hdr)
+l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress)
 {
        void *cdata;
-       size_t csize, len, rounded;
+       size_t align, asize, csize, len, rounded;
+
        ASSERT(HDR_HAS_L2HDR(hdr));
        l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 
@@ -6613,14 +6659,19 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
        ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
 
        len = l2hdr->b_asize;
-       cdata = zio_data_buf_alloc(len);
+       align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift;
+       asize = P2ROUNDUP(len, align);
+       cdata = zio_data_buf_alloc(asize);
        ASSERT3P(cdata, !=, NULL);
-       csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
-           cdata, l2hdr->b_asize);
+       if (compress)
+               csize = zio_compress_data(ZIO_COMPRESS_LZ4,
+                   hdr->b_l1hdr.b_tmp_cdata, cdata, len);
+       else
+               csize = len;
 
        if (csize == 0) {
                /* zero block, indicate that there's nothing to write */
-               zio_data_buf_free(cdata, len);
+               zio_data_buf_free(cdata, asize);
                l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
                l2hdr->b_asize = 0;
                hdr->b_l1hdr.b_tmp_cdata = NULL;
@@ -6628,8 +6679,8 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
                return (B_TRUE);
        }
 
-       rounded = P2ROUNDUP(csize,
-           (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
+       rounded = P2ROUNDUP(csize, align);
+       ASSERT3U(rounded, <=, asize);
        if (rounded < len) {
                /*
                 * Compression succeeded, we'll keep the cdata around for
@@ -6646,11 +6697,32 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
                return (B_TRUE);
        } else {
                /*
-                * Compression failed, release the compressed buffer.
-                * l2hdr will be left unmodified.
+                * Compression did not save space.
                 */
-               zio_data_buf_free(cdata, len);
-               ARCSTAT_BUMP(arcstat_l2_compress_failures);
+               if (P2PHASE(len, align) != 0) {
+                       /*
+                        * Use compression buffer for a copy of data padded to
+                        * the proper size.  Compression algorithm remains set
+                        * to ZIO_COMPRESS_OFF.
+                        */
+                       ASSERT3U(len, <, asize);
+                       bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len);
+                       bzero((char *)cdata + len, asize - len);
+                       l2hdr->b_asize = asize;
+                       hdr->b_l1hdr.b_tmp_cdata = cdata;
+                       ARCSTAT_BUMP(arcstat_l2_padding_needed);
+               } else {
+                       ASSERT3U(len, ==, asize);
+                       /*
+                        * The original buffer is good as is,
+                        * release the compressed buffer.
+                        * l2hdr will be left unmodified except for b_tmp_cdata.
+                        */
+                       zio_data_buf_free(cdata, asize);
+                       hdr->b_l1hdr.b_tmp_cdata = NULL;
+               }
+               if (compress)
+                       ARCSTAT_BUMP(arcstat_l2_compress_failures);
                return (B_FALSE);
        }
 }
@@ -6719,44 +6791,30 @@ l2arc_decompress_zio(zio_t *zio, arc_buf
 
 /*
  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
- * This buffer serves as a temporary holder of compressed data while
+ * This buffer serves as a temporary holder of compressed or padded data while
  * the buffer entry is being written to an l2arc device. Once that is
  * done, we can dispose of it.
  */
 static void
 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
 {
-       ASSERT(HDR_HAS_L2HDR(hdr));
+       size_t align, asize, len;
        enum zio_compress comp = hdr->b_l2hdr.b_compress;
 
+       ASSERT(HDR_HAS_L2HDR(hdr));
        ASSERT(HDR_HAS_L1HDR(hdr));
        ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
 
-       if (comp == ZIO_COMPRESS_OFF) {
-               /*
-                * In this case, b_tmp_cdata points to the same buffer
-                * as the arc_buf_t's b_data field. We don't want to
-                * free it, since the arc_buf_t will handle that.
-                */
+       if (hdr->b_l1hdr.b_tmp_cdata != NULL) {
+               ASSERT(comp != ZIO_COMPRESS_EMPTY);
+               len = hdr->b_size;
+               align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
+               asize = P2ROUNDUP(len, align);
+               zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize);
                hdr->b_l1hdr.b_tmp_cdata = NULL;
-       } else if (comp == ZIO_COMPRESS_EMPTY) {
-               /*
-                * In this case, b_tmp_cdata was compressed to an empty
-                * buffer, thus there's nothing to free and b_tmp_cdata
-                * should have been set to NULL in l2arc_write_buffers().
-                */
-               ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
        } else {
-               /*
-                * If the data was compressed, then we've allocated a
-                * temporary buffer for it, so now we need to release it.
-                */
-               ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
-               zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
-                   hdr->b_size);
-               hdr->b_l1hdr.b_tmp_cdata = NULL;
+               ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY);
        }
-
 }
 
 /*

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c      Tue May 
17 08:36:54 2016        (r300038)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c      Tue May 
17 08:43:50 2016        (r300039)
@@ -2810,11 +2810,13 @@ zio_vdev_io_start(zio_t *zio)
                ASSERT0(P2PHASE(zio->io_size, align));
        } else {
                /*
-                * For physical writes, we allow 512b aligned writes and assume
-                * the device will perform a read-modify-write as necessary.
+                * For the physical io we allow alignment
+                * to a logical block size.
                 */
-               ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
-               ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+               uint64_t log_align =
+                   1ULL << vd->vdev_top->vdev_logical_ashift;
+               ASSERT0(P2PHASE(zio->io_offset, log_align));
+               ASSERT0(P2PHASE(zio->io_size, log_align));
        }
 
        VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-stable-10
To unsubscribe, send any mail to "[email protected]"

Reply via email to