On Wed, Dec 04 2019 at  9:06P -0500,
Nikos Tsironis <[email protected]> wrote:

> dm-clone maintains an on-disk bitmap which records which regions are
> valid in the destination device, i.e., which regions have already been
> hydrated, or have been written to directly, via user I/O.
> 
> Setting a bit in the on-disk bitmap meas the corresponding region is
> valid in the destination device and we redirect all I/O regarding it to
> the destination device.
> 
> Suppose the destination device has a volatile write-back cache and the
> following sequence of events occur:
> 
> 1. A region gets hydrated, either through the background hydration or
>    because it was written to directly, via user I/O.
> 
> 2. The commit timeout expires and we commit the metadata, marking that
>    region as valid in the destination device.
> 
> 3. The system crashes and the destination device's cache has not been
>    flushed, meaning the region's data are lost.
> 
> The next time we read that region we read it from the destination
> device, since the metadata have been successfully committed, but the
> data are lost due to the crash, so we read garbage instead of the old
> data.
> 
> This has several implications:
> 
> 1. In case of background hydration or of writes with size smaller than
>    the region size (which means we first copy the whole region and then
>    issue the smaller write), we corrupt data that the user never
>    touched.
> 
> 2. In case of writes with size equal to the device's logical block size,
>    we fail to provide atomic sector writes. When the system recovers the
>    user will read garbage from the sector instead of the old data or the
>    new data.
> 
> 3. In case of writes without the FUA flag set, after the system
>    recovers, the written sectors will contain garbage instead of a
>    random mix of sectors containing either old data or new data, thus we
>    fail again to provide atomic sector writes.
> 
> 4. Even when the user flushes the dm-clone device, because we first
>    commit the metadata and then pass down the flush, the same risk for
>    corruption exists (if the system crashes after the metadata have been
>    committed but before the flush is passed down).
> 
> The only case which is unaffected is that of writes with size equal to
> the region size and with the FUA flag set. But, because FUA writes
> trigger metadata commits, this case can trigger the corruption
> indirectly.
> 
> To solve this and avoid the potential data corruption we flush the
> destination device **before** committing the metadata.
> 
> This ensures that any freshly hydrated regions, for which we commit the
> metadata, are properly written to non-volatile storage and won't be lost
> in case of a crash.
> 
> Fixes: 7431b7835f55 ("dm: add clone target")
> Cc: [email protected] # v5.4+
> Signed-off-by: Nikos Tsironis <[email protected]>
> ---
>  drivers/md/dm-clone-target.c | 46 
> ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
> index 613c913c296c..d1e1b5b56b1b 100644
> --- a/drivers/md/dm-clone-target.c
> +++ b/drivers/md/dm-clone-target.c
> @@ -86,6 +86,12 @@ struct clone {
>  
>       struct dm_clone_metadata *cmd;
>  
> +     /*
> +      * bio used to flush the destination device, before committing the
> +      * metadata.
> +      */
> +     struct bio flush_bio;
> +
>       /* Region hydration hash table */
>       struct hash_table_bucket *ht;
>  
> @@ -1108,10 +1114,13 @@ static bool need_commit_due_to_time(struct clone 
> *clone)
>  /*
>   * A non-zero return indicates read-only or fail mode.
>   */
> -static int commit_metadata(struct clone *clone)
> +static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
>  {
>       int r = 0;
>  
> +     if (dest_dev_flushed)
> +             *dest_dev_flushed = false;
> +
>       mutex_lock(&clone->commit_lock);
>  
>       if (!dm_clone_changed_this_transaction(clone->cmd))
> @@ -1128,6 +1137,19 @@ static int commit_metadata(struct clone *clone)
>               goto out;
>       }
>  
> +     bio_reset(&clone->flush_bio);
> +     bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
> +     clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> +
> +     r = submit_bio_wait(&clone->flush_bio);
> +     if (unlikely(r)) {
> +             __metadata_operation_failed(clone, "flush destination device", 
> r);
> +             goto out;
> +     }
> +
> +     if (dest_dev_flushed)
> +             *dest_dev_flushed = true;
> +
>       r = dm_clone_metadata_commit(clone->cmd);
>       if (unlikely(r)) {
>               __metadata_operation_failed(clone, "dm_clone_metadata_commit", 
> r);
> @@ -1199,6 +1221,7 @@ static void process_deferred_bios(struct clone *clone)
>  static void process_deferred_flush_bios(struct clone *clone)
>  {
>       struct bio *bio;
> +     bool dest_dev_flushed;
>       struct bio_list bios = BIO_EMPTY_LIST;
>       struct bio_list bio_completions = BIO_EMPTY_LIST;
>  
> @@ -1218,7 +1241,7 @@ static void process_deferred_flush_bios(struct clone 
> *clone)
>           !(dm_clone_changed_this_transaction(clone->cmd) && 
> need_commit_due_to_time(clone)))
>               return;
>  
> -     if (commit_metadata(clone)) {
> +     if (commit_metadata(clone, &dest_dev_flushed)) {
>               bio_list_merge(&bios, &bio_completions);
>  
>               while ((bio = bio_list_pop(&bios)))
> @@ -1232,8 +1255,17 @@ static void process_deferred_flush_bios(struct clone 
> *clone)
>       while ((bio = bio_list_pop(&bio_completions)))
>               bio_endio(bio);
>  
> -     while ((bio = bio_list_pop(&bios)))
> -             generic_make_request(bio);
> +     while ((bio = bio_list_pop(&bios))) {
> +             if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
> +                     /* We just flushed the destination device as part of
> +                      * the metadata commit, so there is no reason to send
> +                      * another flush.
> +                      */
> +                     bio_endio(bio);
> +             } else {
> +                     generic_make_request(bio);
> +             }
> +     }
>  }
>  
>  static void do_worker(struct work_struct *work)
> @@ -1405,7 +1437,7 @@ static void clone_status(struct dm_target *ti, 
> status_type_t type,
>  
>               /* Commit to ensure statistics aren't out-of-date */
>               if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && 
> !dm_suspended(ti))
> -                     (void) commit_metadata(clone);
> +                     (void) commit_metadata(clone, NULL);
>  
>               r = dm_clone_get_free_metadata_block_count(clone->cmd, 
> &nr_free_metadata_blocks);
>  
> @@ -1839,6 +1871,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int 
> argc, char **argv)
>       bio_list_init(&clone->deferred_flush_completions);
>       clone->hydration_offset = 0;
>       atomic_set(&clone->hydrations_in_flight, 0);
> +     bio_init(&clone->flush_bio, NULL, 0);
>  
>       clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
>       if (!clone->wq) {
> @@ -1912,6 +1945,7 @@ static void clone_dtr(struct dm_target *ti)
>       struct clone *clone = ti->private;
>  
>       mutex_destroy(&clone->commit_lock);
> +     bio_uninit(&clone->flush_bio);
>  
>       for (i = 0; i < clone->nr_ctr_args; i++)
>               kfree(clone->ctr_args[i]);
> @@ -1966,7 +2000,7 @@ static void clone_postsuspend(struct dm_target *ti)
>       wait_event(clone->hydration_stopped, 
> !atomic_read(&clone->hydrations_in_flight));
>       flush_workqueue(clone->wq);
>  
> -     (void) commit_metadata(clone);
> +     (void) commit_metadata(clone, NULL);
>  }
>  
>  static void clone_resume(struct dm_target *ti)
> -- 
> 2.11.0
> 


Like the dm-thin patch I replied to, would rather avoid open-coding
blkdev_issue_flush (also I check !bio_has_data), here is incremental:

---
 drivers/md/dm-clone-target.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index d1e1b5b56b1b..bce99bff8678 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -86,12 +86,6 @@ struct clone {
 
        struct dm_clone_metadata *cmd;
 
-       /*
-        * bio used to flush the destination device, before committing the
-        * metadata.
-        */
-       struct bio flush_bio;
-
        /* Region hydration hash table */
        struct hash_table_bucket *ht;
 
@@ -1137,11 +1131,7 @@ static int commit_metadata(struct clone *clone, bool 
*dest_dev_flushed)
                goto out;
        }
 
-       bio_reset(&clone->flush_bio);
-       bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
-       clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
-       r = submit_bio_wait(&clone->flush_bio);
+       r = blkdev_issue_flush(clone->dest_dev->bdev, GFP_NOIO, NULL);
        if (unlikely(r)) {
                __metadata_operation_failed(clone, "flush destination device", 
r);
                goto out;
@@ -1256,7 +1246,8 @@ static void process_deferred_flush_bios(struct clone 
*clone)
                bio_endio(bio);
 
        while ((bio = bio_list_pop(&bios))) {
-               if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
+               if (dest_dev_flushed &&
+                   (bio->bi_opf & REQ_PREFLUSH) && !bio_has_data(bio)) {
                        /* We just flushed the destination device as part of
                         * the metadata commit, so there is no reason to send
                         * another flush.
@@ -1871,7 +1862,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int 
argc, char **argv)
        bio_list_init(&clone->deferred_flush_completions);
        clone->hydration_offset = 0;
        atomic_set(&clone->hydrations_in_flight, 0);
-       bio_init(&clone->flush_bio, NULL, 0);
 
        clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
        if (!clone->wq) {
@@ -1945,7 +1935,6 @@ static void clone_dtr(struct dm_target *ti)
        struct clone *clone = ti->private;
 
        mutex_destroy(&clone->commit_lock);
-       bio_uninit(&clone->flush_bio);
 
        for (i = 0; i < clone->nr_ctr_args; i++)
                kfree(clone->ctr_args[i]);
-- 
2.15.0


--
dm-devel mailing list
[email protected]
https://www.redhat.com/mailman/listinfo/dm-devel

Reply via email to