This patch implement data repair function when direct read fails.

The detail of the implementation is:
- When we find the data is not right, we try to read the data from the other
  mirror.
- When the io on the mirror ends, we will insert the endio work into the
  dedicated btrfs workqueue, not common read endio workqueue, because the
  original endio work is still blocked in the btrfs endio workqueue, if we
  insert the endio work of the io on the mirror into that workqueue, deadlock
  would happen.
- After we get right data, we write it back to the corrupted mirror.
- And if the data on the new mirror is still corrupted, we will try next
  mirror until we read right data or all the mirrors are traversed.
- After the above work, we set the uptodate flag according to the result.

Signed-off-by: Miao Xie <mi...@cn.fujitsu.com>
---
Changelog v3 -> v4:
- Use a dedicated btrfs workqueue instead of the system workqueue to
  deal with the completed repair bio, this suggest was from Chris.

Changelog v1 -> v3:
- None
---
 fs/btrfs/async-thread.c |   1 +
 fs/btrfs/async-thread.h |   1 +
 fs/btrfs/btrfs_inode.h  |   2 +-
 fs/btrfs/ctree.h        |   1 +
 fs/btrfs/disk-io.c      |  11 +-
 fs/btrfs/disk-io.h      |   1 +
 fs/btrfs/extent_io.c    |  12 ++-
 fs/btrfs/extent_io.h    |   5 +-
 fs/btrfs/inode.c        | 276 ++++++++++++++++++++++++++++++++++++++++++++----
 9 files changed, 281 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index fbd76de..2da0a66 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper);
 BTRFS_WORK_HELPER(endio_meta_helper);
 BTRFS_WORK_HELPER(endio_meta_write_helper);
 BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
 BTRFS_WORK_HELPER(rmw_helper);
 BTRFS_WORK_HELPER(endio_write_helper);
 BTRFS_WORK_HELPER(freespace_write_helper);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e9e31c9..e386c29 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
 BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
 BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
 BTRFS_WORK_HELPER_PROTO(rmw_helper);
 BTRFS_WORK_HELPER_PROTO(endio_write_helper);
 BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4d30947..7a7521c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -271,7 +271,7 @@ struct btrfs_dio_private {
         * The original bio may be splited to several sub-bios, this is
         * done during endio of sub-bios
         */
-       int (*subio_endio)(struct inode *, struct btrfs_io_bio *);
+       int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7b54cd9..63acfd8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1538,6 +1538,7 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *endio_workers;
        struct btrfs_workqueue *endio_meta_workers;
        struct btrfs_workqueue *endio_raid56_workers;
+       struct btrfs_workqueue *endio_repair_workers;
        struct btrfs_workqueue *rmw_workers;
        struct btrfs_workqueue *endio_meta_write_workers;
        struct btrfs_workqueue *endio_write_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ff3ee22..1594d91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -713,7 +713,11 @@ static void end_workqueue_bio(struct bio *bio, int err)
                        func = btrfs_endio_write_helper;
                }
        } else {
-               if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+               if (unlikely(end_io_wq->metadata ==
+                            BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+                       wq = fs_info->endio_repair_workers;
+                       func = btrfs_endio_repair_helper;
+               } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
                        wq = fs_info->endio_raid56_workers;
                        func = btrfs_endio_raid56_helper;
                } else if (end_io_wq->metadata) {
@@ -741,6 +745,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct 
bio *bio,
                        int metadata)
 {
        struct end_io_wq *end_io_wq;
+
        end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
        if (!end_io_wq)
                return -ENOMEM;
@@ -2059,6 +2064,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info 
*fs_info)
        btrfs_destroy_workqueue(fs_info->endio_workers);
        btrfs_destroy_workqueue(fs_info->endio_meta_workers);
        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+       btrfs_destroy_workqueue(fs_info->endio_repair_workers);
        btrfs_destroy_workqueue(fs_info->rmw_workers);
        btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2576,6 +2582,8 @@ int open_ctree(struct super_block *sb,
                btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
        fs_info->endio_raid56_workers =
                btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+       fs_info->endio_repair_workers =
+               btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
        fs_info->rmw_workers =
                btrfs_alloc_workqueue("rmw", flags, max_active, 2);
        fs_info->endio_write_workers =
@@ -2597,6 +2605,7 @@ int open_ctree(struct super_block *sb,
              fs_info->submit_workers && fs_info->flush_workers &&
              fs_info->endio_workers && fs_info->endio_meta_workers &&
              fs_info->endio_meta_write_workers &&
+             fs_info->endio_repair_workers &&
              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
              fs_info->endio_freespace_worker && fs_info->rmw_workers &&
              fs_info->caching_workers && fs_info->readahead_workers &&
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 52a17db..14d06ee 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -30,6 +30,7 @@ enum {
        BTRFS_WQ_ENDIO_METADATA = 1,
        BTRFS_WQ_ENDIO_FREE_SPACE = 2,
        BTRFS_WQ_ENDIO_RAID56 = 3,
+       BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 94c5c04..86dc352 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1962,7 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree 
*tree, struct page *page)
                SetPageUptodate(page);
 }
 
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
        int ret;
        int err = 0;
@@ -2081,8 +2081,8 @@ int repair_eb_io_failure(struct btrfs_root *root, struct 
extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-static int clean_io_failure(struct inode *inode, u64 start,
-                           struct page *page, unsigned int pg_offset)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+                    unsigned int pg_offset)
 {
        u64 private;
        u64 private_failure;
@@ -2291,7 +2291,7 @@ int btrfs_check_repairable(struct inode *inode, struct 
bio *failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio 
*failed_bio,
                                    struct io_failure_record *failrec,
                                    struct page *page, int pg_offset, int icsum,
-                                   bio_end_io_t *endio_func)
+                                   bio_end_io_t *endio_func, void *data)
 {
        struct bio *bio;
        struct btrfs_io_bio *btrfs_failed_bio;
@@ -2305,6 +2305,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, 
struct bio *failed_bio,
        bio->bi_iter.bi_sector = failrec->logical >> 9;
        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
        bio->bi_iter.bi_size = 0;
+       bio->bi_private = data;
 
        btrfs_failed_bio = btrfs_io_bio(failed_bio);
        if (btrfs_failed_bio->csum) {
@@ -2362,7 +2363,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 
phy_offset,
        phy_offset >>= inode->i_sb->s_blocksize_bits;
        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
                                      start - page_offset(page),
-                                     (int)phy_offset, failed_bio->bi_end_io);
+                                     (int)phy_offset, failed_bio->bi_end_io,
+                                     NULL);
        if (!bio) {
                free_io_failure(inode, failrec);
                return -EIO;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bf0597f..176a4b1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -341,6 +341,8 @@ struct btrfs_fs_info;
 int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                      struct page *page, unsigned int pg_offset,
                      int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+                    unsigned int pg_offset);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
                         int mirror_num);
@@ -371,7 +373,8 @@ int btrfs_check_repairable(struct inode *inode, struct bio 
*failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio 
*failed_bio,
                                    struct io_failure_record *failrec,
                                    struct page *page, int pg_offset, int icsum,
-                                   bio_end_io_t *endio_func);
+                                   bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 noinline u64 find_lock_delalloc_range(struct inode *inode,
                                      struct extent_io_tree *tree,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cf79f79..bc8cdaf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7198,30 +7198,267 @@ unlock_err:
        return ret;
 }
 
-static int btrfs_subio_endio_read(struct inode *inode,
-                                 struct btrfs_io_bio *io_bio)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                       int rw, int mirror_num)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       BUG_ON(rw & REQ_WRITE);
+
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                 BTRFS_WQ_ENDIO_DIO_REPAIR);
+       if (ret)
+               goto err;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+       bio_put(bio);
+       return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                     struct bio *failed_bio,
+                                     struct io_failure_record *failrec,
+                                     int failed_mirror)
+{
+       int num_copies;
+
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, 
next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       failrec->failed_mirror = failed_mirror;
+       failrec->this_mirror++;
+       if (failrec->this_mirror == failed_mirror)
+               failrec->this_mirror++;
+
+       if (failrec->this_mirror > num_copies) {
+               pr_debug("Check DIO Repairable: (fail) num_copies=%d, 
next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         int failed_mirror, bio_end_io_t *repair_endio,
+                         void *repair_arg)
+{
+       struct io_failure_record *failrec;
+       struct bio *bio;
+       int isector;
+       int read_mode;
+       int ret;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+       if (ret)
+               return ret;
+
+       ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                        failed_mirror);
+       if (!ret) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       if (failed_bio->bi_vcnt > 1)
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       else
+               read_mode = READ_SYNC;
+
+       isector = start - btrfs_io_bio(failed_bio)->logical;
+       isector >>= inode->i_sb->s_blocksize_bits;
+       bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                     0, isector, repair_endio, repair_arg);
+       if (!bio) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                   "Repair DIO Read Error: submitting new dio read[%#x] to 
this_mirror=%d, in_validation=%d\n",
+                   read_mode, failrec->this_mirror, failrec->in_validation);
+
+       ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                   failrec->this_mirror);
+       if (ret) {
+               free_io_failure(inode, failrec);
+               bio_put(bio);
+       }
+
+       return ret;
+}
+
+struct btrfs_retry_complete {
+       struct completion done;
+       struct inode *inode;
+       u64 start;
+       int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct bio_vec *bvec;
+       int i;
+
+       if (err)
+               goto end;
+
+       done->uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i)
+               clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                      struct btrfs_io_bio *io_bio)
 {
        struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
        u64 start;
        int i;
        int ret;
-       int err = 0;
 
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-               return 0;
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio_nocsum, &done);
+               if (ret)
+                       return ret;
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+
+               start += bvec->bv_len;
+       }
+
+       return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct bio_vec *bvec;
+       int uptodate;
+       int ret;
+       int i;
+
+       if (err)
+               goto end;
+
+       uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i) {
+               ret = __readpage_endio_check(done->inode, io_bio, i,
+                                            bvec->bv_page, 0,
+                                            done->start, bvec->bv_len);
+               if (!ret)
+                       clean_io_failure(done->inode, done->start,
+                                        bvec->bv_page, 0);
+               else
+                       uptodate = 0;
+       }
+
+       done->uptodate = uptodate;
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
 
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                   struct btrfs_io_bio *io_bio, int err)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       u64 offset = 0;
+       int i;
+       int ret;
+
+       err = 0;
        start = io_bio->logical;
+       done.inode = inode;
+
        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
                                             0, start, bvec->bv_len);
-               if (ret)
-                       err = -EIO;
+               if (likely(!ret))
+                       goto next;
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio, &done);
+               if (ret) {
+                       err = ret;
+                       goto next;
+               }
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+next:
+               offset += bvec->bv_len;
                start += bvec->bv_len;
        }
 
        return err;
 }
 
+static int btrfs_subio_endio_read(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio, int err)
+{
+       bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+       if (skip_csum) {
+               if (unlikely(err))
+                       return __btrfs_correct_data_nocsum(inode, io_bio);
+               else
+                       return 0;
+       } else {
+               return __btrfs_subio_endio_read(inode, io_bio, err);
+       }
+}
+
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
@@ -7229,8 +7466,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int 
err)
        struct bio *dio_bio;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 
-       if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED))
-               err = btrfs_subio_endio_read(inode, io_bio);
+       if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+               err = btrfs_subio_endio_read(inode, io_bio, err);
 
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
@@ -7309,19 +7546,16 @@ static int __btrfs_submit_bio_start_direct_io(struct 
inode *inode, int rw,
 static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
-       int ret;
 
-       if (err) {
-               btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                         "direct IO failed ino %llu rw %lu sector %#Lx len %u 
err no %d",
-                     btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_iter.bi_sector,
-                     bio->bi_iter.bi_size, err);
-       } else if (dip->subio_endio) {
-               ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio));
-               if (ret)
-                       err = ret;
-       }
+       if (err)
+               btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                          "direct IO failed ino %llu rw %lu sector %#Lx len %u 
err no %d",
+                          btrfs_ino(dip->inode), bio->bi_rw,
+                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_iter.bi_size, err);
+
+       if (dip->subio_endio)
+               err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
 
        if (err) {
                dip->errors = 1;
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to