From: Zhao Lei <[email protected]>

Add scrub support for partial csum.
The only challenge is that scrub is done in units of bio (or page size),
while partial csum is done in units of 1/8 of the nodesize.

So a new function, scrub_check_node_checksum(), and a new tree block
csum check loop are introduced to do the partial csum check while
reading the tree block.

Signed-off-by: Zhao Lei <[email protected]>
Signed-off-by: Qu Wenruo <[email protected]>
---
 fs/btrfs/scrub.c | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 206 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab58115..0610474 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -307,6 +307,7 @@ static void copy_nocow_pages_worker(struct btrfs_work 
*work);
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_put_ctx(struct scrub_ctx *sctx);
+static int scrub_check_fsid(u8 fsid[], struct scrub_page *spage);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -878,6 +879,91 @@ static inline void scrub_put_recover(struct scrub_recover 
*recover)
 }
 
 /*
+ * Page_bad arg should be a page include leaf header
+ *
+ * Return 0 if this header seems correct,
+ * Return 1 on other cases
+ */
+static int scrub_check_head(struct scrub_page *spage, u8 *csum)
+{
+       void *mapped_buffer;
+       struct btrfs_header *h;
+
+       mapped_buffer = kmap_atomic(spage->page);
+       h = (struct btrfs_header *)mapped_buffer;
+
+       if (spage->logical != btrfs_stack_header_bytenr(h))
+               goto header_err;
+       if (!scrub_check_fsid(h->fsid, spage))
+               goto header_err;
+       if (memcmp(h->chunk_tree_uuid,
+                  spage->dev->dev_root->fs_info->chunk_tree_uuid,
+                  BTRFS_UUID_SIZE))
+               goto header_err;
+       if (spage->generation != btrfs_stack_header_generation(h))
+               goto header_err;
+
+       if (csum)
+               memcpy(csum, h->csum, sizeof(h->csum));
+
+       kunmap_atomic(mapped_buffer);
+       return 0;
+
+header_err:
+       kunmap_atomic(mapped_buffer);
+       return 1;
+}
+
+/*
+ * return 1 if checksum ok, 0 on other case
+ */
+static int scrub_check_node_checksum(struct scrub_block *sblock,
+                                    int part,
+                                    u8 *csum)
+{
+       int offset;
+       int len;
+       u32 crc = ~(u32)0;
+
+       if (part == 0) {
+               offset = BTRFS_CSUM_SIZE;
+               len = sblock->sctx->nodesize - BTRFS_CSUM_SIZE;
+       } else if (part == 1) {
+               offset = BTRFS_CSUM_SIZE;
+               len = sblock->sctx->nodesize * 2 / 8 - BTRFS_CSUM_SIZE;
+       } else {
+               offset = part * sblock->sctx->nodesize / 8;
+               len = sblock->sctx->nodesize / 8;
+       }
+
+       while (len > 0) {
+               int page_num = offset / PAGE_SIZE;
+               int page_data_offset = offset - page_num * PAGE_SIZE;
+               int page_data_len = min(len,
+                                       (int)(PAGE_SIZE - page_data_offset));
+               u8 *mapped_buffer;
+
+               WARN_ON(page_num >= sblock->page_count);
+
+               if (sblock->pagev[page_num]->io_error)
+                       return 0;
+
+               mapped_buffer = kmap_atomic(
+                                     sblock->pagev[page_num]->page);
+
+               crc = btrfs_csum_data(mapped_buffer + page_data_offset, crc,
+                                     page_data_len);
+
+               offset += page_data_len;
+               len -= page_data_len;
+
+               kunmap_atomic(mapped_buffer);
+       }
+       btrfs_csum_final(crc, (char *)&crc);
+       return (crc == ((u32 *)csum)[part]);
+}
+
+/*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
  * case, this function handles all pages in the bio, even though only one
@@ -905,6 +991,9 @@ static int scrub_handle_errored_block(struct scrub_block 
*sblock_to_check)
        int success;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
+       u8 node_csum[BTRFS_CSUM_SIZE];
+       int get_right_sum = 0;
+       int per_page_recover_start = 0;
 
        BUG_ON(sblock_to_check->page_count < 1);
        fs_info = sctx->dev_root->fs_info;
@@ -1151,11 +1240,125 @@ nodatasum_case:
         * area are unreadable.
         */
        success = 1;
+
+       /*
+        * Some mirror's header may be corrupted; pick a mirror whose
+        * header is valid so we get a trustworthy checksum to verify with.
+        */
+       for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS &&
+            sblocks_for_recheck[mirror_index].page_count > 0;
+            mirror_index++) {
+               if (scrub_check_head(sblocks_for_recheck[mirror_index].pagev[0],
+                                    node_csum) == 0) {
+                       get_right_sum = 1;
+                       break;
+               }
+       }
+
        for (page_num = 0; page_num < sblock_bad->page_count;
             page_num++) {
                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
                struct scrub_block *sblock_other = NULL;
 
+               if (is_metadata && get_right_sum) {
+                       /*
+                        * For tree block which may support partial csum
+                        *
+                        * | page | page | page | page | page | page |
+                        * |   checksum  |   checksum  |   checksum  |
+                        *               ^      ^
+                        *               |      |
+                        *               |      page_num
+                        *               |
+                        *               per_page_recover_start
+                        *
+                        * |<-  done   ->|
+                        */
+                       int start_csum_part;
+                       int next_csum_part;
+                       int sub_page_num;
+
+                       /*
+                        * Don't worry that start_csum_part is rounded down
+                        * here: per_page_recover_start always aligns with a
+                        * checksum boundary.
+                        */
+                       start_csum_part = per_page_recover_start * 8 *
+                               sblock_to_check->sctx->sectorsize /
+                               sblock_to_check->sctx->nodesize;
+                       start_csum_part = start_csum_part ? : 1;
+                       next_csum_part = (page_num  + 1) * 8 *
+                               sblock_to_check->sctx->sectorsize /
+                               sblock_to_check->sctx->nodesize;
+                       next_csum_part = next_csum_part ? : 1;
+
+                       if (next_csum_part == start_csum_part) {
+                               /* this page hasn't wrapped to the next checksum */
+                               continue;
+                       }
+
+                       /*
+                        * to find which mirror have right data for current
+                        * csum parts
+                        */
+                       for (mirror_index = 0;
+                            mirror_index < BTRFS_MAX_MIRRORS &&
+                            sblocks_for_recheck[mirror_index].page_count > 0;
+                            mirror_index++) {
+                               int csum_part;
+
+                               for (csum_part = start_csum_part;
+                                    csum_part < next_csum_part; csum_part++) {
+                                       if (!scrub_check_node_checksum(
+                                                       sblocks_for_recheck +
+                                                       mirror_index, csum_part,
+                                                       node_csum)) {
+                                               break;
+                                       }
+                               }
+                               if (csum_part == next_csum_part) {
+                                       /*
+                                        * all parts of this mirror have a correct csum
+                                        */
+                                       sblock_other = sblocks_for_recheck +
+                                                      mirror_index;
+                                       break;
+                               }
+                       }
+
+                       if (sctx->is_dev_replace) {
+                               if (!sblock_other)
+                                       sblock_other = sblock_bad;
+
+                               for (sub_page_num = per_page_recover_start;
+                                    sub_page_num <= page_num; sub_page_num++) {
+                                       if (scrub_write_page_to_dev_replace(
+                                                       sblock_other,
+                                                       sub_page_num) != 0) {
+                                               btrfs_dev_replace_stats_inc(
+                                                       &sctx->dev_root->
+                                                       fs_info->dev_replace.
+                                                       num_write_errors);
+                                               success = 0;
+                                       }
+                               }
+                       } else if (sblock_other) {
+                               for (sub_page_num = per_page_recover_start;
+                                    sub_page_num <= page_num; sub_page_num++) {
+                                       if (!scrub_repair_page_from_good_copy(
+                                                       sblock_bad,
+                                                       sblock_other,
+                                                       sub_page_num, 0))
+                                               page_bad->io_error = 0;
+                                       else
+                                               success = 0;
+                               }
+                       }
+
+                       per_page_recover_start = page_num  + 1;
+
+                       continue;
+               }
                /* skip no-io-error page in scrub */
                if (!page_bad->io_error && !sctx->is_dev_replace)
                        continue;
@@ -1321,6 +1524,7 @@ static int scrub_setup_recheck_block(struct scrub_block 
*original_sblock,
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
        u64 length = original_sblock->page_count * PAGE_SIZE;
        u64 logical = original_sblock->pagev[0]->logical;
+       u64 generation = original_sblock->pagev[0]->generation;
        struct scrub_recover *recover;
        struct btrfs_bio *bbio;
        u64 sublen;
@@ -1387,7 +1591,7 @@ leave_nomem:
                        scrub_page_get(page);
                        sblock->pagev[page_index] = page;
                        page->logical = logical;
-
+                       page->generation = generation;
                        scrub_stripe_index_and_offset(logical,
                                                      bbio->map_type,
                                                      bbio->raid_map,
@@ -1839,6 +2043,7 @@ static int scrub_checksum(struct scrub_block *sblock)
        WARN_ON(sblock->page_count < 1);
        flags = sblock->pagev[0]->flags;
        ret = 0;
+
        if (flags & BTRFS_EXTENT_FLAG_DATA)
                ret = scrub_checksum_data(sblock);
        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-- 
2.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to