(2012/11/07 1:38), Stefan Behrens wrote: > The device replace procedure makes use of the scrub code. The scrub > code is the most efficient code to read the allocated data of a disk, > i.e. it reads sequentially in order to avoid disk head movements, it > skips unallocated blocks, it uses read ahead mechanisms, and it > contains all the code to detect and repair defects. > This commit adds code to scrub to allow the scrub code to copy read > data to another disk. > One goal is to be able to perform as fast as possible. Therefore the > write requests are collected until huge bios are built, and the > write process is decoupled from the read process with some kind of > flow control, of course, in order to limit the allocated memory. > The best performance on spinning disks could be reached when the > head movements are avoided as much as possible. Therefore a single > worker is used to interface the read process with the write process. > The regular scrub operation works as fast as before, it is not > negatively influenced and actually it is more or less unchanged. 
> > Signed-off-by: Stefan Behrens <sbehr...@giantdisaster.de> > --- > fs/btrfs/ctree.h | 2 + > fs/btrfs/reada.c | 10 +- > fs/btrfs/scrub.c | 881 > ++++++++++++++++++++++++++++++++++++++++++++++++++----- > fs/btrfs/super.c | 3 +- > 4 files changed, 823 insertions(+), 73 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 83904b5..e17f211 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1483,6 +1483,8 @@ struct btrfs_fs_info { > struct rw_semaphore scrub_super_lock; > int scrub_workers_refcnt; > struct btrfs_workers scrub_workers; > + struct btrfs_workers scrub_wr_completion_workers; > + struct btrfs_workers scrub_nocow_workers; > > #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY > u32 check_integrity_print_mask; > diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c > index 0ddc565..9f363e1 100644 > --- a/fs/btrfs/reada.c > +++ b/fs/btrfs/reada.c > @@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct > btrfs_root *root, > */ > continue; > } > + if (!dev->bdev) { > + /* cannot read ahead on missing device */ > + continue; > + } > prev_dev = dev; > ret = radix_tree_insert(&dev->reada_extents, index, re); > if (ret) { > while (--i >= 0) { > dev = bbio->stripes[i].dev; > BUG_ON(dev == NULL); > + /* ignore whether the entry was inserted */ > radix_tree_delete(&dev->reada_extents, index); > } > BUG_ON(fs_info == NULL); > @@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root > *root, > generation = btrfs_header_generation(node); > free_extent_buffer(node); > > - reada_add_block(rc, start, &max_key, level, generation); > + if (reada_add_block(rc, start, &max_key, level, generation)) { > + kfree(rc); > + return ERR_PTR(-ENOMEM); > + } > > reada_start_machine(root->fs_info); > > diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c > index 460e30b..59c69e0 100644 > --- a/fs/btrfs/scrub.c > +++ b/fs/btrfs/scrub.c > @@ -25,6 +25,7 @@ > #include "transaction.h" > #include "backref.h" > #include "extent_io.h" > 
+#include "dev-replace.h" > #include "check-integrity.h" > #include "rcu-string.h" > > @@ -44,8 +45,15 @@ > struct scrub_block; > struct scrub_ctx; > > -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ > -#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */ > +/* > + * the following three values only influence the performance. > + * The last one configures the number of parallel and outstanding I/O > + * operations. The first two values configure an upper limit for the number > + * of (dynamically allocated) pages that are added to a bio. > + */ > +#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ > +#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ > +#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ > > /* > * the following value times PAGE_SIZE needs to be large enough to match the > @@ -62,6 +70,7 @@ struct scrub_page { > u64 generation; > u64 logical; > u64 physical; > + u64 physical_for_dev_replace; > atomic_t ref_count; > struct { > unsigned int mirror_num:8; > @@ -79,7 +88,11 @@ struct scrub_bio { > int err; > u64 logical; > u64 physical; > - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; > +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO > + struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; > +#else > + struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; > +#endif > int page_count; > int next_free; > struct btrfs_work work; > @@ -99,8 +112,16 @@ struct scrub_block { > }; > }; > > +struct scrub_wr_ctx { > + struct scrub_bio *wr_curr_bio; > + struct btrfs_device *tgtdev; > + int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ > + atomic_t flush_all_writes; > + struct mutex wr_lock; > +}; > + > struct scrub_ctx { > - struct scrub_bio *bios[SCRUB_BIOS_PER_CTX]; > + struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; > struct btrfs_root *dev_root; > int first_free; > int curr; > @@ -112,12 +133,13 @@ struct scrub_ctx { > struct list_head csum_list; > atomic_t cancel_req; > int readonly; > - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO 
*/ > + int pages_per_rd_bio; > u32 sectorsize; > u32 nodesize; > u32 leafsize; > > int is_dev_replace; > + struct scrub_wr_ctx wr_ctx; > > /* > * statistics > @@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum { > int mirror_num; > }; > > +struct scrub_copy_nocow_ctx { > + struct scrub_ctx *sctx; > + u64 logical; > + u64 len; > + int mirror_num; > + u64 physical_for_dev_replace; > + struct btrfs_work work; > +}; > + > struct scrub_warning { > struct btrfs_path *path; > u64 extent_item_size; > @@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct > scrub_ctx *sctx); > static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); > static int scrub_setup_recheck_block(struct scrub_ctx *sctx, > struct btrfs_fs_info *fs_info, > + struct scrub_block *original_sblock, > u64 length, u64 logical, > - struct scrub_block *sblock); > + struct scrub_block *sblocks_for_recheck); > static void scrub_recheck_block(struct btrfs_fs_info *fs_info, > struct scrub_block *sblock, int is_metadata, > int have_csum, u8 *csum, u64 generation, > @@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct > scrub_block *sblock_bad, > static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, > struct scrub_block *sblock_good, > int page_num, int force_write); > +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); > +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, > + int page_num); > static int scrub_checksum_data(struct scrub_block *sblock); > static int scrub_checksum_tree_block(struct scrub_block *sblock); > static int scrub_checksum_super(struct scrub_block *sblock); > @@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock); > static void scrub_block_put(struct scrub_block *sblock); > static void scrub_page_get(struct scrub_page *spage); > static void scrub_page_put(struct scrub_page *spage); > -static int scrub_add_page_to_bio(struct scrub_ctx *sctx, > - struct 
scrub_page *spage); > +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, > + struct scrub_page *spage); > static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, > u64 physical, struct btrfs_device *dev, u64 flags, > - u64 gen, int mirror_num, u8 *csum, int force); > + u64 gen, int mirror_num, u8 *csum, int force, > + u64 physical_for_dev_replace); > static void scrub_bio_end_io(struct bio *bio, int err); > static void scrub_bio_end_io_worker(struct btrfs_work *work); > static void scrub_block_complete(struct scrub_block *sblock); > +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, > + u64 extent_logical, u64 extent_len, > + u64 *extent_physical, > + struct btrfs_device **extent_dev, > + int *extent_mirror_num); > +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, > + struct scrub_wr_ctx *wr_ctx, > + struct btrfs_fs_info *fs_info, > + struct btrfs_device *dev, > + int is_dev_replace); > +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); > +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, > + struct scrub_page *spage); > +static void scrub_wr_submit(struct scrub_ctx *sctx); > +static void scrub_wr_bio_end_io(struct bio *bio, int err); > +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); > +static int write_page_nocow(struct scrub_ctx *sctx, > + u64 physical_for_dev_replace, struct page *page); > +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, > + void *ctx); > +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, > + int mirror_num, u64 physical_for_dev_replace); > +static void copy_nocow_pages_worker(struct btrfs_work *work); > > > static void scrub_pending_bio_inc(struct scrub_ctx *sctx) > @@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct > scrub_ctx *sctx) > if (!sctx) > return; > > + scrub_free_wr_ctx(&sctx->wr_ctx); > + > /* this can happen when scrub is cancelled */ > if (sctx->curr != -1) { > struct scrub_bio *sbio = 
sctx->bios[sctx->curr]; > > for (i = 0; i < sbio->page_count; i++) { > - BUG_ON(!sbio->pagev[i]); > - BUG_ON(!sbio->pagev[i]->page); > + WARN_ON(!sbio->pagev[i]->page); > scrub_block_put(sbio->pagev[i]->sblock); > } > bio_put(sbio->bio); > } > > - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { > + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { > struct scrub_bio *sbio = sctx->bios[i]; > > if (!sbio) > @@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device > *dev, int is_dev_replace) > struct scrub_ctx *sctx; > int i; > struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; > - int pages_per_bio; > + int pages_per_rd_bio; > + int ret; > > - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, > - bio_get_nr_vecs(dev->bdev)); > + /* > + * the setting of pages_per_rd_bio is correct for scrub but might > + * be wrong for the dev_replace code where we might read from > + * different devices in the initial huge bios. However, that > + * code is able to correctly handle the case when adding a page > + * to a bio fails. 
> + */ > + if (dev->bdev) > + pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, > + bio_get_nr_vecs(dev->bdev)); > + else > + pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; > sctx = kzalloc(sizeof(*sctx), GFP_NOFS); > if (!sctx) > goto nomem; > sctx->is_dev_replace = is_dev_replace; > - sctx->pages_per_bio = pages_per_bio; > + sctx->pages_per_rd_bio = pages_per_rd_bio; > sctx->curr = -1; > sctx->dev_root = dev->dev_root; > - for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) { > + for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { > struct scrub_bio *sbio; > > sbio = kzalloc(sizeof(*sbio), GFP_NOFS); > @@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device > *dev, int is_dev_replace) > sbio->page_count = 0; > sbio->work.func = scrub_bio_end_io_worker; > > - if (i != SCRUB_BIOS_PER_CTX - 1) > + if (i != SCRUB_BIOS_PER_SCTX - 1) > sctx->bios[i]->next_free = i + 1; > else > sctx->bios[i]->next_free = -1; > @@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device > *dev, int is_dev_replace) > spin_lock_init(&sctx->list_lock); > spin_lock_init(&sctx->stat_lock); > init_waitqueue_head(&sctx->list_wait); > + > + ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, > + fs_info->dev_replace.tgtdev, is_dev_replace); > + if (ret) { > + scrub_free_ctx(sctx); > + return ERR_PTR(ret); > + } > return sctx; > > nomem: > @@ -341,7 +419,8 @@ nomem: > return ERR_PTR(-ENOMEM); > } > > -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void > *ctx) > +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, > + void *warn_ctx) > { > u64 isize; > u32 nlink; > @@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 > offset, u64 root, void *ctx) > int i; > struct extent_buffer *eb; > struct btrfs_inode_item *inode_item; > - struct scrub_warning *swarn = ctx; > + struct scrub_warning *swarn = warn_ctx; > struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; > struct inode_fs_paths *ipath = NULL; > struct 
btrfs_root *local_root; > @@ -492,11 +571,11 @@ out: > kfree(swarn.msg_buf); > } > > -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) > +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void > *fixup_ctx) > { > struct page *page = NULL; > unsigned long index; > - struct scrub_fixup_nodatasum *fixup = ctx; > + struct scrub_fixup_nodatasum *fixup = fixup_ctx; > int ret; > int corrected = 0; > struct btrfs_key key; > @@ -660,7 +739,9 @@ out: > spin_lock(&sctx->stat_lock); > ++sctx->stat.uncorrectable_errors; > spin_unlock(&sctx->stat_lock); > - > + btrfs_dev_replace_stats_inc( > + &sctx->dev_root->fs_info->dev_replace. > + num_uncorrectable_read_errors); > printk_ratelimited_in_rcu(KERN_ERR > "btrfs: unable to fixup (nodatasum) error at logical > %llu on dev %s\n", > (unsigned long long)fixup->logical, > @@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block > *sblock_to_check) > csum = sblock_to_check->pagev[0]->csum; > dev = sblock_to_check->pagev[0]->dev; > > + if (sctx->is_dev_replace && !is_metadata && !have_csum) { > + sblocks_for_recheck = NULL; > + goto nodatasum_case; > + } > + > /* > * read all mirrors one after the other. 
This includes to > * re-read the extent or metadata block that failed (that was > @@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block > *sblock_to_check) > } > > /* setup the context, map the logical blocks and alloc the pages */ > - ret = scrub_setup_recheck_block(sctx, fs_info, length, > + ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, > logical, sblocks_for_recheck); > if (ret) { > spin_lock(&sctx->stat_lock); > @@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block > *sblock_to_check) > sctx->stat.unverified_errors++; > spin_unlock(&sctx->stat_lock); > > + if (sctx->is_dev_replace) > + scrub_write_block_to_dev_replace(sblock_bad); > goto out; > } > > @@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct > scrub_block *sblock_to_check) > BTRFS_DEV_STAT_CORRUPTION_ERRS); > } > > - if (sctx->readonly) > + if (sctx->readonly && !sctx->is_dev_replace) > goto did_not_correct_error; > > if (!is_metadata && !have_csum) { > struct scrub_fixup_nodatasum *fixup_nodatasum; > > +nodatasum_case: > + WARN_ON(sctx->is_dev_replace); > + > /* > * !is_metadata and !have_csum, this means that the data > * might not be COW'ed, that it might be modified > @@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct > scrub_block *sblock_to_check) > if (!sblock_other->header_error && > !sblock_other->checksum_error && > sblock_other->no_io_error_seen) { > - int force_write = is_metadata || have_csum; > - > - ret = scrub_repair_block_from_good_copy(sblock_bad, > - sblock_other, > - force_write); > + if (sctx->is_dev_replace) { > + scrub_write_block_to_dev_replace(sblock_other); > + } else { > + int force_write = is_metadata || have_csum; > + > + ret = scrub_repair_block_from_good_copy( > + sblock_bad, sblock_other, > + force_write); > + } > if (0 == ret) > goto corrected_error; > } > } > > /* > - * in case of I/O errors in the area that is supposed to be > + * for dev_replace, pick good pages 
and write to the target device. > + */ > + if (sctx->is_dev_replace) { > + success = 1; > + for (page_num = 0; page_num < sblock_bad->page_count; > + page_num++) { > + int sub_success; > + > + sub_success = 0; > + for (mirror_index = 0; > + mirror_index < BTRFS_MAX_MIRRORS && > + sblocks_for_recheck[mirror_index].page_count > 0; > + mirror_index++) { > + struct scrub_block *sblock_other = > + sblocks_for_recheck + mirror_index; > + struct scrub_page *page_other = > + sblock_other->pagev[page_num]; > + > + if (!page_other->io_error) { > + ret = scrub_write_page_to_dev_replace( > + sblock_other, page_num); > + if (ret == 0) { > + /* succeeded for this page */ > + sub_success = 1; > + break; > + } else { > + btrfs_dev_replace_stats_inc( > + &sctx->dev_root-> > + fs_info->dev_replace. > + num_write_errors); > + } > + } > + } > + > + if (!sub_success) { > + /* > + * did not find a mirror to fetch the page > + * from. scrub_write_page_to_dev_replace() > + * handles this case (page->io_error), by > + * filling the block with zeros before > + * submitting the write request > + */ > + success = 0; > + ret = scrub_write_page_to_dev_replace( > + sblock_bad, page_num); > + if (ret) > + btrfs_dev_replace_stats_inc( > + &sctx->dev_root->fs_info-> > + dev_replace.num_write_errors); > + } > + } > + > + goto out; > + } > + > + /* > + * for regular scrub, repair those pages that are errored. > + * In case of I/O errors in the area that is supposed to be > * repaired, continue by picking good copies of those pages. > * Select the good pages from mirrors to rewrite bad pages from > * the area to fix. 
Afterwards verify the checksum of the block > @@ -1017,6 +1169,7 @@ out: > > static int scrub_setup_recheck_block(struct scrub_ctx *sctx, > struct btrfs_fs_info *fs_info, > + struct scrub_block *original_sblock, > u64 length, u64 logical, > struct scrub_block *sblocks_for_recheck) > { > @@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx > *sctx, > return -EIO; > } > > - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); > + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); > for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; > mirror_index++) { > struct scrub_block *sblock; > @@ -1071,6 +1224,10 @@ leave_nomem: > sblock->pagev[page_index] = page; > page->logical = logical; > page->physical = bbio->stripes[mirror_index].physical; > + BUG_ON(page_index >= original_sblock->page_count); > + page->physical_for_dev_replace = > + original_sblock->pagev[page_index]-> > + physical_for_dev_replace; > /* for missing devices, dev->bdev is NULL */ > page->dev = bbio->stripes[mirror_index].dev; > page->mirror_num = mirror_index + 1; > @@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct > scrub_block *sblock_bad, > int ret; > DECLARE_COMPLETION_ONSTACK(complete); > > + if (!page_bad->dev->bdev) { > + printk_ratelimited(KERN_WARNING > + "btrfs: scrub_repair_page_from_good_copy(bdev > == NULL) is unexpected!\n"); > + return -EIO; > + } > + > bio = bio_alloc(GFP_NOFS, 1); > if (!bio) > return -EIO; > @@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct > scrub_block *sblock_bad, > if (!bio_flagged(bio, BIO_UPTODATE)) { > btrfs_dev_stat_inc_and_print(page_bad->dev, > BTRFS_DEV_STAT_WRITE_ERRS); > + btrfs_dev_replace_stats_inc( > + &sblock_bad->sctx->dev_root->fs_info-> > + dev_replace.num_write_errors); > bio_put(bio); > return -EIO; > } > @@ -1278,7 +1444,166 @@ static int scrub_repair_page_from_good_copy(struct > scrub_block *sblock_bad, > return 0; > } > > -static void scrub_checksum(struct scrub_block *sblock) > 
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) > +{ > + int page_num; > + > + for (page_num = 0; page_num < sblock->page_count; page_num++) { > + int ret; > + > + ret = scrub_write_page_to_dev_replace(sblock, page_num); > + if (ret) > + btrfs_dev_replace_stats_inc( > + &sblock->sctx->dev_root->fs_info->dev_replace. > + num_write_errors); > + } > +} > + > +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, > + int page_num) > +{ > + struct scrub_page *spage = sblock->pagev[page_num]; > + > + BUG_ON(spage->page == NULL); > + if (spage->io_error) { > + void *mapped_buffer = kmap_atomic(spage->page); > + > + memset(mapped_buffer, 0, PAGE_CACHE_SIZE); > + flush_dcache_page(spage->page); > + kunmap_atomic(mapped_buffer); > + } > + return scrub_add_page_to_wr_bio(sblock->sctx, spage); > +} > + > +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, > + struct scrub_page *spage) > +{ > + struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; > + struct scrub_bio *sbio; > + int ret; > + > + mutex_lock(&wr_ctx->wr_lock); > +again: > + if (!wr_ctx->wr_curr_bio) { > + wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), > + GFP_NOFS); > + if (!wr_ctx->wr_curr_bio)
I think a mutex_unlock(&wr_ctx->wr_lock) is necessary before the -ENOMEM return here; otherwise this error path exits with wr_lock still held, and any later attempt to take the lock will deadlock. > + return -ENOMEM; > + wr_ctx->wr_curr_bio->sctx = sctx; > + wr_ctx->wr_curr_bio->page_count = 0; > + } ... ... - Tsutomu -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html