This adds checksums to the meta/data/parity blocks that reside on the raid5/6 log, so recovery can now verify the checksums to detect whether anything inside meta/data/parity has been changed.
If anything is wrong in the meta block, we stop replaying data/parity at that position, while if anything is wrong in a data/parity block, we just skip this meta/data/parity pair and move on to the next one. Signed-off-by: Liu Bo <bo.li....@oracle.com> --- fs/btrfs/raid56.c | 235 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/raid56.h | 4 + 2 files changed, 197 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8f47e56..8bc7ba4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -43,6 +43,7 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" +#include "hash.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -197,6 +198,7 @@ struct btrfs_r5l_log { u64 last_cp_seq; u64 seq; u64 log_start; + u32 uuid_csum; struct btrfs_r5l_io_unit *current_io; }; @@ -1309,7 +1311,7 @@ static int btrfs_r5l_get_meta(struct btrfs_r5l_log *log, struct btrfs_raid_bio * return 0; } -static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid) +static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u64 location, u64 devid, u32 csum) { struct btrfs_r5l_io_unit *io = log->current_io; struct btrfs_r5l_payload *payload; @@ -1326,11 +1328,11 @@ static void btrfs_r5l_append_payload_meta(struct btrfs_r5l_log *log, u16 type, u payload->size = cpu_to_le32(16); /* stripe_len / PAGE_SIZE */ payload->devid = cpu_to_le64(devid); payload->location = cpu_to_le64(location); + payload->csum = cpu_to_le32(csum); kunmap(io->meta_page); - /* XXX: add checksum later */ io->meta_offset += sizeof(*payload); - //io->meta_offset += sizeof(__le32); + #ifdef BTRFS_DEBUG_R5LOG trace_printk("io->meta_offset %d\n", io->meta_offset); #endif @@ -1380,6 +1382,10 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int int meta_size; int stripe, pagenr; struct page *page; + char *kaddr; + u32 csum; + 
u64 location; + u64 devid; /* * parity pages are contiguous on disk, thus only one @@ -1394,8 +1400,6 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int /* add data blocks which need to be written */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - u64 location; - u64 devid; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); if (!page) @@ -1406,7 +1410,11 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int #ifdef BTRFS_DEBUG_R5LOG trace_printk("data: stripe %d pagenr %d location 0x%llx devid %llu\n", stripe, pagenr, location, devid); #endif - btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid); + kaddr = kmap(page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(page); + + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, location, devid, csum); btrfs_r5l_append_payload_page(log, page); } } @@ -1414,17 +1422,26 @@ static void btrfs_r5l_log_stripe(struct btrfs_r5l_log *log, int data_pages, int /* add the whole parity blocks */ for (; stripe < rbio->real_stripes; stripe++) { - u64 location = btrfs_compute_location(rbio, stripe, 0); - u64 devid = btrfs_compute_devid(rbio, stripe); + location = btrfs_compute_location(rbio, stripe, 0); + devid = btrfs_compute_devid(rbio, stripe); #ifdef BTRFS_DEBUG_R5LOG trace_printk("parity: stripe %d location 0x%llx devid %llu\n", stripe, location, devid); #endif - btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid); for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { page = rbio_stripe_page(rbio, stripe, pagenr); + + kaddr = kmap(page); + if (pagenr == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(page); + btrfs_r5l_append_payload_page(log, page); } + + btrfs_r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, location, devid, csum); } } 
@@ -1432,12 +1449,16 @@ static void btrfs_r5l_submit_current_io(struct btrfs_r5l_log *log) { struct btrfs_r5l_io_unit *io = log->current_io; struct btrfs_r5l_meta_block *mb; + u32 csum; if (!io) return; mb = kmap(io->meta_page); mb->meta_size = cpu_to_le32(io->meta_offset); + ASSERT(mb->csum == 0); + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); kunmap(io->meta_page); log->current_io = NULL; @@ -1506,6 +1527,7 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, { struct page *page; struct btrfs_r5l_meta_block *mb; + u32 csum; int ret = 0; #ifdef BTRFS_DEBUG_R5LOG @@ -1520,6 +1542,9 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos, mb->meta_size = cpu_to_le32(sizeof(struct btrfs_r5l_meta_block)); mb->seq = cpu_to_le64(seq); mb->position = cpu_to_le64(pos); + + csum = btrfs_crc32c(log->uuid_csum, mb, PAGE_SIZE); + mb->csum = cpu_to_le32(csum); kunmap(page); if (!btrfs_r5l_sync_page_io(log, log->dev, (pos >> 9), PAGE_SIZE, page, REQ_OP_WRITE | REQ_FUA)) { @@ -1607,6 +1632,9 @@ static int btrfs_r5l_recover_read_page(struct btrfs_r5l_recover_ctx *ctx, struct static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) { struct btrfs_r5l_meta_block *mb; + u32 csum; + u32 expected; + int ret = 0; ret = btrfs_r5l_recover_read_page(ctx, ctx->meta_page, ctx->pos); if (ret) @@ -1623,25 +1651,131 @@ static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_recover_ctx *ctx) #ifdef BTRFS_DEBUG_R5LOG trace_printk("%s: mismatch magic %llu default %llu\n", __func__, le32_to_cpu(mb->magic), BTRFS_R5LOG_MAGIC); #endif - return -EINVAL; + ret = -EINVAL; + goto out; } - ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); - kunmap(ctx->meta_page); + expected = le32_to_cpu(mb->csum); + /* + * when we calculate mb->csum, it's zero, so we need to zero + * it back. 
+ */ + mb->csum = 0; + csum = btrfs_crc32c(ctx->log->uuid_csum, mb, PAGE_SIZE); + if (csum != expected) { +#ifdef BTRFS_DEBUG_R5LOG + pr_info("%s: mismatch checksum for r5l meta block\n", __func__); +#endif + ret = -EINVAL; + goto out; + } + ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE); /* meta_block */ ctx->total_size = PAGE_SIZE; - return 0; +out: + kunmap(ctx->meta_page); + + return ret; +} + +static int btrfs_r5l_recover_verify_checksum(struct btrfs_r5l_recover_ctx *ctx) +{ + u64 offset; + u32 meta_size; + u64 csum_io_offset; + u64 read_pos; + char *kaddr; + u32 csum; + int type; + struct btrfs_r5l_meta_block *mb; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + struct btrfs_device *dev; + int ret = 0; + + mb = kmap(ctx->meta_page); + meta_size = le32_to_cpu(mb->meta_size); + csum_io_offset = PAGE_SIZE; + + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; + + /* check if there is any invalid device, if so, skip writing this mb. 
*/ + dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); + if (!dev || dev->missing) { + ret = -EINVAL; + goto out; + } + + type = le16_to_cpu(payload->type); + if (type == R5LOG_PAYLOAD_DATA) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ASSERT(le32_to_cpu(payload->size) == 1); + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } else if (type == R5LOG_PAYLOAD_PARITY) { + int i; + for (i = 0; i < le32_to_cpu(payload->size); i++) { + read_pos = btrfs_r5l_ring_add(log, ctx->pos, csum_io_offset); + csum_io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) { + ret = -EIO; + goto out; + } + + kaddr = kmap(ctx->io_page); + if (i == 0) + csum = btrfs_crc32c(log->uuid_csum, kaddr, PAGE_SIZE); + else + csum = btrfs_crc32c(csum, kaddr, PAGE_SIZE); + kunmap(ctx->io_page); + } + } else { + ASSERT(0); + } + + if (csum != le32_to_cpu(payload->csum)) { + trace_printk("r5l data csum fails location 0x%llx devid %llu\n", le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); + ret = -EAGAIN; + goto out; + } + } +out: + kunmap(ctx->meta_page); + return ret; } -static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +static int btrfs_r5l_recover_load_data(struct btrfs_r5l_recover_ctx *ctx) { u64 offset; struct btrfs_r5l_meta_block *mb; - u64 meta_size; + u32 meta_size; u64 io_offset; + u64 read_pos; struct btrfs_device *dev; + struct btrfs_r5l_payload *payload; + struct btrfs_r5l_log *log = ctx->log; + int ret = 0; + + /* if any checksum fails, skip writing this mb. 
*/ + ret = btrfs_r5l_recover_verify_checksum(ctx); + if (ret) + return ret; mb = kmap(ctx->meta_page); @@ -1649,67 +1783,81 @@ static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log, struct btrfs_r offset = sizeof(struct btrfs_r5l_meta_block); meta_size = le32_to_cpu(mb->meta_size); - while (offset < meta_size) { - struct btrfs_r5l_payload *payload = (void *)mb + offset; + for (offset = sizeof(struct btrfs_r5l_meta_block); + offset < meta_size; + offset += sizeof(struct btrfs_r5l_payload)) { + payload = (void *)mb + offset; /* read data from log disk and write to payload->location */ #ifdef BTRFS_DEBUG_R5LOG trace_printk("payload type %d flags %d size %d location 0x%llx devid %llu\n", le16_to_cpu(payload->type), le16_to_cpu(payload->flags), le32_to_cpu(payload->size), le64_to_cpu(payload->location), le64_to_cpu(payload->devid)); #endif + /* liubo: how to handle the case where dev is suddenly off? */ dev = btrfs_find_device(log->fs_info, le64_to_cpu(payload->devid), NULL, NULL); - if (!dev || dev->missing) { - ASSERT(0); - } + ASSERT(dev && !dev->missing); if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) { - ASSERT(le32_to_cpu(payload->size) == 1); - btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); - btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset); io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) + goto out; + + if (!btrfs_r5l_sync_page_io(log, dev, le64_to_cpu(payload->location) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) { + ret = -EIO; + goto out; + } } else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) { int i; - ASSERT(le32_to_cpu(payload->size) == 16); + + ASSERT(offset + sizeof(struct btrfs_r5l_payload) == meta_size); + for (i = 0; i < le32_to_cpu(payload->size); i++) { - /* liubo: parity are guaranteed to 
be - * contiguous, use just one bio to - * hold all pages and flush them. */ u64 parity_off = le64_to_cpu(payload->location) + i * PAGE_SIZE; - btrfs_r5l_sync_page_io(log, log->dev, (ctx->pos + io_offset) >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_READ); - btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE); + read_pos = btrfs_r5l_ring_add(log, ctx->pos, io_offset); io_offset += PAGE_SIZE; + + ret = btrfs_r5l_recover_read_page(ctx, ctx->io_page, read_pos); + if (ret) + goto out; + + if (!btrfs_r5l_sync_page_io(log, dev, parity_off >> 9, PAGE_SIZE, ctx->io_page, REQ_OP_WRITE)) { + ret = -EIO; + goto out; + } } } else { ASSERT(0); } - - offset += sizeof(struct btrfs_r5l_payload); } - kunmap(ctx->meta_page); ctx->total_size += (io_offset - PAGE_SIZE); - return 0; +out: + kunmap(ctx->meta_page); + return ret; } -static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log, struct btrfs_r5l_recover_ctx *ctx) +static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_recover_ctx *ctx) { int ret; while (1) { - ret = btrfs_r5l_recover_load_meta(log, ctx); + ret = btrfs_r5l_recover_load_meta(ctx); if (ret) break; - ret = btrfs_r5l_recover_load_data(log, ctx); - ASSERT(!ret || ret > 0); - if (ret) + ret = btrfs_r5l_recover_load_data(ctx); + if (ret && ret != -EAGAIN) break; ctx->seq++; - ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size); + ctx->pos = btrfs_r5l_ring_add(ctx->log, ctx->pos, ctx->total_size); } - return ret; + return 0; +} static int btrfs_r5l_recover_allocate_ra(struct btrfs_r5l_recover_ctx *ctx) { @@ -1801,6 +1949,7 @@ int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) struct page *page; struct btrfs_r5l_meta_block *mb; bool create_new = false; + int ret; ASSERT(log); @@ -1856,10 +2005,10 @@ int btrfs_r5l_load_log(struct btrfs_fs_info *fs_info, u64 cp) log->seq = log->last_cp_seq + 1; log->next_checkpoint = cp; } else { - btrfs_r5l_recover_log(log); + ret = btrfs_r5l_recover_log(log); } - return 0; + 
return ret; } /* @@ -3576,6 +3725,8 @@ int btrfs_set_r5log(struct btrfs_fs_info *fs_info, struct btrfs_device *device) log->device_size = round_down(log->device_size, PAGE_SIZE); log->dev = device; log->fs_info = fs_info; + ASSERT(sizeof(device->uuid) == BTRFS_UUID_SIZE); + log->uuid_csum = btrfs_crc32c(~0, device->uuid, sizeof(device->uuid)); mutex_init(&log->io_mutex); cmpxchg(&fs_info->r5log, NULL, log); diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 314d299..569cec8 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -87,6 +87,8 @@ struct btrfs_r5l_payload { /* data or parity */ __le64 location; __le64 devid; + + __le32 csum; }; /* io unit starts from a meta block. */ @@ -96,6 +98,8 @@ struct btrfs_r5l_meta_block { /* the whole size of the block */ __le32 meta_size; + __le32 csum; + __le64 seq; __le64 position; -- 2.9.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html