Re: A start at RAID[56] support.
On Sat, 2009-07-11 at 15:40 +0100, David Woodhouse wrote:
> On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> > This is a preliminary attempt to add RAID5 and RAID6 support.
>
> Matching btrfs-progs patch...

And this makes it actually write the P and Q stripes... These patches are at
git://, http://git.infradead.org/users/dwmw2/btrfs-progs-raid56.git

I can now make a 4-disk RAID6 file system, copy some stuff to it, then
kick out two of the disks and use it in degraded mode, and everything
seems to work fine.

diff --git a/Makefile b/Makefile
index 8097b5a..2d8d349 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -g -Werror -Os
 objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
	  root-tree.o dir-item.o file-item.o inode-item.o \
	  inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \
-	  volumes.o utils.o
+	  volumes.o utils.o raid6.o
 
 # CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/disk-io.c b/disk-io.c
index addebe1..c33c31b 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -138,7 +138,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	dev_nr = 0;
 	length = blocksize;
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      bytenr, &length, &multi, 0);
+			      bytenr, &length, &multi, 0, NULL);
 	BUG_ON(ret);
 	device = multi->stripes[0].dev;
 	device->total_ios++;
@@ -196,7 +196,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	length = blocksize;
 	while (1) {
 		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-				      eb->start, &length, &multi, mirror_num);
+				      eb->start, &length, &multi, mirror_num,
+				      NULL);
 		BUG_ON(ret);
 		device = multi->stripes[0].dev;
 		eb->fd = device->fd;
@@ -224,12 +225,93 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	return NULL;
 }
 
+static int write_raid56_with_parity(struct extent_buffer *eb,
+				    struct btrfs_multi_bio *multi,
+				    u64 stripe_len, u64 *raid_map)
+{
+	struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+	u64 start_ofs, end_ofs;
+	int i, j;
+	int ret;
+
+	start_ofs = eb->start % stripe_len;
+	end_ofs = start_ofs + eb->len;
+	BUG_ON(end_ofs > stripe_len);
+
+	j = 0;
+	for (i = 0; i < multi->num_stripes; i++) {
+		struct extent_buffer *new_eb;
+		if (start_ofs) {
+			multi->stripes[i].physical += start_ofs;
+			if (raid_map[i] != (u64)-1 && raid_map[i] != (u64)-2)
+				raid_map[i] += start_ofs;
+		}
+		if (raid_map[i] == eb->start) {
+			eb->dev_bytenr = multi->stripes[i].physical;
+			eb->fd = multi->stripes[i].dev->fd;
+			multi->stripes[i].dev->total_ios++;
+			ebs[j++] = eb;
+			continue;
+		}
+		new_eb = kmalloc(sizeof(*eb) + eb->len, GFP_NOFS);
+		BUG_ON(!new_eb);
+		new_eb->dev_bytenr = multi->stripes[i].physical;
+		new_eb->fd = multi->stripes[i].dev->fd;
+		multi->stripes[i].dev->total_ios++;
+		new_eb->len = eb->len;
+		if (raid_map[i] == (u64)-1) {
+			p_eb = new_eb;
+		} else if (raid_map[i] == (u64)-2) {
+			q_eb = new_eb;
+		} else {
+			ret = read_extent_from_disk(new_eb);
+			BUG_ON(ret);
+			ebs[j++] = new_eb;
+		}
+	}
+	ebs[j++] = p_eb;
+	if (q_eb) {
+		void *pointers[multi->num_stripes];
+
+		ebs[j++] = q_eb;
+
+		for (i = 0; i < multi->num_stripes; i++)
+			pointers[i] = ebs[i]->data;
+
+		raid6_gen_syndrome(multi->num_stripes, eb->len, pointers);
+
+		ret = write_extent_to_disk(q_eb);
+		BUG_ON(ret);
+	} else {
+		memcpy(p_eb->data, ebs[0]->data, eb->len);
+		for (j = 1; j < multi->num_stripes - 1; j++) {
+			for (i = 0; i < eb->len; i += sizeof(unsigned long)) {
+				*(unsigned long *)(p_eb->data + i) ^=
+					*(unsigned long *)(ebs[j]->data + i);
+			}
+		}
+	}
+
+	ret = write_extent_to_disk(p_eb);
+	BUG_ON(ret);
+
+	ret = write_extent_to_disk(eb);
+	BUG_ON(ret);
+
+	for (i = 0; i < multi->num_stripes; i++)
+		if (ebs[i] != eb)
+
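For reference: the XOR loop above computes the RAID5 P stripe, and
raid6_gen_syndrome() -- provided by the new raid6.o, presumably derived
from the kernel's lib/raid6 code -- fills in P and Q together for RAID6.
P is the plain XOR of the data stripes; Q is a Reed-Solomon syndrome
over GF(2^8). Here is a minimal user-space sketch using the same calling
convention (ptrs[disks-2] is P, ptrs[disks-1] is Q); it is illustrative
only, not the optimized implementation the patch links against:

#include <stdint.h>
#include <stdio.h>

/* Multiply a GF(2^8) element by 2, modulo x^8+x^4+x^3+x^2+1 (0x11d). */
static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/*
 * Compute P (XOR parity) and Q (RAID6 syndrome) for 'disks' stripes of
 * 'bytes' bytes each.  ptrs[0..disks-3] are data blocks, ptrs[disks-2]
 * receives P, ptrs[disks-1] receives Q.
 */
static void gen_syndrome(int disks, size_t bytes, uint8_t **ptrs)
{
	int ndata = disks - 2;
	uint8_t *p = ptrs[disks - 2];
	uint8_t *q = ptrs[disks - 1];

	for (size_t i = 0; i < bytes; i++) {
		uint8_t pv = 0, qv = 0;

		/* Horner's rule: Q = D0 + g.D1 + g^2.D2 + ... with g = 2 */
		for (int d = ndata - 1; d >= 0; d--) {
			qv = gf_mul2(qv) ^ ptrs[d][i];
			pv ^= ptrs[d][i];
		}
		p[i] = pv;
		q[i] = qv;
	}
}

int main(void)
{
	uint8_t d0[4] = { 1, 2, 3, 4 }, d1[4] = { 5, 6, 7, 8 };
	uint8_t p[4], q[4];
	uint8_t *ptrs[4] = { d0, d1, p, q };	/* 2 data stripes + P + Q */

	gen_syndrome(4, 4, ptrs);
	printf("P[0]=%02x Q[0]=%02x\n", p[0], q[0]);
	return 0;
}

Any single missing block is recoverable from P alone; with two blocks
missing, P and Q together still determine them, which is what makes the
degraded two-disks-kicked-out test above work.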
Re: A start at RAID[56] support.
On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> This is a preliminary attempt to add RAID5 and RAID6 support. So far
> it doesn't attempt to write or read the parity blocks -- it just lays
> the data blocks out as we want them, so it's effectively just a
> complex and wasteful kind of RAID0.
>
> The next step is to make btrfs_map_bio() do the right thing:
>  - Satisfy read requests for mirrors #2 and #3 by recreating data
>    from RAID5 parity or RAID6 error correction stripe respectively.
>  - Write out parity and RAID6 blocks appropriately when data writes
>    happen.

Actually, the next step is to tweak __btrfs_map_block() a bit more to
let it return information about the whole stripe-set, so that
btrfs_map_bio() _can_ do what we say above...

So rather than just mapping the requested address as if it's RAID0, we
(where appropriate) return information about the _entire_ disk set in
the btrfs_multi_bio, with an auxiliary array giving the _logical_
offset corresponding to each physical stripe in the referenced set
(with special values for the P and Q stripes). We do this for all
writes, and for reads where mirror_num > 1 (i.e. when we're being
asked to rebuild it from parity, rather than reading the original data
blocks).

git://, http://git.infradead.org/users/dwmw2/btrfs-raid56.git

commit ed90c58ba7c60555af4b8c00a104c7d71f6db6d2
Author: David Woodhouse <david.woodho...@intel.com>
Date:   Sun Jul 12 11:15:22 2009 +0100

    Btrfs: Let btrfs_map_block() return full stripe information for RAID[56]

    ... in the cases where it's necessary -- which is for a write, or
    for a parity recovery attempt. We'll let btrfs_map_bio() do the rest.

    Signed-off-by: David Woodhouse <david.woodho...@intel.com>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b231ef..55facd3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -62,6 +62,11 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
+#define RAID5_P_STRIPE ((u64)-1)
+#define RAID6_Q_STRIPE ((u64)-2)
+
+#define is_parity_stripe(x) ( ((x) == RAID5_P_STRIPE) || ((x) == RAID6_Q_STRIPE) )
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -2614,7 +2619,8 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
-			     int mirror_num, struct page *unplug_page)
+			     int mirror_num, struct page *unplug_page,
+			     u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -2622,6 +2628,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	u64 *raid_map = NULL;
 	int stripes_allocated = 8;
 	int stripes_required = 1;
 	int stripe_index;
@@ -2674,9 +2681,24 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
-	    stripes_allocated < stripes_required) {
-		stripes_allocated = map->num_stripes;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+	    multi_ret && ((rw & (1 << BIO_RW)) || mirror_num > 1) && raid_map_ret) {
+		/* RAID[56] write or recovery. Return all stripes */
+		stripes_required = map->num_stripes;
+		max_errors = nr_parity_stripes(map);
+
+		/* Only allocate the map if we've already got a large enough multi_ret */
+		if (stripes_allocated >= stripes_required) {
+			raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+			if (!raid_map) {
+				free_extent_map(em);
+				kfree(multi);
+				return -ENOMEM;
+			}
+		}
+	}
+	if (multi_ret && stripes_allocated < stripes_required) {
+		stripes_allocated = stripes_required;
 		free_extent_map(em);
 		kfree(multi);
 		goto again;
@@ -2749,18 +2771,43 @@ again:
 		stripe_index = do_div(stripe_nr, nr_data_stripes(map));
 
-		/*
-		 * Mirror #0 or #1 means the original data block.
-		 * Mirror #2 is RAID5 parity block.
-		 * Mirror #3 is RAID6 Q block.
-		 */
-		if (mirror_num > 1)
-			stripe_index = nr_data_stripes(map) + mirror_num - 2;
-
-		/* We distribute the
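To make the new raid_map convention concrete: for each physical stripe
in the set, the array holds the logical byte address whose data that
stripe carries, or the RAID5_P_STRIPE/RAID6_Q_STRIPE sentinel for the
parity stripes. Below is a stand-alone sketch of filling such a map for
one full stripe-set; the one-slot-per-stripe-set parity rotation and the
helper name build_raid_map() are assumptions for illustration, since the
patch's actual placement arithmetic falls in the truncated part of the
diff above:

#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-1)
#define RAID6_Q_STRIPE ((uint64_t)-2)

/*
 * Fill the auxiliary map: raid_map[i] is the logical byte address held
 * by physical stripe i of this stripe-set, or a P/Q sentinel.  Parity
 * placement here simply rotates one slot per stripe-set (assumed).
 */
static void build_raid_map(uint64_t *raid_map, int num_stripes,
			   int nr_parity, /* 1 for RAID5, 2 for RAID6 */
			   uint64_t chunk_start, uint64_t stripe_len,
			   uint64_t stripe_nr)
{
	int nr_data = num_stripes - nr_parity;
	int rot = (int)(stripe_nr % (uint64_t)num_stripes);
	uint64_t logical = chunk_start +
			   stripe_nr * (uint64_t)nr_data * stripe_len;

	for (int i = 0; i < nr_data; i++)
		raid_map[(i + rot) % num_stripes] =
			logical + (uint64_t)i * stripe_len;

	raid_map[(nr_data + rot) % num_stripes] = RAID5_P_STRIPE;
	if (nr_parity == 2)
		raid_map[(nr_data + 1 + rot) % num_stripes] = RAID6_Q_STRIPE;
}

int main(void)
{
	uint64_t map[4];

	/* 4-disk RAID6, 64KiB stripe_len, stripe-set #1 of a chunk at 0 */
	build_raid_map(map, 4, 2, 0, 64 * 1024, 1);
	for (int i = 0; i < 4; i++) {
		if (map[i] == RAID5_P_STRIPE)
			printf("physical stripe %d: P\n", i);
		else if (map[i] == RAID6_Q_STRIPE)
			printf("physical stripe %d: Q\n", i);
		else
			printf("physical stripe %d: logical %llu\n", i,
			       (unsigned long long)map[i]);
	}
	return 0;
}

With a map like this in hand, btrfs_map_bio() can tell for each physical
stripe which logical bytes it holds -- or that it is P or Q -- without
re-deriving the chunk geometry.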
A start at RAID[56] support.
This is a preliminary attempt to add RAID5 and RAID6 support. So far it
doesn't attempt to write or read the parity blocks -- it just lays the
data blocks out as we want them, so it's effectively just a complex and
wasteful kind of RAID0.

The next step is to make btrfs_map_bio() do the right thing:
 - Satisfy read requests for mirrors #2 and #3 by recreating data from
   RAID5 parity or RAID6 error correction stripe respectively.
 - Write out parity and RAID6 blocks appropriately when data writes
   happen.

The former is relatively easy; the latter is slightly more interesting.
Chris suggests that we can avoid read/modify/write cycles for the
parity blocks by ensuring that the file system always writes a full set
of stripes. So for a RAID5 of 4 disks with 64KiB stripe_len, that would
be a 192KiB minimum write size, for example.

I'm not entirely sure of the best way to do that -- can we set a
minimum allocation size for a chunk, and then maybe have it fall back
to RAID1 (or a RAID5 chunk with smaller stripe_len) for smaller
allocations if they'd be too wasteful on the larger RAID5 chunks? And
how would we handle nodatacow?

I think I'm going to do a crappy r/m/w thing for now (in the knowledge
that the error correction stripes won't be powerfail-safe), and then we
can set about trying to render it unnecessary.

(Yes, I know I need to fix up btrfs_discard_extent() for RAID5 too --
it doesn't discard the parity stripes, and I may want to make it avoid
discarding partial stripes for now, until we fix the above.)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a8738..40168d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -653,6 +653,8 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
 
 struct btrfs_block_group_item {
 	__le64 used;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d829ef3..fadec64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2496,6 +2496,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
 				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_RAID5 |
+				   BTRFS_BLOCK_GROUP_RAID6 |
 				   BTRFS_BLOCK_GROUP_RAID10 |
 				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
@@ -2524,29 +2526,34 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	u64 tmp;
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP)))
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	return flags;
+	return flags | tmp;
 }
 
 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
@@ -6548,6 +6555,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
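Back on the full-stripe-write idea above: the minimum write that avoids
a parity read/modify/write cycle is simply nr_data_stripes * stripe_len.
A trivial sketch reproducing the 192KiB figure quoted in the mail
(full_stripe_bytes() is a hypothetical helper, not from the patch):

#include <stdint.h>
#include <stdio.h>

/* Bytes of data covered by one full stripe-set write. */
static uint64_t full_stripe_bytes(int num_disks, int nr_parity,
				  uint64_t stripe_len)
{
	return (uint64_t)(num_disks - nr_parity) * stripe_len;
}

int main(void)
{
	uint64_t k = 1024;

	/* The example from the mail: RAID5 on 4 disks, 64KiB stripe_len */
	printf("RAID5, 4 disks: %llu KiB\n",
	       (unsigned long long)(full_stripe_bytes(4, 1, 64 * k) / k));
	/* Same geometry as RAID6: one fewer data stripe per set */
	printf("RAID6, 4 disks: %llu KiB\n",
	       (unsigned long long)(full_stripe_bytes(4, 2, 64 * k) / k));
	return 0;
}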