This is a preliminary attempt to add RAID5 and RAID6 support. So far it doesn't attempt to write or read the parity blocks -- it just lays the data blocks out as we want them, so it's effectively just a complex and wasteful kind of RAID0.
The next step is to make btrfs_map_bio() do the right thing: - Satisfy read requests for mirrors #2 and #3 by recreating data from RAID5 parity or RAID6 error correction stripe respectively. - Write out parity and RAID6 blocks appropriately when data writes happen. The former is relatively easy; the latter is slightly more interesting. Chris suggests that we can avoid read/modify/write cycles for the parity blocks by ensuring that the file system always writes a full set of stripes. So for a RAID5 of 4 disks with 64KiB stripe_len, that would be a 192KiB minimum write size, for example. I'm not entirely sure of the best way to do that -- can we set a minimum allocation size for a chunk, and then maybe have it fall back to RAID1 (or a RAID5 chunk with smaller stripe_len) for smaller allocations if they'd be too wasteful on the larger RAID5 chunks? And how would we handle nodatacow? I think I'm going to do a crappy r/m/w thing for now (in the knowledge that the error correction stripes won't be powerfail-safe), and then we can set about trying to render it unnecessary. (Yes, I know I need to fix up btrfs_discard_extent() for RAID5 too -- it doesn't discard the parity stripes, and I may want to make it avoid discarding partial stripes for now, until we fix the above.) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a8738..40168d7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -653,6 +653,8 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) struct btrfs_block_group_item { __le64 used; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d829ef3..fadec64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2496,6 +2496,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP); if (extra_flags) { @@ -2524,29 +2526,34 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache) u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; + u64 tmp; + /* First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - return flags; + return flags | tmp; } static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) @@ -6548,6 +6555,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; num_devices = root->fs_info->fs_devices->rw_devices; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1f10ea..3b231ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,21 @@ struct map_lookup { struct btrfs_bio_stripe stripes[]; }; +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -1140,6 +1155,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid5\n"); + ret = -EINVAL; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + printk(KERN_ERR "btrfs: unable to go below three " + "devices on raid6\n"); + ret = -EINVAL; + goto out; + } + if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2090,6 +2120,10 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size; else if (type & BTRFS_BLOCK_GROUP_RAID10) return calc_size * (num_stripes / sub_stripes); + else if (type & BTRFS_BLOCK_GROUP_RAID5) + return calc_size * (num_stripes - 1); + else if (type & BTRFS_BLOCK_GROUP_RAID6) + return calc_size * (num_stripes - 2); else return calc_size * num_stripes; } @@ -2153,6 +2187,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = 2; min_stripes = 4; } + if (type & (BTRFS_BLOCK_GROUP_RAID5)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID6)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 3) + return -ENOSPC; + min_stripes = 3; + } if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; @@ -2539,6 +2585,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; free_extent_map(em); @@ -2645,6 +2695,7 @@ again: stripe_offset = offset - stripe_offset; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ @@ -2691,6 +2742,25 @@ again: map->sub_stripes, stripe_index + current->pid % map->sub_stripes); } + + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + u64 tmp; + + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + tmp = stripe_nr + stripe_index; + stripe_index = do_div(tmp, map->num_stripes); + } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -2749,6 +2819,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; spin_lock(&em_tree->lock); @@ -2759,10 +2830,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map = (struct map_lookup *)em->bdev; length = em->len; + rmap_len = map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + do_div(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); BUG_ON(!buf); @@ -2782,8 +2860,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; WARN_ON(nr >= map->num_stripes); for (j = 0; j < nr; j++) { if (buf[j] == bytenr) @@ -2797,7 +2878,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; free_extent_map(em); return 0; -- David Woodhouse Open Source Technology Centre david.woodho...@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html