Re: A start at RAID[56] support.

2009-07-14 Thread David Woodhouse
On Sat, 2009-07-11 at 15:40 +0100, David Woodhouse wrote:
> On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> > This is a preliminary attempt to add RAID5 and RAID6 support.
>
> Matching btrfs-progs patch...

And this makes it actually write the P and Q stripes...

These patches are available via git:// and http:// at
git.infradead.org/users/dwmw2/btrfs-progs-raid56.git

I can now make a 4-disk RAID6 file system, copy some stuff to it, then
kick out two of the disks and use it in degraded mode, and everything
seems to work fine.
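(Aside, for readers who haven't met the RAID6 math: below is a standalone
sketch -- my own illustration, not part of the patch -- of what the P and Q
stripes actually are. P is the plain XOR of the data stripes; Q is a
Reed-Solomon syndrome over GF(2^8). The raid6_gen_syndrome() that the patch
calls computes the same Q, just with much faster table-driven code.)

#include <stddef.h>
#include <stdint.h>

/* Multiply by x (i.e. by 2) in GF(2^8) modulo x^8 + x^4 + x^3 + x^2 + 1 */
static uint8_t gf2_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* ndata data stripes of 'len' bytes each; fills the P and Q stripes */
static void gen_pq(int ndata, size_t len, uint8_t **data,
		   uint8_t *p, uint8_t *q)
{
	size_t off;
	int i;

	for (off = 0; off < len; off++) {
		/* Horner's rule, from the highest-numbered stripe down */
		uint8_t pv = data[ndata - 1][off];
		uint8_t qv = pv;

		for (i = ndata - 2; i >= 0; i--) {
			pv ^= data[i][off];               /* P: plain XOR parity */
			qv = gf2_mul2(qv) ^ data[i][off]; /* Q: sum of g^i * D_i */
		}
		p[off] = pv;
		q[off] = qv;
	}
}

Losing any two stripes of the set leaves two independent equations (P and Q)
in at most two unknowns, which is why the 4-disk filesystem above survives
having two disks kicked out.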

diff --git a/Makefile b/Makefile
index 8097b5a..2d8d349 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -g -Werror -Os
 objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
  root-tree.o dir-item.o file-item.o inode-item.o \
  inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \
- volumes.o utils.o
+ volumes.o utils.o raid6.o
 
 #
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/disk-io.c b/disk-io.c
index addebe1..c33c31b 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -138,7 +138,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	dev_nr = 0;
 	length = blocksize;
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      bytenr, &length, &multi, 0);
+			      bytenr, &length, &multi, 0, NULL);
 	BUG_ON(ret);
 	device = multi->stripes[0].dev;
 	device->total_ios++;
@@ -196,7 +196,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	length = blocksize;
 	while (1) {
 		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-				      eb->start, &length, &multi, mirror_num);
+				      eb->start, &length, &multi, mirror_num,
+				      NULL);
 		BUG_ON(ret);
 		device = multi->stripes[0].dev;
 		eb->fd = device->fd;
@@ -224,12 +225,93 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	return NULL;
 }
 
+static int write_raid56_with_parity(struct extent_buffer *eb,
+				    struct btrfs_multi_bio *multi,
+				    u64 stripe_len, u64 *raid_map)
+{
+	struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+	u64 start_ofs, end_ofs;
+	int i, j;
+	int ret;
+
+	start_ofs = eb->start % stripe_len;
+	end_ofs = start_ofs + eb->len;
+	BUG_ON(end_ofs > stripe_len);
+
+	j = 0;
+	for (i = 0; i < multi->num_stripes; i++) {
+		struct extent_buffer *new_eb;
+		if (start_ofs) {
+			multi->stripes[i].physical += start_ofs;
+			if (raid_map[i] != (u64)-1 && raid_map[i] != (u64)-2)
+				raid_map[i] += start_ofs;
+		}
+		if (raid_map[i] == eb->start) {
+			eb->dev_bytenr = multi->stripes[i].physical;
+			eb->fd = multi->stripes[i].dev->fd;
+			multi->stripes[i].dev->total_ios++;
+			ebs[j++] = eb;
+			continue;
+		}
+		new_eb = kmalloc(sizeof(*eb) + eb->len, GFP_NOFS);
+		BUG_ON(!new_eb);
+		new_eb->dev_bytenr = multi->stripes[i].physical;
+		new_eb->fd = multi->stripes[i].dev->fd;
+		multi->stripes[i].dev->total_ios++;
+		new_eb->len = eb->len;
+		if (raid_map[i] == (u64)-1) {
+			p_eb = new_eb;
+		} else if (raid_map[i] == (u64)-2) {
+			q_eb = new_eb;
+		} else {
+			ret = read_extent_from_disk(new_eb);
+			BUG_ON(ret);
+			ebs[j++] = new_eb;
+		}
+	}
+	ebs[j++] = p_eb;
+	if (q_eb) {
+		void *pointers[multi->num_stripes];
+
+		ebs[j++] = q_eb;
+
+		for (i = 0; i < multi->num_stripes; i++)
+			pointers[i] = ebs[i]->data;
+
+		raid6_gen_syndrome(multi->num_stripes, eb->len, pointers);
+
+		ret = write_extent_to_disk(q_eb);
+		BUG_ON(ret);
+	} else {
+		memcpy(p_eb->data, ebs[0]->data, eb->len);
+		for (j = 1; j < multi->num_stripes - 1; j++) {
+			for (i = 0; i < eb->len; i += sizeof(unsigned long)) {
+				*(unsigned long *)(p_eb->data + i) ^=
+					*(unsigned long *)(ebs[j]->data + i);
+			}
+		}
+	}
+
+	ret = write_extent_to_disk(p_eb);
+	BUG_ON(ret);
+
+	ret = write_extent_to_disk(eb);
+	BUG_ON(ret);
+
+	for (i = 0; i < multi->num_stripes; i++)
+		if (ebs[i] != eb)
+

Re: A start at RAID[56] support.

2009-07-13 Thread David Woodhouse
On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> This is a preliminary attempt to add RAID5 and RAID6 support.
>
> So far it doesn't attempt to write or read the parity blocks -- it
> just lays the data blocks out as we want them, so it's effectively
> just a complex and wasteful kind of RAID0.
>
> The next step is to make btrfs_map_bio() do the right thing:
>  - Satisfy read requests for mirrors #2 and #3 by recreating data from
>    RAID5 parity or RAID6 error correction stripe respectively.
>  - Write out parity and RAID6 blocks appropriately when data writes
>    happen.

Actually, the next step is to tweak __btrfs_map_block() a bit more to
let it return information about the whole stripe-set, so that
btrfs_map_bio() _can_ do what we say above...

So rather than just mapping the requested address as if it's RAID0, we
(where appropriate) return information about the _entire_ disk set in
the btrfs_multi_bio, with an auxiliary array giving the _logical_ offset
corresponding to each physical stripe in the referenced set (with
special values for the P and Q stripes).

We do this for all writes, and for reads where mirror_num > 1 (i.e. when
we're being asked to rebuild it from parity, rather than reading the
original data blocks).
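
For concreteness, here's what that auxiliary array might hold for a 4-disk
RAID6 chunk with a 64KiB stripe_len -- a made-up illustration, not output
from the code, and in practice the P and Q positions rotate from one
stripe-set to the next:

typedef unsigned long long u64;

#define RAID5_P_STRIPE ((u64)-1)	/* magic value: the P (XOR parity) stripe */
#define RAID6_Q_STRIPE ((u64)-2)	/* magic value: the Q (syndrome) stripe */

#define L 0x40000000ULL			/* hypothetical start of the stripe-set */

/* raid_map[i] is the logical address served by multi->stripes[i] */
static const u64 raid_map[4] = {
	L,			/* stripes[0]: data, logical L         */
	L + 0x10000,		/* stripes[1]: data, logical L + 64KiB */
	RAID5_P_STRIPE,		/* stripes[2]: P parity                */
	RAID6_Q_STRIPE,		/* stripes[3]: Q syndrome              */
};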

  Available via git:// and http:// at git.infradead.org/users/dwmw2/btrfs-raid56.git

commit ed90c58ba7c60555af4b8c00a104c7d71f6db6d2
Author: David Woodhouse david.woodho...@intel.com
Date:   Sun Jul 12 11:15:22 2009 +0100

Btrfs: Let btrfs_map_block() return full stripe information for RAID[56]

... in the cases where it's necessary -- which is for a write, or for a
parity recovery attempt. We'll let btrfs_map_bio() do the rest.

Signed-off-by: David Woodhouse david.woodho...@intel.com

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b231ef..55facd3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -62,6 +62,11 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
+#define RAID5_P_STRIPE ((u64)-1)
+#define RAID6_Q_STRIPE ((u64)-2)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || ((x) == RAID6_Q_STRIPE))
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -2614,7 +2619,8 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
-			     int mirror_num, struct page *unplug_page)
+			     int mirror_num, struct page *unplug_page,
+			     u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -2622,6 +2628,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	u64 *raid_map = NULL;
 	int stripes_allocated = 8;
 	int stripes_required = 1;
 	int stripe_index;
@@ -2674,9 +2681,24 @@ again:
 			max_errors = 1;
 		}
 	}
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
-	    stripes_allocated < stripes_required) {
-		stripes_allocated = map->num_stripes;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
+	    && multi_ret && (rw & (1 << BIO_RW) || mirror_num > 1)
+	    && raid_map_ret) {
+		/* RAID[56] write or recovery. Return all stripes */
+		stripes_required = map->num_stripes;
+		max_errors = nr_parity_stripes(map);
+
+		/* Only allocate the map if we've already got a large enough multi_ret */
+		if (stripes_allocated >= stripes_required) {
+			raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+			if (!raid_map) {
+				free_extent_map(em);
+				kfree(multi);
+				return -ENOMEM;
+			}
+		}
+	}
+	if (multi_ret && stripes_allocated < stripes_required) {
+		stripes_allocated = stripes_required;
 		free_extent_map(em);
 		kfree(multi);
 		goto again;
@@ -2749,18 +2771,43 @@ again:
 
 		stripe_index = do_div(stripe_nr, nr_data_stripes(map));
 
-		/*
-		 * Mirror #0 or #1 means the original data block.
-		 * Mirror #2 is RAID5 parity block.
-		 * Mirror #3 is RAID6 Q block.
-		 */
-		if (mirror_num > 1)
-			stripe_index = nr_data_stripes(map) + mirror_num - 2;
-
-		/* We distribute the

A start at RAID[56] support.

2009-07-11 Thread David Woodhouse
This is a preliminary attempt to add RAID5 and RAID6 support.

So far it doesn't attempt to write or read the parity blocks -- it just
lays the data blocks out as we want them, so it's effectively just a
complex and wasteful kind of RAID0.

The next step is to make btrfs_map_bio() do the right thing:
 - Satisfy read requests for mirrors #2 and #3 by recreating data from
   RAID5 parity or RAID6 error correction stripe respectively.
 - Write out parity and RAID6 blocks appropriately when data writes
   happen.

The former is relatively easy; the latter is slightly more interesting.
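
A sketch of the former, with made-up naming (my illustration, not code from
the patch): a single missing RAID5 data stripe is just the XOR of the parity
stripe and the surviving data stripes. The mirror #3 (RAID6) case
additionally needs the GF(2^8) algebra to solve for the unknowns from Q.

#include <stddef.h>
#include <stdint.h>

/*
 * Rebuild the missing data stripe into 'out'.  'stripes' holds the
 * parity stripe plus every surviving data stripe ('nsurvivors'
 * buffers in total), each 'len' bytes long.
 */
static void raid5_rebuild(uint8_t *out, uint8_t **stripes,
			  int nsurvivors, size_t len)
{
	size_t off;
	int i;

	for (off = 0; off < len; off++) {
		uint8_t v = stripes[0][off];

		for (i = 1; i < nsurvivors; i++)
			v ^= stripes[i][off];
		out[off] = v;	/* D_missing = P ^ (all surviving D_i) */
	}
}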

Chris suggests that we can avoid read/modify/write cycles for the parity
blocks by ensuring that the file system always writes a full set of
stripes. So for a RAID5 of 4 disks with 64KiB stripe_len, that would be
a 192KiB minimum write size, for example.
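
(The arithmetic, spelled out -- the helper and its names are mine, purely
illustrative:)

#include <stdint.h>

/* A full-stripe write must cover every data stripe in the set */
static uint64_t full_stripe_bytes(int num_disks, int nr_parity,
				  uint64_t stripe_len)
{
	return (uint64_t)(num_disks - nr_parity) * stripe_len;
}

/*
 * full_stripe_bytes(4, 1, 65536) == 196608, i.e. the 192KiB RAID5
 * example above; the RAID6 equivalent would be 131072 (128KiB).
 */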

I'm not entirely sure of the best way to do that -- can we set a minimum
allocation size for a chunk, and then maybe have it fall back to RAID1
(or a RAID5 chunk with smaller stripe_len) for smaller allocations if
they'd be too wasteful on the larger RAID5 chunks?

And how would we handle nodatacow?

I think I'm going to do a crappy r/m/w thing for now (in the knowledge
that the error correction stripes won't be powerfail-safe), and then we
can set about trying to render it unnecessary.
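
(That crappy r/m/w thing boils down to the classic small-write parity
update -- sketched below with my own naming, not the patch's code. The
powerfail problem is visible right in it: data and parity are written
non-atomically.)

#include <stddef.h>
#include <stdint.h>

/*
 * RAID5 small-write update: P_new = P_old ^ D_old ^ D_new, so only
 * the target data stripe and the parity stripe need to be read and
 * rewritten.  Crash between the two writes and P no longer covers
 * the stripe -- the powerfail hole mentioned above.
 */
static void raid5_rmw_parity(uint8_t *parity, const uint8_t *old_data,
			     const uint8_t *new_data, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		parity[i] ^= old_data[i] ^ new_data[i];
}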

(Yes, I know I need to fix up btrfs_discard_extent() for RAID5 too -- it
doesn't discard the parity stripes, and I may want to make it avoid
discarding partial stripes for now, until we fix the above.)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a8738..40168d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -653,6 +653,8 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	    (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
 
 struct btrfs_block_group_item {
 	__le64 used;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d829ef3..fadec64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2496,6 +2496,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
 				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_RAID5 |
+				   BTRFS_BLOCK_GROUP_RAID6 |
 				   BTRFS_BLOCK_GROUP_RAID10 |
 				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
@@ -2524,29 +2526,34 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	u64 tmp;
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP)))
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	return flags;
+	return flags | tmp;
 }
 
 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
@@ -6548,6 +6555,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 |