[PATCH 006 of 13] md: Infrastructure to allow normal IO to continue while array is expanding.

2006-03-16 Thread NeilBrown

We need to allow different stripes to have different effective sizes,
and to use the appropriate size for each.
Also, when a stripe is being expanded, we must block any IO attempts
until the stripe is stable again.

Key elements in this change are:
 - each stripe_head gets a 'disks' field which is part of the hash key,
   thus there can sometimes be two stripe_heads for the same area of
   the array, but covering different numbers of devices.  One of these
   will be marked STRIPE_EXPANDING and so won't accept new requests.
 - conf->expand_progress tracks how far the expansion has progressed and
   is used to determine whether the target part of the array has been
   expanded yet or not.  (Both additions are sketched below.)
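
For orientation, the new per-stripe and per-array state amounts to roughly the
following.  This is only a sketch inferred from the field names used in the
diff below (the raid5.h hunk itself is not reproduced in this message), so the
exact declarations may differ:

struct stripe_head {
        /* ... existing fields ... */
        int             disks;  /* number of devices this stripe covers;
                                 * part of the hash key, so two stripe_heads
                                 * for the same sector can coexist while a
                                 * reshape is passing over that sector */
};

struct raid5_private_data {     /* raid5_conf_t */
        /* ... existing fields ... */
        sector_t        expand_progress;        /* MaxSector when no expansion
                                                 * is in progress */
        int             previous_raid_disks;    /* geometry before the
                                                 * expansion started */
};

With 'disks' part of the lookup key, __find_stripe() only returns a
stripe_head whose geometry matches the caller's, which is what allows an
old-geometry and a new-geometry stripe_head for the same sector to exist at
the same time.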


Signed-off-by: Neil Brown [EMAIL PROTECTED]

### Diffstat output
 ./drivers/md/raid5.c         |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
 ./include/linux/raid/raid5.h |    6 ++
 2 files changed, 64 insertions(+), 30 deletions(-)

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~   2006-03-17 11:48:56.0 +1100
+++ ./drivers/md/raid5.c    2006-03-17 11:48:56.0 +1100
@@ -178,10 +178,10 @@ static int grow_buffers(struct stripe_he
 
 static void raid5_build_block (struct stripe_head *sh, int i);
 
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
 {
raid5_conf_t *conf = sh->raid_conf;
-   int disks = conf->raid_disks, i;
+   int i;
 
if (atomic_read(&sh->count) != 0)
BUG();
@@ -198,7 +198,9 @@ static void init_stripe(struct stripe_he
sh->pd_idx = pd_idx;
sh->state = 0;
 
-   for (i=disks; i--; ) {
+   sh->disks = disks;
+
+   for (i = sh->disks; i--; ) {
struct r5dev *dev = sh->dev[i];

if (dev->toread || dev->towrite || dev->written ||
@@ -215,7 +217,7 @@ static void init_stripe(struct stripe_he
insert_hash(conf, sh);
 }
 
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
 {
struct stripe_head *sh;
struct hlist_node *hn;
@@ -223,7 +225,7 @@ static struct stripe_head *__find_stripe
CHECK_DEVLOCK();
PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
-   if (sh->sector == sector)
+   if (sh->sector == sector && sh->disks == disks)
return sh;
PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL;
@@ -232,8 +234,8 @@ static struct stripe_head *__find_stripe
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(request_queue_t *q);
 
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
-             int pd_idx, int noblock)
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
+             int pd_idx, int noblock)
 {
struct stripe_head *sh;
 
@@ -245,7 +247,7 @@ static struct stripe_head *get_active_st
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0,
conf->device_lock, /* nothing */);
-   sh = __find_stripe(conf, sector);
+   sh = __find_stripe(conf, sector, disks);
if (!sh) {
if (!conf->inactive_blocked)
sh = get_free_stripe(conf);
@@ -263,7 +265,7 @@ static struct stripe_head *get_active_st
);
conf->inactive_blocked = 0;
} else
-   init_stripe(sh, sector, pd_idx);
+   init_stripe(sh, sector, pd_idx, disks);
} else {
if (atomic_read(&sh->count)) {
if (!list_empty(&sh->lru))
@@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t 
kmem_cache_free(conf->slab_cache, sh);
return 0;
}
+   sh->disks = conf->raid_disks;
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
@@ -470,7 +473,7 @@ static int raid5_end_read_request(struct
 {
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
-   int disks = conf->raid_disks, i;
+   int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 
if (bi->bi_size)
@@ -568,7 +571,7 @@ static int raid5_end_write_request (stru
 {
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
-   int disks = conf->raid_disks, i;
+   int disks = sh->disks, i;

Re: [PATCH 006 of 13] md: Infrastructure to allow normal IO to continue while array is expanding.

2006-03-16 Thread Andrew Morton
NeilBrown [EMAIL PROTECTED] wrote:

  -    retry:
       prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
  -    sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
  +    sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
       if (sh) {
  -        if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
  -            /* Add failed due to overlap.  Flush everything
  +        if (unlikely(conf->expand_progress != MaxSector)) {
  +            /* expansion might have moved on while waiting for a
  +             * stripe, so we must do the range check again.
  +             */
  +            int must_retry = 0;
  +            spin_lock_irq(&conf->device_lock);
  +            if (logical_sector < conf->expand_progress &&
  +                disks == conf->previous_raid_disks)
  +                /* mismatch, need to try again */
  +                must_retry = 1;
  +            spin_unlock_irq(&conf->device_lock);
  +            if (must_retry) {
  +                release_stripe(sh);
  +                goto retry;
  +            }
  +        }

The locking in here looks strange.  We take the lock, do some arithmetic
and some tests and then drop the lock again.  Is it not possible that the
result of those tests now becomes invalid?



Re: [PATCH 006 of 13] md: Infrastructure to allow normal IO to continue while array is expanding.

2006-03-16 Thread Neil Brown
On Thursday March 16, [EMAIL PROTECTED] wrote:
 
 The locking in here looks strange.  We take the lock, do some arithmetic
 and some tests and then drop the lock again.  Is it not possible that the
 result of those tests now becomes invalid?

Obviously another comment missing.
 conf->expand_progress is a sector_t and so could be 64 bits on a 32-bit
 platform, so I cannot be sure it is updated atomically.  I therefore always
 access it under a lock (unless I am only comparing it for equality with ~0,
 i.e. MaxSector).
 
 Yes, the result can become invalid, but only in one direction: as
 expand_progress only ever increases, it may pass logical_sector after the
 test.  When that happens, STRIPE_EXPANDING gets set on the stripe_head
 covering logical_sector.
 Because we obtained a reference to that stripe_head *before* this test,
 and check STRIPE_EXPANDING *after* the test, we can easily catch that
 transition.

 Putting it another way: this test catches cases where logical_sector is a
 long way from expand_progress, the subsequent test of STRIPE_EXPANDING
 catches cases where they are close together, and the ordering with respect
 to get_active_stripe ensures there are no holes in between.

Now to put that into a few short-but-clear comments.
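
For example, the retry check with those comments folded in might read roughly
as follows (same logic as the hunk quoted above; the comment wording is only a
sketch, not the final text):

    if (unlikely(conf->expand_progress != MaxSector)) {
        /* The expansion may have moved on while we waited for a
         * stripe, so redo the range check.  expand_progress is a
         * sector_t and may not be updated atomically on 32-bit hosts,
         * so it is only read under device_lock (comparing it for
         * equality with MaxSector is the one safe lock-free access).
         */
        int must_retry = 0;
        spin_lock_irq(&conf->device_lock);
        if (logical_sector < conf->expand_progress &&
            disks == conf->previous_raid_disks)
            /* We hashed this stripe with the old geometry but the
             * expansion has already passed this sector, so retry
             * with the new geometry.
             */
            must_retry = 1;
        spin_unlock_irq(&conf->device_lock);
        if (must_retry) {
            release_stripe(sh);
            goto retry;
        }
        /* If expand_progress passes logical_sector after the check
         * above, STRIPE_EXPANDING is set on the stripe_head we already
         * hold a reference to, and the later STRIPE_EXPANDING test
         * catches that transition, so there is no window in which a
         * bio can be added to a stripe that is being expanded.
         */
    }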

Thanks again!

NeilBrown