[PATCH] md: make raid5 and raid6 robust against failure during recovery.

Linux Kernel Mailing List Sat, 12 Mar 2005 23:49:33 -0800

ChangeSet 1.2065.3.56, 2005/03/12 08:35:55-08:00, [EMAIL PROTECTED]

        [PATCH] md: make raid5 and raid6 robust against failure during recovery.
        
        Two problems are fixed here.
        1/ If the array is known to require a resync (parity update)
          but has too many failed devices, the resync cannot complete,
          yet it is retried indefinitely.
        2/ If the array has too many failed drives to be usable and a spare is
          available, reconstruction is attempted but cannot succeed.  This
          is also retried indefinitely.
        
        Signed-off-by: Neil Brown <[EMAIL PROTECTED]>
        Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
        Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>




 md.c        |   12 ++++++------
 raid5.c     |   13 +++++++++++++
 raid6main.c |   12 ++++++++++++
 3 files changed, 31 insertions(+), 6 deletions(-)


diff -Nru a/drivers/md/md.c b/drivers/md/md.c
--- a/drivers/md/md.c   2005-03-12 21:36:43 -08:00
+++ b/drivers/md/md.c   2005-03-12 21:36:43 -08:00
@@ -3545,18 +3545,18 @@
 
                /* no recovery is running.
                 * remove any failed drives, then
-                * add spares if possible
+                * add spares if possible.
+                * Spare are also removed and re-added, to allow
+                * the personality to fail the re-add.
                 */
-               ITERATE_RDEV(mddev,rdev,rtmp) {
+               ITERATE_RDEV(mddev,rdev,rtmp)
                        if (rdev->raid_disk >= 0 &&
-                           rdev->faulty &&
+                           (rdev->faulty || ! rdev->in_sync) &&
                            atomic_read(&rdev->nr_pending)==0) {
                                if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
                                        rdev->raid_disk = -1;
                        }
-                       if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
-                               spares++;
-               }
+
                if (mddev->degraded) {
                        ITERATE_RDEV(mddev,rdev,rtmp)
                                if (rdev->raid_disk < 0
diff -Nru a/drivers/md/raid5.c b/drivers/md/raid5.c
--- a/drivers/md/raid5.c        2005-03-12 21:36:43 -08:00
+++ b/drivers/md/raid5.c        2005-03-12 21:36:43 -08:00
@@ -1493,6 +1493,15 @@
                unplug_slaves(mddev);
                return 0;
        }
+       /* if there is 1 or more failed drives and we are trying
+        * to resync, then assert that we are finished, because there is
+        * nothing we can do.
+        */
+       if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+               int rv = (mddev->size << 1) - sector_nr;
+               md_done_sync(mddev, rv, 1);
+               return rv;
+       }
 
        x = sector_nr;
        chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1883,6 +1892,10 @@
        int found = 0;
        int disk;
        struct disk_info *p;
+
+       if (mddev->degraded > 1)
+               /* no point adding a device */
+               return 0;
 
        /*
         * find the disk ...
diff -Nru a/drivers/md/raid6main.c b/drivers/md/raid6main.c
--- a/drivers/md/raid6main.c    2005-03-12 21:36:43 -08:00
+++ b/drivers/md/raid6main.c    2005-03-12 21:36:43 -08:00
@@ -1652,6 +1652,15 @@
                unplug_slaves(mddev);
                return 0;
        }
+       /* if there are 2 or more failed drives and we are trying
+        * to resync, then assert that we are finished, because there is
+        * nothing we can do.
+        */
+       if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+               int rv = (mddev->size << 1) - sector_nr;
+               md_done_sync(mddev, rv, 1);
+               return rv;
+       }
 
        x = sector_nr;
        chunk_offset = sector_div(x, sectors_per_chunk);
@@ -2050,6 +2059,9 @@
        int disk;
        struct disk_info *p;
 
+       if (mddev->degraded > 2)
+               /* no point adding a device */
+               return 0;
        /*
         * find the disk ...
         */
-
To unsubscribe from this list: send the line "unsubscribe bk-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] md: make raid5 and raid6 robust against failure during recovery.

Reply via email to