Linus (et al)

 The raid1 code has a concept of finding a "next available drive".  It
 uses this for read balancing.
 Currently, this is implemented via a simple linked list that links
 the working drives together.

 However, there is no locking to make sure that the list does not get
 modified by two processors at the same time, and hence corrupted
 (though it is changed so  rarely that that is unlikely).
 Also, when choosing a drive to read from for rebuilding a new spare,
 the "last_used" drive is used, even though that might be a drive which
 failed recently.  i.e. there is no check the the "last_used" drive is
 actually still valid.  I managed to exploit this to get the kernel
 into a tight spin.

 This patch discards the linked list and just walks the array ignoring
 failed drives.  I also makes sure that "last_used" is always
 validated before being used.

 patch against 2.4.0-test12-pre8

NeilBrown

--- ./include/linux/raid/raid1.h        2000/12/10 22:38:20     1.1
+++ ./include/linux/raid/raid1.h        2000/12/10 22:38:25     1.2
@@ -7,7 +7,6 @@
        int             number;
        int             raid_disk;
        kdev_t          dev;
-       int             next;
        int             sect_limit;
        int             head_position;
 
--- ./drivers/md/raid1.c        2000/12/10 22:36:34     1.2
+++ ./drivers/md/raid1.c        2000/12/10 22:38:25     1.3
@@ -463,16 +463,12 @@
        if (conf->resync_mirrors)
                goto rb_out;
        
-       if (conf->working_disks < 2) {
-               int i = 0;
-               
-               while( !conf->mirrors[new_disk].operational &&
-                               (i < MD_SB_DISKS) ) {
-                       new_disk = conf->mirrors[new_disk].next;
-                       i++;
-               }
-               
-               if (i >= MD_SB_DISKS) {
+
+       /* make sure that disk is operational */
+       while( !conf->mirrors[new_disk].operational) {
+               if (new_disk <= 0) new_disk = conf->raid_disks;
+               new_disk--;
+               if (new_disk == disk) {
                        /*
                         * This means no working disk was found
                         * Nothing much to do, lets not change anything
@@ -480,11 +476,13 @@
                         */
                        
                        new_disk = conf->last_used;
+
+                       goto rb_out;
                }
-               
-               goto rb_out;
        }
-
+       disk = new_disk;
+       /* now disk == new_disk == starting point for search */
+       
        /*
         * Don't touch anything for sequential reads.
         */
@@ -501,16 +499,16 @@
        
        if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
                conf->sect_count = 0;
-               
-               while( new_disk != conf->mirrors[new_disk].next ) {
-                       if ((conf->mirrors[new_disk].write_only) ||
-                               (!conf->mirrors[new_disk].operational) )
-                               continue;
-                       
-                       new_disk = conf->mirrors[new_disk].next;
-                       break;
-               }
-               
+
+               do {
+                       if (new_disk<=0)
+                               new_disk = conf->raid_disks;
+                       new_disk--;
+                       if (new_disk == disk)
+                               break;
+               } while ((conf->mirrors[new_disk].write_only) ||
+                        (!conf->mirrors[new_disk].operational));
+
                goto rb_out;
        }
        
@@ -519,8 +517,10 @@
        
        /* Find the disk which is closest */
        
-       while( conf->mirrors[disk].next != conf->last_used ) {
-               disk = conf->mirrors[disk].next;
+       do {
+               if (disk <= 0)
+                       disk = conf->raid_disks;
+               disk--;
                
                if ((conf->mirrors[disk].write_only) ||
                                (!conf->mirrors[disk].operational))
@@ -534,7 +534,7 @@
                        current_distance = new_distance;
                        new_disk = disk;
                }
-       }
+       } while (disk != conf->last_used);
 
 rb_out:
        conf->mirrors[new_disk].head_position = this_sector + sectors;
@@ -702,16 +702,6 @@
        return sz;
 }
 
-static void unlink_disk (raid1_conf_t *conf, int target)
-{
-       int disks = MD_SB_DISKS;
-       int i;
-
-       for (i = 0; i < disks; i++)
-               if (conf->mirrors[i].next == target)
-                       conf->mirrors[i].next = conf->mirrors[target].next;
-}
-
 #define LAST_DISK KERN_ALERT \
 "raid1: only one disk left and IO error.\n"
 
@@ -735,7 +725,6 @@
        mdp_super_t *sb = mddev->sb;
 
        mirror->operational = 0;
-       unlink_disk(conf, failed);
        mark_disk_faulty(sb->disks+mirror->number);
        mark_disk_nonsync(sb->disks+mirror->number);
        mark_disk_inactive(sb->disks+mirror->number);
@@ -786,25 +775,6 @@
 #undef DISK_FAILED
 #undef START_SYNCING
 
-/*
- * Insert the spare disk into the drive-ring
- */
-static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror)
-{
-       int j, next;
-       int disks = MD_SB_DISKS;
-       struct mirror_info *p = conf->mirrors;
-
-       for (j = 0; j < disks; j++, p++)
-               if (p->operational && !p->write_only) {
-                       next = p->next;
-                       p->next = mirror->raid_disk;
-                       mirror->next = next;
-                       return;
-               }
-
-       printk("raid1: bug: no read-operational devices\n");
-}
 
 static void print_raid1_conf (raid1_conf_t *conf)
 {
@@ -1017,7 +987,6 @@
                 */
                fdisk->spare = 0;
                fdisk->write_only = 0;
-               link_disk(conf, fdisk);
 
                /*
                 * if we activate a spare, we definitely replace a
@@ -1325,6 +1294,7 @@
        struct raid1_bh *r1_bh;
        struct buffer_head *bh;
        int bsize;
+       int disk;
 
        spin_lock_irq(&conf->segment_lock);
        if (!block_nr) {
@@ -1377,6 +1347,16 @@
         * could dedicate one to rebuild and others to
         * service read requests ..
         */
+       disk = conf->last_used;
+       /* make sure disk is operational */
+       while (!conf->mirrors[disk].operational) {
+               if (disk <= 0) disk = conf->raid_disks;
+               disk--;
+               if (disk == conf->last_used)
+                       break;
+       }
+       conf->last_used = disk;
+       
        mirror = conf->mirrors+conf->last_used;
        
        r1_bh = raid1_alloc_buf (conf);
@@ -1717,19 +1697,10 @@
         * find the first working one and use it as a starting point
         * to read balancing.
         */
-       for (j = 0; !conf->mirrors[j].operational; j++)
+       for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
                /* nothing */;
        conf->last_used = j;
 
-       /*
-        * initialize the 'working disks' list.
-        */
-       for (i = conf->raid_disks - 1; i >= 0; i--) {
-               if (conf->mirrors[i].operational) {
-                       conf->mirrors[i].next = j;
-                       j = i;
-               }
-       }
 
        if (conf->working_disks != sb->raid_disks) {
                printk(KERN_ALERT "raid1: md%d, not all disks are operational -- 
trying to recover array\n", mdidx(mddev));
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]

Reply via email to