Linus (et al) The raid1 code has a concept of finding a "next available drive". It uses this for read balancing. Currently, this is implemented via a simple linked list that links the working drives together. However, there is no locking to make sure that the list does not get modified by two processors at the same time, and hence corrupted (though it is changed so rarely that that is unlikely). Also, when choosing a drive to read from for rebuilding a new spare, the "last_used" drive is used, even though that might be a drive which failed recently. i.e. there is no check the the "last_used" drive is actually still valid. I managed to exploit this to get the kernel into a tight spin. This patch discards the linked list and just walks the array ignoring failed drives. I also makes sure that "last_used" is always validated before being used. patch against 2.4.0-test12-pre8 NeilBrown --- ./include/linux/raid/raid1.h 2000/12/10 22:38:20 1.1 +++ ./include/linux/raid/raid1.h 2000/12/10 22:38:25 1.2 @@ -7,7 +7,6 @@ int number; int raid_disk; kdev_t dev; - int next; int sect_limit; int head_position; --- ./drivers/md/raid1.c 2000/12/10 22:36:34 1.2 +++ ./drivers/md/raid1.c 2000/12/10 22:38:25 1.3 @@ -463,16 +463,12 @@ if (conf->resync_mirrors) goto rb_out; - if (conf->working_disks < 2) { - int i = 0; - - while( !conf->mirrors[new_disk].operational && - (i < MD_SB_DISKS) ) { - new_disk = conf->mirrors[new_disk].next; - i++; - } - - if (i >= MD_SB_DISKS) { + + /* make sure that disk is operational */ + while( !conf->mirrors[new_disk].operational) { + if (new_disk <= 0) new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) { /* * This means no working disk was found * Nothing much to do, lets not change anything @@ -480,11 +476,13 @@ */ new_disk = conf->last_used; + + goto rb_out; } - - goto rb_out; } - + disk = new_disk; + /* now disk == new_disk == starting point for search */ + /* * Don't touch anything for sequential reads. */ @@ -501,16 +499,16 @@ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { conf->sect_count = 0; - - while( new_disk != conf->mirrors[new_disk].next ) { - if ((conf->mirrors[new_disk].write_only) || - (!conf->mirrors[new_disk].operational) ) - continue; - - new_disk = conf->mirrors[new_disk].next; - break; - } - + + do { + if (new_disk<=0) + new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) + break; + } while ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational)); + goto rb_out; } @@ -519,8 +517,10 @@ /* Find the disk which is closest */ - while( conf->mirrors[disk].next != conf->last_used ) { - disk = conf->mirrors[disk].next; + do { + if (disk <= 0) + disk = conf->raid_disks; + disk--; if ((conf->mirrors[disk].write_only) || (!conf->mirrors[disk].operational)) @@ -534,7 +534,7 @@ current_distance = new_distance; new_disk = disk; } - } + } while (disk != conf->last_used); rb_out: conf->mirrors[new_disk].head_position = this_sector + sectors; @@ -702,16 +702,6 @@ return sz; } -static void unlink_disk (raid1_conf_t *conf, int target) -{ - int disks = MD_SB_DISKS; - int i; - - for (i = 0; i < disks; i++) - if (conf->mirrors[i].next == target) - conf->mirrors[i].next = conf->mirrors[target].next; -} - #define LAST_DISK KERN_ALERT \ "raid1: only one disk left and IO error.\n" @@ -735,7 +725,6 @@ mdp_super_t *sb = mddev->sb; mirror->operational = 0; - unlink_disk(conf, failed); mark_disk_faulty(sb->disks+mirror->number); mark_disk_nonsync(sb->disks+mirror->number); mark_disk_inactive(sb->disks+mirror->number); @@ -786,25 +775,6 @@ #undef DISK_FAILED #undef START_SYNCING -/* - * Insert the spare disk into the drive-ring - */ -static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) -{ - int j, next; - int disks = MD_SB_DISKS; - struct mirror_info *p = conf->mirrors; - - for (j = 0; j < disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; - } - - printk("raid1: bug: no read-operational devices\n"); -} static void print_raid1_conf (raid1_conf_t *conf) { @@ -1017,7 +987,6 @@ */ fdisk->spare = 0; fdisk->write_only = 0; - link_disk(conf, fdisk); /* * if we activate a spare, we definitely replace a @@ -1325,6 +1294,7 @@ struct raid1_bh *r1_bh; struct buffer_head *bh; int bsize; + int disk; spin_lock_irq(&conf->segment_lock); if (!block_nr) { @@ -1377,6 +1347,16 @@ * could dedicate one to rebuild and others to * service read requests .. */ + disk = conf->last_used; + /* make sure disk is operational */ + while (!conf->mirrors[disk].operational) { + if (disk <= 0) disk = conf->raid_disks; + disk--; + if (disk == conf->last_used) + break; + } + conf->last_used = disk; + mirror = conf->mirrors+conf->last_used; r1_bh = raid1_alloc_buf (conf); @@ -1717,19 +1697,10 @@ * find the first working one and use it as a starting point * to read balancing. */ - for (j = 0; !conf->mirrors[j].operational; j++) + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) /* nothing */; conf->last_used = j; - /* - * initialize the 'working disks' list. - */ - for (i = conf->raid_disks - 1; i >= 0; i--) { - if (conf->mirrors[i].operational) { - conf->mirrors[i].next = j; - j = i; - } - } if (conf->working_disks != sb->raid_disks) { printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED]