[PATCH 004 of 4] md: Fix an occasional deadlock in raid5 - FIX
(This should be merged with fix-occasional-deadlock-in-raid5.patch) As we don't call stripe_handle in make_request any more, we need to clear STRIPE_DELAYED (previously done by stripe_handle) to ensure that we test if the stripe still needs to be delayed or not. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |1 + 1 file changed, 1 insertion(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2008-01-18 14:58:55.0 +1100 +++ ./drivers/md/raid5.c2008-01-18 14:59:53.0 +1100 @@ -3549,6 +3549,7 @@ static int make_request(struct request_q } finish_wait(conf-wait_for_overlap, w); set_bit(STRIPE_HANDLE, sh-state); + clear_bit(STRIPE_DELAYED, sh-state); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 4] md: Change ITERATE_RDEV_GENERIC to rdev_for_each_list, and remove ITERATE_RDEV_PENDING.
Finish ITERATE_ to for_each conversion. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |8 ./include/linux/raid/md_k.h | 14 -- 2 files changed, 8 insertions(+), 14 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2008-01-18 11:19:09.0 +1100 +++ ./drivers/md/md.c 2008-01-18 11:19:24.0 +1100 @@ -3766,7 +3766,7 @@ static void autorun_devices(int part) printk(KERN_INFO md: considering %s ...\n, bdevname(rdev0-bdev,b)); INIT_LIST_HEAD(candidates); - ITERATE_RDEV_PENDING(rdev,tmp) + rdev_for_each_list(rdev, tmp, pending_raid_disks) if (super_90_load(rdev, rdev0, 0) = 0) { printk(KERN_INFO md: adding %s ...\n, bdevname(rdev-bdev,b)); @@ -3810,7 +3810,7 @@ static void autorun_devices(int part) } else { printk(KERN_INFO md: created %s\n, mdname(mddev)); mddev-persistent = 1; - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + rdev_for_each_list(rdev, tmp, candidates) { list_del_init(rdev-same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -3821,7 +3821,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + rdev_for_each_list(rdev, tmp, candidates) export_rdev(rdev); mddev_put(mddev); } @@ -4936,7 +4936,7 @@ static void status_unused(struct seq_fil seq_printf(seq, unused devices: ); - ITERATE_RDEV_PENDING(rdev,tmp) { + rdev_for_each_list(rdev, tmp, pending_raid_disks) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, %s , diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2008-01-18 11:19:09.0 +1100 +++ ./include/linux/raid/md_k.h 2008-01-18 11:19:24.0 +1100 @@ -313,23 +313,17 @@ static inline char * mdname (mddev_t * m * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. 
*/ -#define ITERATE_RDEV_GENERIC(head,rdev,tmp)\ +#define rdev_for_each_list(rdev, tmp, list)\ \ - for ((tmp) = (head).next; \ + for ((tmp) = (list).next; \ (rdev) = (list_entry((tmp), mdk_rdev_t, same_set)), \ - (tmp) = (tmp)-next, (tmp)-prev != (head) \ + (tmp) = (tmp)-next, (tmp)-prev != (list) \ ; ) /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, tmp, mddev)\ - ITERATE_RDEV_GENERIC((mddev)-disks,rdev,tmp) - -/* - * Iterates through 'pending RAID disks' - */ -#define ITERATE_RDEV_PENDING(rdev,tmp) \ - ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp) + rdev_for_each_list(rdev, tmp, (mddev)-disks) typedef struct mdk_thread_s { void(*run) (mddev_t *mddev); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 4] md: Allow devices to be shared between md arrays.
Currently, a given device is claimed by a particular array so that it cannot be used by other arrays. This is not ideal for DDF and other metadata schemes which have their own partitioning concept. So for externally managed metadata, just claim the device for md in general, require that offset and size are set properly for each device, and make sure that if a device is included in different arrays then the active sections do not overlap. This involves adding another flag to the rdev which makes it awkward to set -flags = 0 to clear certain flags. So now clear flags explicitly by name when we want to clear things. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 88 +++- ./include/linux/raid/md_k.h |2 + 2 files changed, 80 insertions(+), 10 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2008-01-18 11:03:15.0 +1100 +++ ./drivers/md/md.c 2008-01-18 11:18:04.0 +1100 @@ -774,7 +774,11 @@ static int super_90_validate(mddev_t *md __u64 ev1 = md_event(sb); rdev-raid_disk = -1; - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(In_sync, rdev-flags); + clear_bit(WriteMostly, rdev-flags); + clear_bit(BarriersNotsupp, rdev-flags); + if (mddev-raid_disks == 0) { mddev-major_version = 0; mddev-minor_version = sb-minor_version; @@ -1154,7 +1158,11 @@ static int super_1_validate(mddev_t *mdd __u64 ev1 = le64_to_cpu(sb-events); rdev-raid_disk = -1; - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(In_sync, rdev-flags); + clear_bit(WriteMostly, rdev-flags); + clear_bit(BarriersNotsupp, rdev-flags); + if (mddev-raid_disks == 0) { mddev-major_version = 1; mddev-patch_version = 0; @@ -1402,7 +1410,7 @@ static int bind_rdev_to_array(mdk_rdev_t goto fail; } list_add(rdev-same_set, mddev-disks); - bd_claim_by_disk(rdev-bdev, rdev, mddev-gendisk); + bd_claim_by_disk(rdev-bdev, rdev-bdev-bd_holder, mddev-gendisk); return 0; fail: @@ -1442,7 +1450,7 @@ static void unbind_rdev_from_array(mdk_r * 
otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; @@ -1454,13 +1462,15 @@ static int lock_rdev(mdk_rdev_t *rdev, d __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, rdev); + err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); if (err) { printk(KERN_ERR md: could not bd_claim %s.\n, bdevname(bdev, b)); blkdev_put(bdev); return err; } + if (!shared) + set_bit(AllReserved, rdev-flags); rdev-bdev = bdev; return err; } @@ -1925,7 +1935,8 @@ slot_store(mdk_rdev_t *rdev, const char return -ENOSPC; rdev-raid_disk = slot; /* assume it is working */ - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(WriteMostly, rdev-flags); set_bit(In_sync, rdev-flags); } return len; @@ -1950,6 +1961,10 @@ offset_store(mdk_rdev_t *rdev, const cha return -EINVAL; if (rdev-mddev-pers) return -EBUSY; + if (rdev-size rdev-mddev-external) + /* Must set offset before size, so overlap checks +* can be sane */ + return -EBUSY; rdev-data_offset = offset; return len; } @@ -1963,16 +1978,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *p return sprintf(page, %llu\n, (unsigned long long)rdev-size); } +static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +{ + /* check if two start/length pairs overlap */ + if (s1+l1 = s2) + return 0; + if (s2+l2 = s1) + return 0; + return 1; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; unsigned long long size = simple_strtoull(buf, e, 10); + unsigned long long oldsize = rdev-size; if (e==buf || (*e *e != '\n')) return -EINVAL; if (rdev-mddev-pers) return -EBUSY; rdev-size = size; + if (size oldsize rdev-mddev-external) { + /* need to check that all other rdevs with the same -bdev +* do not overlap. We need to unlock the mddev to avoid +* a deadlock. 
We have already changed rdev-size, and if +
[PATCH 000 of 4] md: assorted md patches - please read carefully.
Following are 4 patches for md. The first two replace md-allow-devices-to-be-shared-between-md-arrays.patch which was recently removed. They should go at the same place in the series, between md-allow-a-maximum-extent-to-be-set-for-resyncing.patch and md-lock-address-when-changing-attributes-of-component-devices.patch The third is a replacement for md-change-iterate_rdev_generic-to-rdev_for_each_list-and-remove-iterate_rdev_pending.patch which conflicts with the above change. The last is a fix for md-fix-an-occasional-deadlock-in-raid5.patch which makes me a lot happier about this patch. It introduced a performance regression and I now understand why. I'm now happy for that patch with this fix to go into 2.6.24 if that is convenient (If not, 2.6.24.1 will do). Thanks, NeilBrown [PATCH 001 of 4] md: Set and test the -persistent flag for md devices more consistently. [PATCH 002 of 4] md: Allow devices to be shared between md arrays. [PATCH 003 of 4] md: Change ITERATE_RDEV_GENERIC to rdev_for_each_list, and remove ITERATE_RDEV_PENDING. [PATCH 004 of 4] md: Fix an occasional deadlock in raid5 - FIX - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 4] md: Set and test the -persistent flag for md devices more consistently.
If you try to start an array for which the number of raid disks is listed as zero, md will currently try to read metadata off any devices that have been given. This was done because the value of raid_disks is used to signal whether array details have been provided by userspace (raid_disks 0) or must be read from the devices (raid_disks == 0). However for an array without persistent metadata (or with externally managed metadata) this is the wrong thing to do. So we add a test in do_md_run to give an error if raid_disks is zero for non-persistent arrays. This requires that mddev-persistent is set correctly at this point, which it currently isn't for in-kernel autodetected arrays. So set -persistent for autodetect arrays, and remove the setting in super_*_validate which is now redundant. Also clear -persistent when stopping an array so it is consistently zero when starting an array. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2008-01-18 10:46:49.0 +1100 +++ ./drivers/md/md.c 2008-01-18 11:03:15.0 +1100 @@ -779,7 +779,6 @@ static int super_90_validate(mddev_t *md mddev-major_version = 0; mddev-minor_version = sb-minor_version; mddev-patch_version = sb-patch_version; - mddev-persistent = 1; mddev-external = 0; mddev-chunk_size = sb-chunk_size; mddev-ctime = sb-ctime; @@ -1159,7 +1158,6 @@ static int super_1_validate(mddev_t *mdd if (mddev-raid_disks == 0) { mddev-major_version = 1; mddev-patch_version = 0; - mddev-persistent = 1; mddev-external = 0; mddev-chunk_size = le32_to_cpu(sb-chunksize) 9; mddev-ctime = le64_to_cpu(sb-ctime) ((1ULL 32)-1); @@ -3219,8 +3217,11 @@ static int do_md_run(mddev_t * mddev) /* * Analyze all RAID superblock(s) */ - if (!mddev-raid_disks) + if (!mddev-raid_disks) { + if (!mddev-persistent) + return -EINVAL; analyze_sbs(mddev); + } chunk_size = mddev-chunk_size; @@ -3627,6 
+3628,7 @@ static int do_md_stop(mddev_t * mddev, i mddev-resync_max = MaxSector; mddev-reshape_position = MaxSector; mddev-external = 0; + mddev-persistent = 0; } else if (mddev-pers) printk(KERN_INFO md: %s switched to read-only mode.\n, @@ -3735,6 +3737,7 @@ static void autorun_devices(int part) mddev_unlock(mddev); } else { printk(KERN_INFO md: created %s\n, mdname(mddev)); + mddev-persistent = 1; ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { list_del_init(rdev-same_set); if (bind_rdev_to_array(rdev, mddev)) - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 6] md: various fixes for md
Following are 6 patches for md which are suitable for 2.6.25-rc1. The first fixes a bug which could make it a candidate for 24-final. However it is a deadlock that seems to occur very rarely, and has been in mainline since 2.6.22. So letting it into one more release shouldn't be a big problem. While the fix is fairly simple, it could have some unexpected consequences, so I'd rather go for the next cycle. The second patch fixes a bug which only affects -mm at the moment but will probably affect 2.6.25 unless fixed. The rest are cleanups with no functional change (I hope). Thanks, NeilBrown [PATCH 001 of 6] md: Fix an occasional deadlock in raid5 [PATCH 002 of 6] md: Fix use-after-free bug when dropping an rdev from an md array. [PATCH 003 of 6] md: Change a few 'int' to 'size_t' in md [PATCH 004 of 6] md: Change INTERATE_MDDEV to for_each_mddev [PATCH 005 of 6] md: Change ITERATE_RDEV to rdev_for_each [PATCH 006 of 6] md: Change ITERATE_RDEV_GENERIC to rdev_for_each_list, and remove ITERATE_RDEV_PENDING. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 6] md: Change ITERATE_MDDEV to for_each_mddev
As this is more consistent with kernel style. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2008-01-14 12:24:54.0 +1100 +++ ./drivers/md/md.c 2008-01-14 12:26:04.0 +1100 @@ -195,7 +195,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define ITERATE_MDDEV(mddev,tmp) \ +#define for_each_mddev(mddev,tmp) \ \ for (({ spin_lock(all_mddevs_lock);\ tmp = all_mddevs.next; \ @@ -1596,7 +1596,7 @@ static void md_print_devices(void) printk(md: **\n); printk(md: * COMPLETE RAID STATE PRINTOUT *\n); printk(md: **\n); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { if (mddev-bitmap) bitmap_print_sb(mddev-bitmap); @@ -2014,7 +2014,7 @@ rdev_size_store(mdk_rdev_t *rdev, const struct list_head *tmp, *tmp2; mddev_unlock(rdev-mddev); - ITERATE_MDDEV(mddev, tmp) { + for_each_mddev(mddev, tmp) { mdk_rdev_t *rdev2; mddev_lock(mddev); @@ -5464,7 +5464,7 @@ void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_INTR, mddev-recovery); goto skip; } - ITERATE_MDDEV(mddev2,tmp) { + for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; if (mddev2-curr_resync @@ -5912,7 +5912,7 @@ static int md_notify_reboot(struct notif printk(KERN_INFO md: stopping all md devices.\n); - ITERATE_MDDEV(mddev,tmp) + for_each_mddev(mddev, tmp) if (mddev_trylock(mddev)) { do_md_stop (mddev, 1); mddev_unlock(mddev); @@ -6046,7 +6046,7 @@ static __exit void md_exit(void) unregister_reboot_notifier(md_notifier); unregister_sysctl_table(raid_table_header); remove_proc_entry(mdstat, NULL); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { struct gendisk *disk = mddev-gendisk; if (!disk) continue; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More 
majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 005 of 6] md: Change ITERATE_RDEV to rdev_for_each
as this is morein line with common practice in the kernel. Also swap the args around to be more like list_for_each. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c |4 +- ./drivers/md/faulty.c |2 - ./drivers/md/linear.c |2 - ./drivers/md/md.c | 64 ++-- ./drivers/md/multipath.c|2 - ./drivers/md/raid0.c|8 ++--- ./drivers/md/raid1.c|2 - ./drivers/md/raid10.c |2 - ./drivers/md/raid5.c|6 ++-- ./include/linux/raid/md_k.h |2 - 10 files changed, 47 insertions(+), 47 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2008-01-11 15:01:13.0 +1100 +++ ./drivers/md/bitmap.c 2008-01-14 12:26:10.0 +1100 @@ -231,7 +231,7 @@ static struct page *read_sb_page(mddev_t if (!page) return ERR_PTR(-ENOMEM); - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { if (! test_bit(In_sync, rdev-flags) || test_bit(Faulty, rdev-flags)) continue; @@ -255,7 +255,7 @@ static int write_sb_page(struct bitmap * struct list_head *tmp; mddev_t *mddev = bitmap-mddev; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) if (test_bit(In_sync, rdev-flags) !test_bit(Faulty, rdev-flags)) { int size = PAGE_SIZE; diff .prev/drivers/md/faulty.c ./drivers/md/faulty.c --- .prev/drivers/md/faulty.c 2008-01-11 15:01:13.0 +1100 +++ ./drivers/md/faulty.c 2008-01-14 12:26:10.0 +1100 @@ -294,7 +294,7 @@ static int run(mddev_t *mddev) } conf-nfaults = 0; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) conf-rdev = rdev; mddev-array_size = mddev-size; diff .prev/drivers/md/linear.c ./drivers/md/linear.c --- .prev/drivers/md/linear.c 2008-01-11 15:01:13.0 +1100 +++ ./drivers/md/linear.c 2008-01-14 12:26:10.0 +1100 @@ -122,7 +122,7 @@ static linear_conf_t *linear_conf(mddev_ cnt = 0; conf-array_size = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { int j = rdev-raid_disk; dev_info_t *disk = conf-disks + j; diff .prev/drivers/md/md.c ./drivers/md/md.c --- 
.prev/drivers/md/md.c 2008-01-14 12:26:04.0 +1100 +++ ./drivers/md/md.c 2008-01-14 12:26:10.0 +1100 @@ -311,7 +311,7 @@ static mdk_rdev_t * find_rdev_nr(mddev_t mdk_rdev_t * rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev-desc_nr == nr) return rdev; } @@ -323,7 +323,7 @@ static mdk_rdev_t * find_rdev(mddev_t * struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev-bdev-bd_dev == dev) return rdev; } @@ -944,7 +944,7 @@ static void super_90_sync(mddev_t *mddev sb-state |= (1MD_SB_BITMAP_PRESENT); sb-disks[0].state = (1MD_DISK_REMOVED); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { mdp_disk_t *d; int desc_nr; if (rdev2-raid_disk = 0 test_bit(In_sync, rdev2-flags) @@ -1297,7 +1297,7 @@ static void super_1_sync(mddev_t *mddev, } max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) + rdev_for_each(rdev2, tmp, mddev) if (rdev2-desc_nr+1 max_dev) max_dev = rdev2-desc_nr+1; @@ -1306,7 +1306,7 @@ static void super_1_sync(mddev_t *mddev, for (i=0; imax_dev;i++) sb-dev_roles[i] = cpu_to_le16(0xfffe); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { i = rdev2-desc_nr; if (test_bit(Faulty, rdev2-flags)) sb-dev_roles[i] = cpu_to_le16(0xfffe); @@ -1344,8 +1344,8 @@ static int match_mddev_units(mddev_t *md struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - ITERATE_RDEV(mddev1,rdev,tmp) - ITERATE_RDEV(mddev2, rdev2, tmp2) + rdev_for_each(rdev, tmp, mddev1) + rdev_for_each(rdev2, tmp2, mddev2) if (rdev-bdev-bd_contains == rdev2-bdev-bd_contains) return 1; @@ -1518,7 +1518,7 @@ static void export_array(mddev_t *mddev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (!rdev-mddev) {
[PATCH 006 of 6] md: Change ITERATE_RDEV_GENERIC to rdev_for_each_list, and remove ITERATE_RDEV_PENDING.
Finish ITERATE_ to for_each conversion. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |8 ./include/linux/raid/md_k.h | 14 -- 2 files changed, 8 insertions(+), 14 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2008-01-14 12:26:10.0 +1100 +++ ./drivers/md/md.c 2008-01-14 12:26:15.0 +1100 @@ -3767,7 +3767,7 @@ static void autorun_devices(int part) printk(KERN_INFO md: considering %s ...\n, bdevname(rdev0-bdev,b)); INIT_LIST_HEAD(candidates); - ITERATE_RDEV_PENDING(rdev,tmp) + rdev_for_each_list(rdev, tmp, pending_raid_disks) if (super_90_load(rdev, rdev0, 0) = 0) { printk(KERN_INFO md: adding %s ...\n, bdevname(rdev-bdev,b)); @@ -3810,7 +3810,7 @@ static void autorun_devices(int part) mddev_unlock(mddev); } else { printk(KERN_INFO md: created %s\n, mdname(mddev)); - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + rdev_for_each_list(rdev, tmp, candidates) { list_del_init(rdev-same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -3821,7 +3821,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + rdev_for_each_list(rdev, tmp, candidates) export_rdev(rdev); mddev_put(mddev); } @@ -4936,7 +4936,7 @@ static void status_unused(struct seq_fil seq_printf(seq, unused devices: ); - ITERATE_RDEV_PENDING(rdev,tmp) { + rdev_for_each_list(rdev, tmp, pending_raid_disks) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, %s , diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2008-01-14 12:26:10.0 +1100 +++ ./include/linux/raid/md_k.h 2008-01-14 12:26:15.0 +1100 @@ -313,23 +313,17 @@ static inline char * mdname (mddev_t * m * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. 
*/ -#define ITERATE_RDEV_GENERIC(head,rdev,tmp)\ +#define rdev_for_each_list(rdev, tmp, list)\ \ - for ((tmp) = (head).next; \ + for ((tmp) = (list).next; \ (rdev) = (list_entry((tmp), mdk_rdev_t, same_set)), \ - (tmp) = (tmp)-next, (tmp)-prev != (head) \ + (tmp) = (tmp)-next, (tmp)-prev != (list) \ ; ) /* * iterates through the 'same array disks' ringlist */ #define rdev_for_each(rdev, tmp, mddev)\ - ITERATE_RDEV_GENERIC((mddev)-disks,rdev,tmp) - -/* - * Iterates through 'pending RAID disks' - */ -#define ITERATE_RDEV_PENDING(rdev,tmp) \ - ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp) + rdev_for_each_list(rdev, tmp, (mddev)-disks) typedef struct mdk_thread_s { void(*run) (mddev_t *mddev); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Fix data corruption when a degraded raid5 array is reshaped.
This patch fixes a fairly serious bug in md/raid5 in 2.6.23 and 24-rc. It would be great if it could get into 23.13 and 24.final. Thanks. NeilBrown ### Comments for Changeset We currently do not wait for the block from the missing device to be computed from parity before copying data to the new stripe layout. The change in the raid6 code is not technically needed as we don't delay data block recovery in the same way for raid6 yet. But making the change now is safer long-term. This bug exists in 2.6.23 and 2.6.24-rc Cc: [EMAIL PROTECTED] Cc: Dan Williams [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2008-01-04 09:42:05.0 +1100 +++ ./drivers/md/raid5.c2008-01-04 09:42:27.0 +1100 @@ -2865,7 +2865,7 @@ static void handle_stripe5(struct stripe md_done_sync(conf-mddev, STRIPE_SECTORS, 1); } - if (s.expanding s.locked == 0) + if (s.expanding s.locked == 0 s.req_compute == 0) handle_stripe_expansion(conf, sh, NULL); if (sh-ops.count) @@ -3067,7 +3067,7 @@ static void handle_stripe6(struct stripe md_done_sync(conf-mddev, STRIPE_SECTORS, 1); } - if (s.expanding s.locked == 0) + if (s.expanding s.locked == 0 s.req_compute == 0) handle_stripe_expansion(conf, sh, r6s); spin_unlock(sh-lock); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 7] md: Introduction EXPLAIN PATCH SET HERE
Following are 7 md related patches which are suitable for the next -mm and maybe for 2.6.25. They move towards giving user-space programs more fine control of an array so that we can add support for more complex metadata formats (e.g. DDF) without bothering the kernel with such things. The last patch isn't strictly md related. It adds an ioctl which allows mapping from an open file descriptor on a block device to a name in /sys. This makes finding names of things in /sys more practical. As I put this in block-layer code, I have Cc:ed Jens Axboe. [PATCH 001 of 7] md: Support 'external' metadata for md arrays. [PATCH 002 of 7] md: Give userspace control over removing failed devices when external metadata in use [PATCH 003 of 7] md: Allow a maximum extent to be set for resyncing. [PATCH 004 of 7] md: Allow devices to be shared between md arrays. [PATCH 005 of 7] md: Lock address when changing attributes of component devices. [PATCH 006 of 7] md: Allow an md array to appear with 0 drives if it has external metadata. [PATCH 007 of 7] md: Get name for block device in sysfs - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 7] md: Support 'external' metadata for md arrays.
- Add a state flag 'external' to indicate that the metadata is managed externally (by user-space) so important changes need to be left of user-space to handle. Alternates are non-persistant ('none') where there is no stable metadata - after the array is stopped there is no record of it's status - and internal which can be version 0.90 or version 1.x These are selected by writing to the 'metadata' attribute. - move the updating of superblocks (sync_sbs) to after we have checked if there are any superblocks or not. - New array state 'write_pending'. This means that the metadata records the array as 'clean', but a write has been requested, so the metadata has to be updated to record a 'dirty' array before the write can continue. This change is reported to md by writing 'active' to the array_state attribute. - tidy up marking of sb_dirty: - don't set sb_dirty when resync finishes as md_check_recovery calls md_update_sb when the sync thread finishes anyway. - Don't set sb_dirty in multipath_run as the array might not be dirty. - don't mark superblock dirty when switching to 'clean' if there is no internal superblock (if external, userspace can choose to update the superblock whenever it chooses to). Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 77 +--- ./include/linux/raid/md_k.h |3 + 2 files changed, 61 insertions(+), 19 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:07:51.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:08:28.0 +1100 @@ -778,7 +778,8 @@ static int super_90_validate(mddev_t *md mddev-major_version = 0; mddev-minor_version = sb-minor_version; mddev-patch_version = sb-patch_version; - mddev-persistent = ! 
sb-not_persistent; + mddev-persistent = 1; + mddev-external = 0; mddev-chunk_size = sb-chunk_size; mddev-ctime = sb-ctime; mddev-utime = sb-utime; @@ -904,7 +905,7 @@ static void super_90_sync(mddev_t *mddev sb-size = mddev-size; sb-raid_disks = mddev-raid_disks; sb-md_minor = mddev-md_minor; - sb-not_persistent = !mddev-persistent; + sb-not_persistent = 0; sb-utime = mddev-utime; sb-state = 0; sb-events_hi = (mddev-events32); @@ -1158,6 +1159,7 @@ static int super_1_validate(mddev_t *mdd mddev-major_version = 1; mddev-patch_version = 0; mddev-persistent = 1; + mddev-external = 0; mddev-chunk_size = le32_to_cpu(sb-chunksize) 9; mddev-ctime = le64_to_cpu(sb-ctime) ((1ULL 32)-1); mddev-utime = le64_to_cpu(sb-utime) ((1ULL 32)-1); @@ -1699,18 +1701,20 @@ repeat: MD_BUG(); mddev-events --; } - sync_sbs(mddev, nospares); /* * do not write anything to disk if using * nonpersistent superblocks */ if (!mddev-persistent) { - clear_bit(MD_CHANGE_PENDING, mddev-flags); + if (!mddev-external) + clear_bit(MD_CHANGE_PENDING, mddev-flags); + spin_unlock_irq(mddev-write_lock); wake_up(mddev-sb_wait); return; } + sync_sbs(mddev, nospares); spin_unlock_irq(mddev-write_lock); dprintk(KERN_INFO @@ -2430,6 +2434,8 @@ array_state_show(mddev_t *mddev, char *p case 0: if (mddev-in_sync) st = clean; + else if (test_bit(MD_CHANGE_CLEAN, mddev-flags)) + st = write_pending; else if (mddev-safemode) st = active_idle; else @@ -2460,11 +2466,9 @@ array_state_store(mddev_t *mddev, const break; case clear: /* stopping an active array */ - if (mddev-pers) { - if (atomic_read(mddev-active) 1) - return -EBUSY; - err = do_md_stop(mddev, 0); - } + if (atomic_read(mddev-active) 1) + return -EBUSY; + err = do_md_stop(mddev, 0); break; case inactive: /* stopping an active array */ @@ -2472,7 +2476,8 @@ array_state_store(mddev_t *mddev, const if (atomic_read(mddev-active) 1) return -EBUSY; err = do_md_stop(mddev, 2); - } + } else + err = 0; /* already inactive */ break; case suspended:
[PATCH 002 of 7] md: Give userspace control over removing failed devices when external metadata in use
When a device fails, we must not allow an further writes to the array until the device failure has been recorded in array metadata. When metadata is managed externally, this requires some synchronisation... Allow/require userspace to explicitly remove failed devices from active service in the array by writing 'none' to the 'slot' attribute. If this reduces the number of failed devices to 0, the write block will automatically be lowered. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 43 ++- 1 file changed, 34 insertions(+), 9 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:08:28.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:08:52.0 +1100 @@ -1894,20 +1894,44 @@ static ssize_t slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; + int err; + char nm[20]; int slot = simple_strtoul(buf, e, 10); if (strncmp(buf, none, 4)==0) slot = -1; else if (e==buf || (*e *e!= '\n')) return -EINVAL; - if (rdev-mddev-pers) - /* Cannot set slot in active array (yet) */ - return -EBUSY; - if (slot = rdev-mddev-raid_disks) - return -ENOSPC; - rdev-raid_disk = slot; - /* assume it is working */ - rdev-flags = 0; - set_bit(In_sync, rdev-flags); + if (rdev-mddev-pers) { + /* Setting 'slot' on an active array requires also +* updating the 'rd%d' link, and communicating +* with the personality with -hot_*_disk. +* For now we only support removing +* failed/spare devices. This normally happens automatically, +* but not when the metadata is externally managed. 
+*/ + if (slot != -1) + return -EBUSY; + if (rdev-raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev-mddev-pers-hot_add_disk == NULL) + return -EINVAL; + err = rdev-mddev-pers- + hot_remove_disk(rdev-mddev, rdev-raid_disk); + if (err) + return err; + sprintf(nm, rd%d, rdev-raid_disk); + sysfs_remove_link(rdev-mddev-kobj, nm); + set_bit(MD_RECOVERY_NEEDED, rdev-mddev-recovery); + md_wakeup_thread(rdev-mddev-thread); + } else { + if (slot = rdev-mddev-raid_disks) + return -ENOSPC; + rdev-raid_disk = slot; + /* assume it is working */ + rdev-flags = 0; + set_bit(In_sync, rdev-flags); + } return len; } @@ -5551,6 +5575,7 @@ static int remove_and_add_spares(mddev_t ITERATE_RDEV(mddev,rdev,rtmp) if (rdev-raid_disk = 0 + !mddev-external (test_bit(Faulty, rdev-flags) || ! test_bit(In_sync, rdev-flags)) atomic_read(rdev-nr_pending)==0) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 7] md: Allow a maximum extent to be set for resyncing.
This allows userspace to control resync/reshape progress and synchronise it with other activities, such as shared access in a SAN, or backing up critical sections during a tricky reshape. Writing a number of sectors (which must be a multiple of the chunk size if such is meaningful) causes a resync to pause when it gets to that point. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./Documentation/md.txt | 10 + ./drivers/md/md.c | 75 ++-- ./drivers/md/raid1.c|2 + ./drivers/md/raid10.c |3 + ./drivers/md/raid5.c| 25 ++ ./include/linux/raid/md_k.h |2 + 6 files changed, 107 insertions(+), 10 deletions(-) diff .prev/Documentation/md.txt ./Documentation/md.txt --- .prev/Documentation/md.txt 2007-12-14 16:07:50.0 +1100 +++ ./Documentation/md.txt 2007-12-14 16:08:57.0 +1100 @@ -416,6 +416,16 @@ also have sectors in total that could need to be processed. The two numbers are separated by a '/' thus effectively showing one value, a fraction of the process that is complete. + A 'select' on this attribute will return when resync completes, + when it reaches the current sync_max (below) and possibly at + other times. + + sync_max + This is a number of sectors at which point a resync/recovery + process will pause. When a resync is active, the value can + only ever be increased, never decreased. The value of 'max' + effectively disables the limit. 
+ sync_speed This shows the current actual speed, in K/sec, of the current diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:08:52.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:08:57.0 +1100 @@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit) spin_lock_init(new-write_lock); init_waitqueue_head(new-sb_wait); new-reshape_position = MaxSector; + new-resync_max = MaxSector; new-queue = blk_alloc_queue(GFP_KERNEL); if (!new-queue) { @@ -2926,6 +2927,43 @@ sync_completed_show(mddev_t *mddev, char static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t +max_sync_show(mddev_t *mddev, char *page) +{ + if (mddev-resync_max == MaxSector) + return sprintf(page, max\n); + else + return sprintf(page, %llu\n, + (unsigned long long)mddev-resync_max); +} +static ssize_t +max_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (strncmp(buf, max, 3) == 0) + mddev-resync_max = MaxSector; + else { + char *ep; + unsigned long long max = simple_strtoull(buf, ep, 10); + if (ep == buf || (*ep != 0 *ep != '\n')) + return -EINVAL; + if (max mddev-resync_max + test_bit(MD_RECOVERY_RUNNING, mddev-recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev-chunk_size) { + if (max (sector_t)((mddev-chunk_size9)-1)) + return -EINVAL; + } + mddev-resync_max = max; + } + wake_up(mddev-recovery_wait); + return len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t suspend_lo_show(mddev_t *mddev, char *page) { return sprintf(page, %llu\n, (unsigned long long)mddev-suspend_lo); @@ -3035,6 +3073,7 @@ static struct attribute *md_redundancy_a md_sync_max.attr, md_sync_speed.attr, md_sync_completed.attr, + md_max_sync.attr, md_suspend_lo.attr, md_suspend_hi.attr, md_bitmap.attr, @@ -3582,6 +3621,7 @@ static int do_md_stop(mddev_t * mddev, i mddev-size = 0; mddev-raid_disks = 0; mddev-recovery_cp = 0; + 
mddev-resync_max = MaxSector; mddev-reshape_position = MaxSector; mddev-external = 0; @@ -5445,8 +5485,16 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; + if (j = mddev-resync_max) { + sysfs_notify(mddev-kobj, NULL, sync_completed); + wait_event(mddev-recovery_wait, + mddev-resync_max j + || kthread_should_stop()); + } + if (kthread_should_stop()) + goto interrupted; sectors = mddev-pers-sync_request(mddev, j, skipped, - currspeed speed_min(mddev)); + currspeed speed_min(mddev)); if (sectors ==
[PATCH 004 of 7] md: Allow devices to be shared between md arrays.
Currently, a given device is claimed by a particular array so that it cannot be used by other arrays. This is not ideal for DDF and other metadata schemes which have their own partitioning concept. So for externally managed metadata, just claim the device for md in general, require that offset and size are set properly for each device, and make sure that if a device is included in different arrays then the active sections do not overlap. This involves adding another flag to the rdev which makes it awkward to set -flags = 0 to clear certain flags. So now clear flags explicitly by name when we want to clear things. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 93 ++-- ./include/linux/raid/md_k.h |2 2 files changed, 84 insertions(+), 11 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:08:57.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:09:01.0 +1100 @@ -774,7 +774,11 @@ static int super_90_validate(mddev_t *md __u64 ev1 = md_event(sb); rdev-raid_disk = -1; - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(In_sync, rdev-flags); + clear_bit(WriteMostly, rdev-flags); + clear_bit(BarriersNotsupp, rdev-flags); + if (mddev-raid_disks == 0) { mddev-major_version = 0; mddev-minor_version = sb-minor_version; @@ -1155,7 +1159,11 @@ static int super_1_validate(mddev_t *mdd __u64 ev1 = le64_to_cpu(sb-events); rdev-raid_disk = -1; - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(In_sync, rdev-flags); + clear_bit(WriteMostly, rdev-flags); + clear_bit(BarriersNotsupp, rdev-flags); + if (mddev-raid_disks == 0) { mddev-major_version = 1; mddev-patch_version = 0; @@ -1407,7 +1415,7 @@ static int bind_rdev_to_array(mdk_rdev_t goto fail; } list_add(rdev-same_set, mddev-disks); - bd_claim_by_disk(rdev-bdev, rdev, mddev-gendisk); + bd_claim_by_disk(rdev-bdev, rdev-bdev-bd_holder, mddev-gendisk); return 0; fail: @@ -1447,7 +1455,7 @@ static void unbind_rdev_from_array(mdk_r * 
otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; @@ -1459,13 +1467,15 @@ static int lock_rdev(mdk_rdev_t *rdev, d __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, rdev); + err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); if (err) { printk(KERN_ERR md: could not bd_claim %s.\n, bdevname(bdev, b)); blkdev_put(bdev); return err; } + if (!shared) + set_bit(AllReserved, rdev-flags); rdev-bdev = bdev; return err; } @@ -1930,7 +1940,8 @@ slot_store(mdk_rdev_t *rdev, const char return -ENOSPC; rdev-raid_disk = slot; /* assume it is working */ - rdev-flags = 0; + clear_bit(Faulty, rdev-flags); + clear_bit(WriteMostly, rdev-flags); set_bit(In_sync, rdev-flags); } return len; @@ -1955,6 +1966,10 @@ offset_store(mdk_rdev_t *rdev, const cha return -EINVAL; if (rdev-mddev-pers) return -EBUSY; + if (rdev-size rdev-mddev-external) + /* Must set offset before size, so overlap checks +* can be sane */ + return -EBUSY; rdev-data_offset = offset; return len; } @@ -1968,16 +1983,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *p return sprintf(page, %llu\n, (unsigned long long)rdev-size); } +static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +{ + /* check if two start/length pairs overlap */ + if (s1+l1 = s2) + return 0; + if (s2+l2 = s1) + return 0; + return 1; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; unsigned long long size = simple_strtoull(buf, e, 10); + unsigned long long oldsize = rdev-size; if (e==buf || (*e *e != '\n')) return -EINVAL; if (rdev-mddev-pers) return -EBUSY; rdev-size = size; + if (size oldsize rdev-mddev-external) { + /* need to check that all other rdevs with the same -bdev +* do not overlap. We need to unlock the mddev to avoid +* a deadlock. 
We have already changed rdev-size, and if +
[PATCH 005 of 7] md: Lock address when changing attributes of component devices.
Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:09:01.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:09:03.0 +1100 @@ -2080,12 +2080,18 @@ rdev_attr_store(struct kobject *kobj, st { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + int rv; if (!entry-store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - return entry-store(rdev, page, length); + rv = mddev_lock(rdev-mddev); + if (!rv) { + rv = entry-store(rdev, page, length); + mddev_unlock(rdev-mddev); + } + return rv; } static void rdev_free(struct kobject *ko) - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 006 of 7] md: Allow an md array to appear with 0 drives if it has external metadata.
Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-12-14 16:09:03.0 +1100 +++ ./drivers/md/md.c 2007-12-14 16:09:09.0 +1100 @@ -4650,9 +4650,10 @@ static int md_ioctl(struct inode *inode, */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ - if (!mddev-raid_disks cmd != ADD_NEW_DISK cmd != STOP_ARRAY -cmd != RUN_ARRAY cmd != SET_BITMAP_FILE -cmd != GET_BITMAP_FILE) { + if ((!mddev-raid_disks !mddev-external) +cmd != ADD_NEW_DISK cmd != STOP_ARRAY +cmd != RUN_ARRAY cmd != SET_BITMAP_FILE +cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 007 of 7] md: Get name for block device in sysfs
Given an fd on a block device, returns a string like /block/sda/sda1 which can be used to find related information in /sys. Ideally we should have an ioctl that works on char devices as well, but that seems far from trivial, so it seems reasonable to have this until the later can be implemented. Cc: Jens Axboe [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./block/ioctl.c | 13 + ./include/linux/fs.h |2 ++ 2 files changed, 15 insertions(+) diff .prev/block/ioctl.c ./block/ioctl.c --- .prev/block/ioctl.c 2007-12-14 17:18:50.0 +1100 +++ ./block/ioctl.c 2007-12-14 16:15:41.0 +1100 @@ -227,8 +227,21 @@ int blkdev_ioctl(struct inode *inode, st struct block_device *bdev = inode-i_bdev; struct gendisk *disk = bdev-bd_disk; int ret, n; + char b[BDEVNAME_SIZE*2 + 10]; switch(cmd) { + case BLKGETNAME: + strcpy(b, /block/); + bdevname(bdev-bd_contains, b+7); + if (bdev-bd_contains != bdev) { + char *e = b + strlen(b); + *e++ = '/'; + bdevname(bdev, e); + } + if (copy_to_user((char __user *)arg, b, strlen(b)+1)) + return -EFAULT; + return 0; + case BLKFLSBUF: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff .prev/include/linux/fs.h ./include/linux/fs.h --- .prev/include/linux/fs.h2007-12-14 17:18:50.0 +1100 +++ ./include/linux/fs.h2007-12-14 16:13:03.0 +1100 @@ -218,6 +218,8 @@ extern int dir_notify_enable; #define BLKTRACESTOP _IO(0x12,117) #define BLKTRACETEARDOWN _IO(0x12,118) +#define BLKGETNAME _IOR(0x12, 119, char [1024]) + #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP_IO(0x00,1) /* bmap access */ #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 3] md: raid6: Fix mktables.c
From: H. Peter Anvin [EMAIL PROTECTED] Make both mktables.c and its output CodingStyle compliant. Update the copyright notice. Signed-off-by: H. Peter Anvin [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/mktables.c | 43 +-- 1 file changed, 17 insertions(+), 26 deletions(-) diff .prev/drivers/md/mktables.c ./drivers/md/mktables.c --- .prev/drivers/md/mktables.c 2007-12-03 14:47:09.0 +1100 +++ ./drivers/md/mktables.c 2007-12-03 14:56:06.0 +1100 @@ -1,13 +1,10 @@ -#ident $Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $ -/* --- * +/* -*- linux-c -*- --- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. * * --- */ @@ -73,8 +70,8 @@ int main(int argc, char *argv[]) for (j = 0; j 256; j += 8) { printf(\t\t); for (k = 0; k 8; k++) - printf(0x%02x, , gfmul(i, j+k)); - printf(\n); + printf(0x%02x,%c, gfmul(i, j + k), + (k == 7) ? '\n' : ' '); } printf(\t},\n); } @@ -83,47 +80,41 @@ int main(int argc, char *argv[]) /* Compute power-of-2 table (exponent) */ v = 1; printf(\nconst u8 __attribute__((aligned(256)))\n - raid6_gfexp[256] =\n - {\n); + raid6_gfexp[256] =\n {\n); for (i = 0; i 256; i += 8) { printf(\t); for (j = 0; j 8; j++) { - exptbl[i+j] = v; - printf(0x%02x, , v); + exptbl[i + j] = v; + printf(0x%02x,%c, v, (j == 7) ? 
'\n' : ' '); v = gfmul(v, 2); if (v == 1) v = 0; /* For entry 255, not a real entry */ } - printf(\n); } printf(};\n); /* Compute inverse table x^-1 == x^254 */ printf(\nconst u8 __attribute__((aligned(256)))\n - raid6_gfinv[256] =\n - {\n); + raid6_gfinv[256] =\n {\n); for (i = 0; i 256; i += 8) { printf(\t); for (j = 0; j 8; j++) { - v = gfpow(i+j, 254); - invtbl[i+j] = v; - printf(0x%02x, , v); + invtbl[i + j] = v = gfpow(i + j, 254); + printf(0x%02x,%c, v, (j == 7) ? '\n' : ' '); } - printf(\n); } printf(};\n); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ printf(\nconst u8 __attribute__((aligned(256)))\n - raid6_gfexi[256] =\n - {\n); + raid6_gfexi[256] =\n {\n); for (i = 0; i 256; i += 8) { printf(\t); for (j = 0; j 8; j++) - printf(0x%02x, , invtbl[exptbl[i+j]^1]); - printf(\n); + printf(0x%02x,%c, invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); } - printf(};\n\n); + printf(};\n); return 0; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 3] md: a few little patches
Following 3 patches for md provide some code tidyup and a small functionality improvement. They do not need to go into 2.6.24 but are definitely appropriate for 2.6.25-rc1. (Patches made against 2.6.24-rc3-mm2) Thanks, NeilBrown [PATCH 001 of 3] md: raid6: Fix mktables.c [PATCH 002 of 3] md: raid6: clean up the style of raid6test/test.c [PATCH 003 of 3] md: Update md bitmap during resync. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 3] md: raid6: clean up the style of raid6test/test.c
From: H. Peter Anvin [EMAIL PROTECTED] Date: Fri, 26 Oct 2007 11:22:42 -0700 Clean up the coding style in raid6test/test.c. Break it apart into subfunctions to make the code more readable. Signed-off-by: H. Peter Anvin [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid6test/test.c | 115 -- 1 file changed, 68 insertions(+), 47 deletions(-) diff .prev/drivers/md/raid6test/test.c ./drivers/md/raid6test/test.c --- .prev/drivers/md/raid6test/test.c 2007-12-03 14:57:55.0 +1100 +++ ./drivers/md/raid6test/test.c 2007-12-03 14:57:55.0 +1100 @@ -1,12 +1,10 @@ /* -*- linux-c -*- --- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. 
* * --- */ @@ -30,67 +28,87 @@ char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE]; char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; -void makedata(void) +static void makedata(void) { int i, j; - for ( i = 0 ; i NDISKS ; i++ ) { - for ( j = 0 ; j PAGE_SIZE ; j++ ) { + for (i = 0; i NDISKS; i++) { + for (j = 0; j PAGE_SIZE; j++) data[i][j] = rand(); - } + dataptrs[i] = data[i]; } } +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i NDISKS-2 j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf(algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n, + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra !errb) ? OK : + !erra ? ERRB : + !errb ? 
ERRA : ERRAB); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + int main(int argc, char *argv[]) { - const struct raid6_calls * const * algo; + const struct raid6_calls *const *algo; int i, j; - int erra, errb; + int err = 0; makedata(); - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)-valid || (*algo)-valid() ) { + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)-valid || (*algo)-valid()) { raid6_call = **algo; /* Nuke syndromes */ memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)dataptrs); + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)dataptrs); - for ( i = 0 ; i NDISKS-1 ; i++ ) { - for ( j = i+1 ; j NDISKS ; j++ ) { - memset(recovi, 0xf0, PAGE_SIZE); - memset(recovj, 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)dataptrs); - - erra = memcmp(data[i], recovi, PAGE_SIZE); - errb = memcmp(data[j], recovj, PAGE_SIZE); - - if ( i NDISKS-2 j == NDISKS-1 ) { -
[PATCH 003 of 3] md: Update md bitmap during resync.
Currently an md array with a write-intent bitmap does not update that bitmap to reflect successful partial resync. Rather the entire bitmap is updated when the resync completes. This is because there is no guarantee that resync requests will complete in order, and tracking each request individually is unnecessarily burdensome. However there is value in regularly updating the bitmap, so add code to periodically pause while all pending sync requests complete, then update the bitmap. Doing this only every few seconds (the same as the bitmap update time) does not noticeably affect resync performance. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c | 34 +- ./drivers/md/raid1.c |1 + ./drivers/md/raid10.c |2 ++ ./drivers/md/raid5.c |3 +++ ./include/linux/raid/bitmap.h |3 +++ 5 files changed, 38 insertions(+), 5 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-12-03 14:58:48.0 +1100 +++ ./drivers/md/bitmap.c 2007-12-03 14:59:00.0 +1100 @@ -1342,14 +1342,38 @@ void bitmap_close_sync(struct bitmap *bi */ sector_t sector = 0; int blocks; - if (!bitmap) return; + if (!bitmap) + return; while (sector bitmap-mddev-resync_max_sectors) { bitmap_end_sync(bitmap, sector, blocks, 0); -/* - if (sector 500) printk(bitmap_close_sync: sec %llu blks %d\n, -(unsigned long long)sector, blocks); -*/ sector += blocks; + sector += blocks; + } +} + +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +{ + sector_t s = 0; + int blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap-last_end_sync = jiffies; + return; + } + if (time_before(jiffies, (bitmap-last_end_sync + + bitmap-daemon_sleep * HZ))) + return; + wait_event(bitmap-mddev-recovery_wait, + atomic_read(bitmap-mddev-recovery_active) == 0); + + sector = ~((1ULL CHUNK_BLOCK_SHIFT(bitmap)) - 1); + s = 0; + while (s sector s bitmap-mddev-resync_max_sectors) { + bitmap_end_sync(bitmap, s, blocks, 0); + s += blocks; } + 
bitmap-last_end_sync = jiffies; } static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c --- .prev/drivers/md/raid10.c 2007-12-03 14:58:48.0 +1100 +++ ./drivers/md/raid10.c 2007-12-03 14:58:10.0 +1100 @@ -1670,6 +1670,8 @@ static sector_t sync_request(mddev_t *md if (!go_faster conf-nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev-bitmap, sector_nr); + /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-12-03 14:58:48.0 +1100 +++ ./drivers/md/raid1.c2007-12-03 14:58:10.0 +1100 @@ -1684,6 +1684,7 @@ static sector_t sync_request(mddev_t *md if (!go_faster conf-nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev-bitmap, sector_nr); raise_barrier(conf); conf-next_resync = sector_nr; diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-12-03 14:58:48.0 +1100 +++ ./drivers/md/raid5.c2007-12-03 14:58:10.0 +1100 @@ -4333,6 +4333,9 @@ static inline sector_t sync_request(mdde return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } + + bitmap_cond_end_sync(mddev-bitmap, sector_nr); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = wait_for_inactive_cache(conf, sector_nr, raid_disks, pd_idx); diff .prev/include/linux/raid/bitmap.h ./include/linux/raid/bitmap.h --- .prev/include/linux/raid/bitmap.h 2007-12-03 14:58:48.0 +1100 +++ ./include/linux/raid/bitmap.h 2007-12-03 14:58:10.0 +1100 @@ -244,6 +244,8 @@ struct bitmap { */ unsigned long daemon_lastrun; /* jiffies of last run */ unsigned long daemon_sleep; /* how many seconds between updates? 
*/ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; @@ -275,6 +277,7 @@ void bitmap_endwrite(struct bitmap *bitm int bitmap_start_sync(struct bitmap *bitmap,
[PATCH] md: Fix misapplied patch in raid5.c
commit 4ae3f847e49e3787eca91bced31f8fd328d50496 did not get applied correctly, presumably due to substantial similarities between handle_stripe5 and handle_stripe6. This patch (with lots of context) moves the chunk of new code from handle_stripe6 (where it isn't needed (yet)) to handle_stripe5. Signed-off-by: Neil Brown [EMAIL PROTECTED] cc: Dan Williams [EMAIL PROTECTED] --- The patch is correctly applied in -mm. The same patch was sent to stable@ but doesn't seem to have made it yet. When it does get applied, we should make sure it gets applied properly... ### Diffstat output ./drivers/md/raid5.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-11-02 12:10:49.0 +1100 +++ ./drivers/md/raid5.c2007-11-02 12:25:31.0 +1100 @@ -2607,40 +2607,47 @@ static void handle_stripe5(struct stripe struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; unsigned long pending = 0; memset(s, 0, sizeof(s)); pr_debug(handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d ops=%lx:%lx:%lx\n, (unsigned long long)sh-sector, sh-state, atomic_read(sh-count), sh-pd_idx, sh-ops.pending, sh-ops.ack, sh-ops.complete); spin_lock(sh-lock); clear_bit(STRIPE_HANDLE, sh-state); clear_bit(STRIPE_DELAYED, sh-state); s.syncing = test_bit(STRIPE_SYNCING, sh-state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, sh-state); s.expanded = test_bit(STRIPE_EXPAND_READY, sh-state); /* Now to look around and see what can be done */ + /* clean-up completed biofill operations */ + if (test_bit(STRIPE_OP_BIOFILL, sh-ops.complete)) { + clear_bit(STRIPE_OP_BIOFILL, sh-ops.pending); + clear_bit(STRIPE_OP_BIOFILL, sh-ops.ack); + clear_bit(STRIPE_OP_BIOFILL, sh-ops.complete); + } + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; struct r5dev *dev = sh-dev[i]; clear_bit(R5_Insync, dev-flags); pr_debug(check %d: state 0x%lx toread %p read %p write %p written %p\n, i, dev-flags, dev-toread, dev-read, 
dev-towrite, dev-written); /* maybe we can request a biofill operation * * new wantfill requests are only permitted while * STRIPE_OP_BIOFILL is clear */ if (test_bit(R5_UPTODATE, dev-flags) dev-toread !test_bit(STRIPE_OP_BIOFILL, sh-ops.pending)) set_bit(R5_Wantfill, dev-flags); /* now count some things */ @@ -2880,47 +2887,40 @@ static void handle_stripe6(struct stripe struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; r6s.qd_idx = raid6_next_disk(pd_idx, disks); pr_debug(handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n, (unsigned long long)sh-sector, sh-state, atomic_read(sh-count), pd_idx, r6s.qd_idx); memset(s, 0, sizeof(s)); spin_lock(sh-lock); clear_bit(STRIPE_HANDLE, sh-state); clear_bit(STRIPE_DELAYED, sh-state); s.syncing = test_bit(STRIPE_SYNCING, sh-state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, sh-state); s.expanded = test_bit(STRIPE_EXPAND_READY, sh-state); /* Now to look around and see what can be done */ - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, sh-ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, sh-ops.pending); - clear_bit(STRIPE_OP_BIOFILL, sh-ops.ack); - clear_bit(STRIPE_OP_BIOFILL, sh-ops.complete); - } - rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = sh-dev[i]; clear_bit(R5_Insync, dev-flags); pr_debug(check %d: state 0x%lx read %p write %p written %p\n, i, dev-flags, dev-toread, dev-towrite, dev-written); /* maybe we can reply to a read */ if (test_bit(R5_UPTODATE, dev-flags) dev-toread) { struct bio *rbi, *rbi2; pr_debug(Return read for disc %d\n, i); spin_lock_irq(conf-device_lock); rbi = dev-toread; dev-toread = NULL; if (test_and_clear_bit(R5_Overlap, dev-flags)) wake_up(conf-wait_for_overlap); spin_unlock_irq(conf-device_lock); while (rbi rbi-bi_sector dev-sector + STRIPE_SECTORS) {
[PATCH 000 of 2] md: Fixes for md in 2.6.23
It appears that a couple of bugs slipped into md for 2.6.23. These two patches fix them and are appropriate for 2.6.23.y as well as 2.6.24-rcX. Thanks, NeilBrown [PATCH 001 of 2] md: Fix an unsigned compare to allow creation of bitmaps with v1.0 metadata. [PATCH 002 of 2] md: raid5: fix clearing of biofill operations - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 2] md: Fix an unsigned compare to allow creation of bitmaps with v1.0 metadata.
As page->index is unsigned, this all becomes an unsigned comparison, which almost always returns an error. Signed-off-by: Neil Brown [EMAIL PROTECTED] Cc: Stable [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-10-22 16:55:48.0 +1000 +++ ./drivers/md/bitmap.c 2007-10-22 16:55:52.0 +1000 @@ -274,7 +274,7 @@ static int write_sb_page(struct bitmap * if (bitmap-offset 0) { /* DATA BITMAP METADATA */ if (bitmap-offset - + page-index * (PAGE_SIZE/512) + + (long)(page-index * (PAGE_SIZE/512)) + size/512 0) /* bitmap runs in to metadata */ return -EINVAL; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 2] md: raid5: fix clearing of biofill operations
From: Dan Williams [EMAIL PROTECTED] ops_complete_biofill() runs outside of spin_lock(sh-lock) and clears the 'pending' and 'ack' bits. Since the test_and_ack_op() macro only checks against 'complete' it can get an inconsistent snapshot of pending work. Move the clearing of these bits to handle_stripe5(), under the lock. Signed-off-by: Dan Williams [EMAIL PROTECTED] Tested-by: Joël Bertrand [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] Cc: Stable [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 17 ++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-10-22 16:55:49.0 +1000 +++ ./drivers/md/raid5.c2007-10-22 16:57:41.0 +1000 @@ -665,7 +665,12 @@ static unsigned long get_stripe_work(str ack++; sh-ops.count -= ack; - BUG_ON(sh-ops.count 0); + if (unlikely(sh-ops.count 0)) { + printk(KERN_ERR pending: %#lx ops.pending: %#lx ops.ack: %#lx + ops.complete: %#lx\n, pending, sh-ops.pending, + sh-ops.ack, sh-ops.complete); + BUG(); + } return pending; } @@ -842,8 +847,7 @@ static void ops_complete_biofill(void *s } } } - clear_bit(STRIPE_OP_BIOFILL, sh-ops.ack); - clear_bit(STRIPE_OP_BIOFILL, sh-ops.pending); + set_bit(STRIPE_OP_BIOFILL, sh-ops.complete); return_io(return_bi); @@ -3130,6 +3134,13 @@ static void handle_stripe5(struct stripe s.expanded = test_bit(STRIPE_EXPAND_READY, sh-state); /* Now to look around and see what can be done */ + /* clean-up completed biofill operations */ + if (test_bit(STRIPE_OP_BIOFILL, sh-ops.complete)) { + clear_bit(STRIPE_OP_BIOFILL, sh-ops.pending); + clear_bit(STRIPE_OP_BIOFILL, sh-ops.ack); + clear_bit(STRIPE_OP_BIOFILL, sh-ops.complete); + } + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 5] md: Fix a bug in some never-used code.
http://bugzilla.kernel.org/show_bug.cgi?id=3277 There is a seq_printf here that isn't being passed a 'seq'. Howeve as the code is inside #ifdef MD_DEBUG, nobody noticed. Also remove some extra spaces. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid0.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff .prev/drivers/md/raid0.c ./drivers/md/raid0.c --- .prev/drivers/md/raid0.c2007-10-15 14:05:58.0 +1000 +++ ./drivers/md/raid0.c2007-10-15 14:06:05.0 +1000 @@ -472,7 +472,7 @@ bad_map: bio_io_error(bio); return 0; } - + static void raid0_status (struct seq_file *seq, mddev_t *mddev) { #undef MD_DEBUG @@ -480,18 +480,18 @@ static void raid0_status (struct seq_fil int j, k, h; char b[BDEVNAME_SIZE]; raid0_conf_t *conf = mddev_to_conf(mddev); - + h = 0; for (j = 0; j conf-nr_strip_zones; j++) { seq_printf(seq, z%d, j); if (conf-hash_table[h] == conf-strip_zone+j) - seq_printf((h%d), h++); + seq_printf(seq, (h%d), h++); seq_printf(seq, =[); for (k = 0; k conf-strip_zone[j].nb_dev; k++) - seq_printf (seq, %s/, bdevname( + seq_printf(seq, %s/, bdevname( conf-strip_zone[j].dev[k]-bdev,b)); - seq_printf (seq, ] zo=%d do=%d s=%d\n, + seq_printf(seq, ] zo=%d do=%d s=%d\n, conf-strip_zone[j].zone_offset, conf-strip_zone[j].dev_offset, conf-strip_zone[j].size); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 5] md: 'sync_action' in sysfs returns wrong value for readonly arrays
When an array is started read-only, MD_RECOVERY_NEEDED can be set but no recovery will be running. This causes 'sync_action' to report the wrong value. We could remove the test for MD_RECOVERY_NEEDED, but doing so would leave a small gap after requesting a sync action, where 'sync_action' would still report the old value. So make sure that for a read-only array, 'sync_action' always returns 'idle'. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-10-15 14:06:32.0 +1000 +++ ./drivers/md/md.c 2007-10-15 14:06:32.0 +1000 @@ -2714,7 +2714,7 @@ action_show(mddev_t *mddev, char *page) { char *type = idle; if (test_bit(MD_RECOVERY_RUNNING, mddev-recovery) || - test_bit(MD_RECOVERY_NEEDED, mddev-recovery)) { + (!mddev-ro test_bit(MD_RECOVERY_NEEDED, mddev-recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, mddev-recovery)) type = reshape; else if (test_bit(MD_RECOVERY_SYNC, mddev-recovery)) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 5] md: Expose the degraded status of an assembled array through sysfs
From: Iustin Pop [EMAIL PROTECTED] The 'degraded' attribute is useful to quickly determine if the array is degraded, instead of parsing 'mdadm -D' output or relying on the other techniques (number of working devices against number of defined devices, etc.). The md code already keeps track of this attribute, so it's useful to export it. Signed-off-by: Iustin Pop [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |7 +++ 1 file changed, 7 insertions(+) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-10-15 14:06:32.0 +1000 +++ ./drivers/md/md.c 2007-10-15 14:06:52.0 +1000 @@ -2833,6 +2833,12 @@ sync_max_store(mddev_t *mddev, const cha static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); +static ssize_t +degraded_show(mddev_t *mddev, char *page) +{ + return sprintf(page, %d\n, mddev-degraded); +} +static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t sync_speed_show(mddev_t *mddev, char *page) @@ -2976,6 +2982,7 @@ static struct attribute *md_redundancy_a md_suspend_lo.attr, md_suspend_hi.attr, md_bitmap.attr, + md_degraded.attr, NULL, }; static struct attribute_group md_redundancy_group = { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 5] md: Make sure read errors are auto-corrected during a 'check' resync in raid1
Whenever a read error is found, we should attempt to overwrite with correct data to 'fix' it. However when we do a 'check' pass (which compares data blocks that are successfully read, but doesn't normally overwrite) we don't do that. We should. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-10-15 14:07:17.0 +1000 +++ ./drivers/md/raid1.c2007-10-15 14:08:55.0 +1000 @@ -1214,7 +1214,8 @@ static void sync_request_write(mddev_t * j = 0; if (j = 0) mddev-resync_mismatches += r1_bio-sectors; - if (j 0 || test_bit(MD_RECOVERY_CHECK, mddev-recovery)) { + if (j 0 || (test_bit(MD_RECOVERY_CHECK, mddev-recovery) + test_bit(BIO_UPTODATE, sbio-bi_flags))) { sbio-bi_end_io = NULL; rdev_dec_pending(conf-mirrors[i].rdev, mddev); } else { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 005 of 5] md: Fix typo that is stopping raid5 grow from working.
This kmem_cache_create is creating a cache that already exists. We could use the alternate name, just like we do a few lines up. Signed-off-by: Neil Brown [EMAIL PROTECTED] Cc: Dan Williams [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-10-15 14:12:03.0 +1000 +++ ./drivers/md/raid5.c2007-10-15 14:12:06.0 +1000 @@ -1380,7 +1380,7 @@ static int resize_stripes(raid5_conf_t * if (!sc) return -ENOMEM; - sc_q = kmem_cache_create(conf-sq_cache_name[conf-active_name], + sc_q = kmem_cache_create(conf-sq_cache_name[1-conf-active_name], (sizeof(struct stripe_queue)+(newsize-1) * sizeof(struct r5_queue_dev)) + r5_io_weight_size(newsize) + - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Fix some bugs with growing raid5/raid6 arrays.
The recent changed to raid5 to allow offload of parity calculation etc introduced some bugs in the code for growing (i.e. adding a disk to) raid5 and raid6. This fixes them Acked-by: Dan Williams [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] --- This is against 2.6.23-rc4. It applies to current -mm with quite a bit of fuzz... Thanks, NeilBrown ### Diffstat output ./drivers/md/raid5.c | 17 + 1 file changed, 9 insertions(+), 8 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-08-30 15:58:55.0 +1000 +++ ./drivers/md/raid5.c2007-08-30 15:58:55.0 +1000 @@ -2541,7 +2541,7 @@ static void handle_stripe_expansion(raid struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, sh-state); for (i = 0; i sh-disks; i++) - if (i != sh-pd_idx (r6s i != r6s-qd_idx)) { + if (i != sh-pd_idx (!r6s || i != r6s-qd_idx)) { int dd_idx, pd_idx, j; struct stripe_head *sh2; @@ -2574,7 +2574,8 @@ static void handle_stripe_expansion(raid set_bit(R5_UPTODATE, sh2-dev[dd_idx].flags); for (j = 0; j conf-raid_disks; j++) if (j != sh2-pd_idx - (r6s j != r6s-qd_idx) + (!r6s || j != raid6_next_disk(sh2-pd_idx, +sh2-disks)) !test_bit(R5_Expanded, sh2-dev[j].flags)) break; if (j == conf-raid_disks) { @@ -2583,12 +2584,12 @@ static void handle_stripe_expansion(raid } release_stripe(sh2); - /* done submitting copies, wait for them to complete */ - if (i + 1 = sh-disks) { - async_tx_ack(tx); - dma_wait_for_async_tx(tx); - } } + /* done submitting copies, wait for them to complete */ + if (tx) { + async_tx_ack(tx); + dma_wait_for_async_tx(tx); + } } /* @@ -2855,7 +2856,7 @@ static void handle_stripe5(struct stripe sh-disks = conf-raid_disks; sh-pd_idx = stripe_to_pdidx(sh-sector, conf, conf-raid_disks); - s.locked += handle_write_operations5(sh, 0, 1); + s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded !test_bit(STRIPE_OP_POSTXOR, sh-ops.pending)) { clear_bit(STRIPE_EXPAND_READY, sh-state); - To unsubscribe 
from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped.
When a raid1 array is reshaped (number of drives changed), the list of devices is compacted, so that slots for missing devices are filled with working devices from later slots. This requires the rd%d symlinks in sysfs to be updated. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c | 22 ++ 1 file changed, 18 insertions(+), 4 deletions(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 +++ ./drivers/md/raid1.c2007-08-16 10:29:58.0 +1000 @@ -2154,11 +2154,25 @@ static int raid1_reshape(mddev_t *mddev) oldpool = conf-r1bio_pool; conf-r1bio_pool = newpool; - for (d=d2=0; d conf-raid_disks; d++) - if (conf-mirrors[d].rdev) { - conf-mirrors[d].rdev-raid_disk = d2; - newmirrors[d2++].rdev = conf-mirrors[d].rdev; + for (d = d2 = 0; d conf-raid_disks; d++) { + mdk_rdev_t *rdev = conf-mirrors[d].rdev; + if (rdev rdev-raid_disk != d2) { + char nm[20]; + sprintf(nm, rd%d, rdev-raid_disk); + sysfs_remove_link(mddev-kobj, nm); + rdev-raid_disk = d2; + sprintf(nm, rd%d, rdev-raid_disk); + sysfs_remove_link(mddev-kobj, nm); + if (sysfs_create_link(mddev-kobj, + rdev-kobj, nm)) + printk(KERN_WARNING + md/raid1: cannot register + %s for %s\n, + nm, mdname(mddev)); } + if (rdev) + newmirrors[d2++].rdev = rdev; + } kfree(conf-mirrors); conf-mirrors = newmirrors; kfree(conf-poolinfo); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 2] md: bug fixes for 2.6.23-rc
Following 2 patches contain bugfixes for md. Both apply to earlier kernels, but probably aren't significant enough for -stable (no oops, no data corruption, no security hole). They should go in 2.6.23 though. Thanks, NeilBrown [PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when resyncing. [PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when resyncing.
Commit 1757128438d41670ded8bc3bc735325cc07dc8f9 was slightly bad. If an array has a write-intent bitmap, and you remove a drive, then readd it, only the changed parts should be resynced. However after the above commit, this only works if the array has not been shut down and restarted. This is because it sets 'fullsync' a little more often than it should. This patch is more careful. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 +++ ./drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 @@ -1972,7 +1972,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, disk-rdev-flags)) { disk-head_position = 0; mddev-degraded++; - conf-fullsync = 1; + if (disk-rdev) + conf-fullsync = 1; } } if (mddev-degraded == conf-raid_disks) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 7] md: Introduction EXPLAIN PATCH SET HERE
Following are 7 patches for md in current main-line. The first two fix bugs that can cause data corruption, and so are suitable for -stable. The next fixes some problems with hot-adding a device to a linear array. As has not been tested by my test-suite until now, it hasn't worked properly until now :-( The remainder are mainly cleaning up code and comments. They could wait for 2.6.23, but are probably safe to go into 2.6.22 (maybe sit a week in -mm??) Thanks, NeilBrown [PATCH 001 of 7] md: Avoid overflow in raid0 calculation with large components. [PATCH 002 of 7] md: Don't write more than is required of the last page of a bitmap [PATCH 003 of 7] md: Fix bug with linear hot-add and elsewhere. [PATCH 004 of 7] md: Improve message about invalid superblock during autodetect. [PATCH 005 of 7] md: Improve the is_mddev_idle test fix [PATCH 006 of 7] md: Check that internal bitmap does not overlap other data. [PATCH 007 of 7] md: Change bitmap_unplug and others to void functions. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 7] md: Avoid overflow in raid0 calculation with large components.
If a raid0 has a component device larger than 4TB, and is accessed on a 32bit machine, then as 'chunk' is unsigned long, chunk chunksize_bits can overflow (this can be as high as the size of the device in KB). chunk itself will not overflow (without triggering a BUG). So change 'chunk' to be 'sector_t', and get rid of the 'BUG' as it becomes impossible to hit. Cc: Jeff Zheng [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] Cc: [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid0.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff .prev/drivers/md/raid0.c ./drivers/md/raid0.c --- .prev/drivers/md/raid0.c2007-05-18 11:48:57.0 +1000 +++ ./drivers/md/raid0.c2007-05-18 11:48:57.0 +1000 @@ -415,7 +415,7 @@ static int raid0_make_request (request_q raid0_conf_t *conf = mddev_to_conf(mddev); struct strip_zone *zone; mdk_rdev_t *tmp_dev; - unsigned long chunk; + sector_t chunk; sector_t block, rsect; const int rw = bio_data_dir(bio); @@ -470,7 +470,6 @@ static int raid0_make_request (request_q sector_div(x, zone-nb_dev); chunk = x; - BUG_ON(x != (sector_t)chunk); x = block chunksize_bits; tmp_dev = zone-dev[sector_div(x, zone-nb_dev)]; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 7] md: Don't write more than is required of the last page of a bitmap
It is possible that real data or metadata follows the bitmap without full page alignment. So limit the last write to be only the required number of bytes, rounded up to the hard sector size of the device. Signed-off-by: Neil Brown [EMAIL PROTECTED] Cc: [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c | 17 - ./include/linux/raid/bitmap.h |1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-05-18 11:49:18.0 +1000 +++ ./drivers/md/bitmap.c 2007-05-18 11:49:18.0 +1000 @@ -255,19 +255,25 @@ static struct page *read_sb_page(mddev_t } -static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait) +static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev; struct list_head *tmp; + mddev_t *mddev = bitmap-mddev; ITERATE_RDEV(mddev, rdev, tmp) if (test_bit(In_sync, rdev-flags) -!test_bit(Faulty, rdev-flags)) +!test_bit(Faulty, rdev-flags)) { + int size = PAGE_SIZE; + if (page-index == bitmap-file_pages-1) + size = roundup(bitmap-last_page_size, + bdev_hardsect_size(rdev-bdev)); md_super_write(mddev, rdev, - (rdev-sb_offset1) + offset + (rdev-sb_offset1) + bitmap-offset + page-index * (PAGE_SIZE/512), - PAGE_SIZE, + size, page); + } if (wait) md_super_wait(mddev); @@ -282,7 +288,7 @@ static int write_page(struct bitmap *bit struct buffer_head *bh; if (bitmap-file == NULL) - return write_sb_page(bitmap-mddev, bitmap-offset, page, wait); + return write_sb_page(bitmap, page, wait); bh = page_buffers(page); @@ -923,6 +929,7 @@ static int bitmap_init_from_disk(struct } bitmap-filemap[bitmap-file_pages++] = page; + bitmap-last_page_size = count; } paddr = kmap_atomic(page, KM_USER0); if (bitmap-flags BITMAP_HOSTENDIAN) diff .prev/include/linux/raid/bitmap.h ./include/linux/raid/bitmap.h --- .prev/include/linux/raid/bitmap.h 2007-05-18 11:49:18.0 +1000 +++ ./include/linux/raid/bitmap.h 2007-05-18 11:49:18.0 +1000 @@ -232,6 
+232,7 @@ struct bitmap { struct page **filemap; /* list of cache pages for the file */ unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ unsigned long flags; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 7] md: Fix bug with linear hot-add and elsewhere.
Adding a drive to a linear array seems to have stopped working, due to changes elsewhere in md, and insufficient ongoing testing... So the patch to make linear hot-add work in the first place introduced a subtle bug elsewhere that interracts poorly with older version of mdadm. This fixes it all up. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/linear.c | 10 +- ./drivers/md/md.c | 20 ++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff .prev/drivers/md/linear.c ./drivers/md/linear.c --- .prev/drivers/md/linear.c 2007-05-21 11:13:25.0 +1000 +++ ./drivers/md/linear.c 2007-05-21 11:13:39.0 +1000 @@ -139,8 +139,6 @@ static linear_conf_t *linear_conf(mddev_ if (!conf) return NULL; - mddev-private = conf; - cnt = 0; conf-array_size = 0; @@ -232,7 +230,7 @@ static linear_conf_t *linear_conf(mddev_ * First calculate the device offsets. */ conf-disks[0].offset = 0; - for (i=1; imddev-raid_disks; i++) + for (i=1; i raid_disks; i++) conf-disks[i].offset = conf-disks[i-1].offset + conf-disks[i-1].size; @@ -244,7 +242,7 @@ static linear_conf_t *linear_conf(mddev_ curr_offset conf-array_size; curr_offset += conf-hash_spacing) { - while (i mddev-raid_disks-1 + while (i raid_disks-1 curr_offset = conf-disks[i+1].offset) i++; @@ -299,9 +297,11 @@ static int linear_add(mddev_t *mddev, md */ linear_conf_t *newconf; - if (rdev-raid_disk != mddev-raid_disks) + if (rdev-saved_raid_disk != mddev-raid_disks) return -EINVAL; + rdev-raid_disk = rdev-saved_raid_disk; + newconf = linear_conf(mddev,mddev-raid_disks+1); if (!newconf) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-21 11:13:25.0 +1000 +++ ./drivers/md/md.c 2007-05-21 11:14:54.0 +1000 @@ -1298,8 +1298,9 @@ static void super_1_sync(mddev_t *mddev, ITERATE_RDEV(mddev,rdev2,tmp) if (rdev2-desc_nr+1 max_dev) max_dev = rdev2-desc_nr+1; - - sb-max_dev = cpu_to_le32(max_dev); + + if (max_dev le32_to_cpu(sb-max_dev)) + sb-max_dev = cpu_to_le32(max_dev); for (i=0; 
imax_dev;i++) sb-dev_roles[i] = cpu_to_le16(0xfffe); @@ -1365,10 +1366,14 @@ static int bind_rdev_to_array(mdk_rdev_t } /* make sure rdev-size exceeds mddev-size */ if (rdev-size (mddev-size == 0 || rdev-size mddev-size)) { - if (mddev-pers) - /* Cannot change size, so fail */ - return -ENOSPC; - else + if (mddev-pers) { + /* Cannot change size, so fail +* If mddev-level = 0, then we don't care +* about aligning sizes (e.g. linear) +*/ + if (mddev-level 0) + return -ENOSPC; + } else mddev-size = rdev-size; } @@ -2142,6 +2147,9 @@ static void analyze_sbs(mddev_t * mddev) rdev-desc_nr = i++; rdev-raid_disk = rdev-desc_nr; set_bit(In_sync, rdev-flags); + } else if (rdev-raid_disk = mddev-raid_disks) { + rdev-raid_disk = -1; + clear_bit(In_sync, rdev-flags); } } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 7] md: Improve message about invalid superblock during autodetect.
People try to use raid auto-detect with version-1 superblocks (which is not supported) and get confused when they are told they have an invalid superblock. So be more explicit, and say it it is not a valid v0.90 superblock. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-21 11:14:54.0 +1000 +++ ./drivers/md/md.c 2007-05-21 11:16:16.0 +1000 @@ -2073,9 +2073,11 @@ static mdk_rdev_t *md_import_device(dev_ err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { - printk(KERN_WARNING - md: %s has invalid sb, not importing!\n, - bdevname(rdev-bdev,b)); + printk(KERN_WARNING + md: %s does not have a valid v%d.%d + superblock, not importing!\n, + bdevname(rdev-bdev,b), + super_format, super_minor); goto abort_free; } if (err 0) { @@ -5772,7 +5774,7 @@ static void autostart_arrays(int part) for (i = 0; i dev_cnt; i++) { dev_t dev = detected_devices[i]; - rdev = md_import_device(dev,0, 0); + rdev = md_import_device(dev,0, 90); if (IS_ERR(rdev)) continue; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 005 of 7] md: Improve the is_mddev_idle test fix
Don't use 'unsigned' variable to track sync vs non-sync IO, as the only thing we want to do with them is a signed comparison, and fix up the comment which had become quite wrong. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 35 ++- ./include/linux/raid/md_k.h |2 +- 2 files changed, 23 insertions(+), 14 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-21 11:17:57.0 +1000 +++ ./drivers/md/md.c 2007-05-21 11:18:00.0 +1000 @@ -5092,7 +5092,7 @@ static int is_mddev_idle(mddev_t *mddev) mdk_rdev_t * rdev; struct list_head *tmp; int idle; - unsigned long curr_events; + long curr_events; idle = 1; ITERATE_RDEV(mddev,rdev,tmp) { @@ -5100,20 +5100,29 @@ static int is_mddev_idle(mddev_t *mddev) curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - atomic_read(disk-sync_io); - /* The difference between curr_events and last_events -* will be affected by any new non-sync IO (making -* curr_events bigger) and any difference in the amount of -* in-flight syncio (making current_events bigger or smaller) -* The amount in-flight is currently limited to -* 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 -* which is at most 4096 sectors. -* These numbers are fairly fragile and should be made -* more robust, probably by enforcing the -* 'window size' that md_do_sync sort-of uses. + /* sync IO will cause sync_io to increase before the disk_stats +* as sync_io is counted when a request starts, and +* disk_stats is counted when it completes. +* So resync activity will cause curr_events to be smaller than +* when there was no such activity. +* non-sync IO will cause disk_stat to increase without +* increasing sync_io so curr_events will (eventually) +* be larger than it was before. Once it becomes +* substantially larger, the test below will cause +* the array to appear non-idle, and resync will slow +* down. 
+* If there is a lot of outstanding resync activity when +* we set last_event to curr_events, then all that activity +* completing might cause the array to appear non-idle +* and resync will be slowed down even though there might +* not have been non-resync activity. This will only +* happen once though. 'last_events' will soon reflect +* the state where there is little or no outstanding +* resync requests, and further resync activity will +* always make curr_events less than last_events. * -* Note: the following is an unsigned comparison. */ - if ((long)curr_events - (long)rdev-last_events 4096) { + if (curr_events - rdev-last_events 4096) { rdev-last_events = curr_events; idle = 0; } diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2007-05-21 11:17:57.0 +1000 +++ ./include/linux/raid/md_k.h 2007-05-21 11:18:00.0 +1000 @@ -51,7 +51,7 @@ struct mdk_rdev_s sector_t size; /* Device size (in blocks) */ mddev_t *mddev; /* RAID array if running */ - unsigned long last_events; /* IO event timestamp */ + long last_events; /* IO event timestamp */ struct block_device *bdev; /* block device handle */ - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 006 of 7] md: Check that internal bitmap does not overlap other data.
We current completely trust user-space to set up metadata describing an consistant array. In particlar, that the metadata, data, and bitmap do not overlap. But userspace can be buggy, and it is better to report an error than corrupt data. So put in some appropriate checks. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c | 35 +-- ./drivers/md/md.c | 22 +- 2 files changed, 54 insertions(+), 3 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-05-21 11:17:57.0 +1000 +++ ./drivers/md/bitmap.c 2007-05-21 11:18:22.0 +1000 @@ -268,6 +268,31 @@ static int write_sb_page(struct bitmap * if (page-index == bitmap-file_pages-1) size = roundup(bitmap-last_page_size, bdev_hardsect_size(rdev-bdev)); + /* Just make sure we aren't corrupting data or +* metadata +*/ + if (bitmap-offset 0) { + /* DATA BITMAP METADATA */ + if (bitmap-offset + + page-index * (PAGE_SIZE/512) + + size/512 0) + /* bitmap runs in to metadata */ + return -EINVAL; + if (rdev-data_offset + mddev-size*2 +rdev-sb_offset*2 + bitmap-offset) + /* data runs in to bitmap */ + return -EINVAL; + } else if (rdev-sb_offset*2 rdev-data_offset) { + /* METADATA BITMAP DATA */ + if (rdev-sb_offset*2 + + bitmap-offset + + page-index*(PAGE_SIZE/512) + size/512 +rdev-data_offset) + /* bitmap runs in to data */ + return -EINVAL; + } else { + /* DATA METADATA BITMAP - no problems */ + } md_super_write(mddev, rdev, (rdev-sb_offset1) + bitmap-offset + page-index * (PAGE_SIZE/512), @@ -287,8 +312,14 @@ static int write_page(struct bitmap *bit { struct buffer_head *bh; - if (bitmap-file == NULL) - return write_sb_page(bitmap, page, wait); + if (bitmap-file == NULL) { + switch (write_sb_page(bitmap, page, wait)) { + case -EINVAL: + bitmap-flags |= BITMAP_WRITE_ERROR; + return -EIO; + } + return 0; + } bh = page_buffers(page); diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-21 11:18:00.0 +1000 +++ ./drivers/md/md.c 
2007-05-21 11:18:22.0 +1000 @@ -3176,13 +3176,33 @@ static int do_md_run(mddev_t * mddev) * Drop all container device buffers, from now on * the only valid external interface is through the md * device. -* Also find largest hardsector size */ ITERATE_RDEV(mddev,rdev,tmp) { if (test_bit(Faulty, rdev-flags)) continue; sync_blockdev(rdev-bdev); invalidate_bdev(rdev-bdev); + + /* perform some consistency tests on the device. +* We don't want the data to overlap the metadata, +* Internal Bitmap issues has handled elsewhere. +*/ + if (rdev-data_offset rdev-sb_offset) { + if (mddev-size + rdev-data_offset + mddev-size*2 +rdev-sb_offset*2) { + printk(md: %s: data overlaps metadata\n, + mdname(mddev)); + return -EINVAL; + } + } else { + if (rdev-sb_offset*2 + rdev-sb_size/512 +rdev-data_offset) { + printk(md: %s: metadata overlaps data\n, + mdname(mddev)); + return -EINVAL; + } + } } md_probe(mddev-unit, NULL, NULL); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 007 of 7] md: Change bitmap_unplug and others to void functions.
bitmap_unplug only ever returns 0, so it may as well be void. Two callers try to print a message if it returns non-zero, but that message is already printed by bitmap_file_kick. write_page returns an error which is not consistently checked. It always causes BITMAP_WRITE_ERROR to be set on an error, and that can more conveniently be checked. When the return of write_page is checked, an error causes bitmap_file_kick to be called - so move that call into write_page - and protect against recursive calls into bitmap_file_kick. bitmap_update_sb returns an error that is never checked. So make these 'void' and be consistent about checking the bit. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c | 144 +- ./drivers/md/md.c |3 ./drivers/md/raid1.c |3 ./drivers/md/raid10.c |3 ./include/linux/raid/bitmap.h |6 - 5 files changed, 80 insertions(+), 79 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-05-21 11:18:22.0 +1000 +++ ./drivers/md/bitmap.c 2007-05-21 11:18:23.0 +1000 @@ -305,10 +305,11 @@ static int write_sb_page(struct bitmap * return 0; } +static void bitmap_file_kick(struct bitmap *bitmap); /* * write out a page to a file */ -static int write_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh; @@ -316,27 +317,26 @@ static int write_page(struct bitmap *bit switch (write_sb_page(bitmap, page, wait)) { case -EINVAL: bitmap-flags |= BITMAP_WRITE_ERROR; - return -EIO; } - return 0; - } + } else { - bh = page_buffers(page); + bh = page_buffers(page); - while (bh bh-b_blocknr) { - atomic_inc(bitmap-pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(WRITE, bh); - bh = bh-b_this_page; - } + while (bh bh-b_blocknr) { + atomic_inc(bitmap-pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(WRITE, bh); + bh = bh-b_this_page; + } - if 
(wait) { - wait_event(bitmap-write_wait, - atomic_read(bitmap-pending_writes)==0); - return (bitmap-flags BITMAP_WRITE_ERROR) ? -EIO : 0; + if (wait) { + wait_event(bitmap-write_wait, + atomic_read(bitmap-pending_writes)==0); + } } - return 0; + if (bitmap-flags BITMAP_WRITE_ERROR) + bitmap_file_kick(bitmap); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -456,17 +456,17 @@ out: */ /* update the event counter and sync the superblock to disk */ -int bitmap_update_sb(struct bitmap *bitmap) +void bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long flags; if (!bitmap || !bitmap-mddev) /* no bitmap for this array */ - return 0; + return; spin_lock_irqsave(bitmap-lock, flags); if (!bitmap-sb_page) { /* no superblock */ spin_unlock_irqrestore(bitmap-lock, flags); - return 0; + return; } spin_unlock_irqrestore(bitmap-lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap-sb_page, KM_USER0); @@ -474,7 +474,7 @@ int bitmap_update_sb(struct bitmap *bitm if (!bitmap-mddev-degraded) sb-events_cleared = cpu_to_le64(bitmap-mddev-events); kunmap_atomic(sb, KM_USER0); - return write_page(bitmap, bitmap-sb_page, 1); + write_page(bitmap, bitmap-sb_page, 1); } /* print out the bitmap file superblock */ @@ -603,20 +603,22 @@ enum bitmap_mask_op { MASK_UNSET }; -/* record the state of the bitmap in the superblock */ -static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, - enum bitmap_mask_op op) +/* record the state of the bitmap in the superblock. Return the old value */ +static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, +enum bitmap_mask_op op) { bitmap_super_t *sb; unsigned long flags; + int old; spin_lock_irqsave(bitmap-lock, flags); if (!bitmap-sb_page) { /* can't set the state */ spin_unlock_irqrestore(bitmap-lock, flags); - return; + return 0; } spin_unlock_irqrestore(bitmap-lock, flags); sb =
[PATCH 000 of 2] md: Two more bugfixes.
Following are two bugfixes for md in current kernels. The first is suitable for -stable is it can allow drive errors through to the filesystem wrongly. Both are suitable for 2.6.22. Thanks, NeilBrown [PATCH 001 of 2] md: Avoid a possibility that a read error can wrongly propagate through md/raid1 to a filesystem. [PATCH 002 of 2] md: Improve the is_mddev_idle test - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 2] md: Avoid a possibility that a read error can wrongly propagate through md/raid1 to a filesystem.
When a raid1 has only one working drive, we want read error to propagate up to the filesystem as there is no point failing the last drive in an array. Currently the code perform this check is racy. If a write and a read a both submitted to a device on a 2-drive raid1, and the write fails followed by the read failing, the read will see that there is only one working drive and will pass the failure up, even though the one working drive is actually the *other* one. So, tighten up the locking. Cc: [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c | 33 +++-- 1 file changed, 19 insertions(+), 14 deletions(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-05-10 15:51:54.0 +1000 +++ ./drivers/md/raid1.c2007-05-10 15:51:58.0 +1000 @@ -271,21 +271,25 @@ static int raid1_end_read_request(struct */ update_head_pos(mirror, r1_bio); - if (uptodate || (conf-raid_disks - conf-mddev-degraded) = 1) { - /* -* Set R1BIO_Uptodate in our master bio, so that -* we will return a good error code for to the higher -* levels even if IO on some other mirrored buffer fails. -* -* The 'master' represents the composite IO operation to -* user-side. So if something waits for IO, then it will -* wait for the 'master' bio. + if (uptodate) + set_bit(R1BIO_Uptodate, r1_bio-state); + else { + /* If all other devices have failed, we want to return +* the error upwards rather than fail the last device. 
+* Here we redefine uptodate to mean Don't want to retry */ - if (uptodate) - set_bit(R1BIO_Uptodate, r1_bio-state); + unsigned long flags; + spin_lock_irqsave(conf-device_lock, flags); + if (r1_bio-mddev-degraded == conf-raid_disks || + (r1_bio-mddev-degraded == conf-raid_disks-1 +!test_bit(Faulty, conf-mirrors[mirror].rdev-flags))) + uptodate = 1; + spin_unlock_irqrestore(conf-device_lock, flags); + } + if (uptodate) raid_end_bio_io(r1_bio); - } else { + else { /* * oops, read error: */ @@ -992,13 +996,14 @@ static void error(mddev_t *mddev, mdk_rd unsigned long flags; spin_lock_irqsave(conf-device_lock, flags); mddev-degraded++; + set_bit(Faulty, rdev-flags); spin_unlock_irqrestore(conf-device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, mddev-recovery); - } - set_bit(Faulty, rdev-flags); + } else + set_bit(Faulty, rdev-flags); set_bit(MD_CHANGE_DEVS, mddev-flags); printk(KERN_ALERT raid1: Disk failure on %s, disabling device. \n Operation continuing on %d devices\n, - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 2] md: Improve the is_mddev_idle test
During a 'resync' or similar activity, md checks if the devices in the array are otherwise active and winds back resync activity when they are. This test is done in is_mddev_idle, and it is somewhat fragile - it sometimes thinks there is non-sync io when there isn't. The test compares the total sectors of io (disk_stat_read) with the sectors of resync io (disk-sync_io). This has problems because total sectors gets updated when a request completes, while resync io gets updated when the request is submitted. The time difference can cause large differences between the two which do not actually imply non-resync activity. The test currently allows for some fuzz (+/- 4096) but there are some cases when it is not enough. The test currently looks for any (non-fuzz) difference, either positive or negative. This clearly is not needed. Any non-sync activity will cause the total sectors to grow faster than the sync_io count (never slower) so we only need to look for a positive difference. If we do this then the amount of in-flight sync io will never cause the appearance of non-sync IO. Once enough non-sync IO to worry about starts happening, resync will be slowed down and the measurements will thus be more precise (as there is less in-flight) and control of resync will still be suitably responsive. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-10 15:51:54.0 +1000 +++ ./drivers/md/md.c 2007-05-10 16:05:10.0 +1000 @@ -5095,7 +5095,7 @@ static int is_mddev_idle(mddev_t *mddev) * * Note: the following is an unsigned comparison. 
*/ - if ((curr_events - rdev-last_events + 4096) 8192) { + if ((long)curr_events - (long)rdev-last_events 4096) { rdev-last_events = curr_events; idle = 0; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 5] md: assorted bug fixes and minor features
Following are 5 patches for md suitable for 2.6.22. None are needed for -stable. Thanks NeilBrown [PATCH 001 of 5] md: Move test for whether level supports bitmap to correct place. [PATCH 002 of 5] md: Stop using csum_partial for checksum calculation in md. [PATCH 003 of 5] md: Remove the slash from the name of a kmem_cache used by raid5. [PATCH 004 of 5] md: Allow reshape_position for md arrays to be set via sysfs. [PATCH 005 of 5] md: Improve partition detection in md array. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 5] md: Move test for whether level supports bitmap to correct place.
We need to check for internal-consistency of superblock in load_super. validate_super is for inter-device consistency. With the test in the wrong place, a badly created array will confuse md rather than produce sensible errors. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 42 ++ 1 file changed, 26 insertions(+), 16 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-07 14:33:31.0 +1000 +++ ./drivers/md/md.c 2007-05-07 14:33:31.0 +1000 @@ -695,6 +695,17 @@ static int super_90_load(mdk_rdev_t *rde rdev-data_offset = 0; rdev-sb_size = MD_SB_BYTES; + if (sb-state (1MD_SB_BITMAP_PRESENT)) { + if (sb-level != 1 sb-level != 4 +sb-level != 5 sb-level != 6 +sb-level != 10) { + /* FIXME use a better test */ + printk(KERN_WARNING + md: bitmaps not supported for this level.\n); + goto abort; + } + } + if (sb-level == LEVEL_MULTIPATH) rdev-desc_nr = -1; else @@ -793,16 +804,8 @@ static int super_90_validate(mddev_t *md mddev-max_disks = MD_SB_DISKS; if (sb-state (1MD_SB_BITMAP_PRESENT) - mddev-bitmap_file == NULL) { - if (mddev-level != 1 mddev-level != 4 -mddev-level != 5 mddev-level != 6 -mddev-level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING md: bitmaps not supported for this level.\n); - return -EINVAL; - } + mddev-bitmap_file == NULL) mddev-bitmap_offset = mddev-default_bitmap_offset; - } } else if (mddev-pers == NULL) { /* Insist on good event counter while assembling */ @@ -1059,6 +1062,18 @@ static int super_1_load(mdk_rdev_t *rdev bdevname(rdev-bdev,b)); return -EINVAL; } + if ((le32_to_cpu(sb-feature_map) MD_FEATURE_BITMAP_OFFSET)) { + if (sb-level != cpu_to_le32(1) + sb-level != cpu_to_le32(4) + sb-level != cpu_to_le32(5) + sb-level != cpu_to_le32(6) + sb-level != cpu_to_le32(10)) { + printk(KERN_WARNING + md: bitmaps not supported for this level.\n); + return -EINVAL; + } + } + rdev-preferred_minor = 0x; rdev-data_offset = le64_to_cpu(sb-data_offset); 
atomic_set(rdev-corrected_errors, le32_to_cpu(sb-cnt_corrected_read)); @@ -1142,14 +1157,9 @@ static int super_1_validate(mddev_t *mdd mddev-max_disks = (4096-256)/2; if ((le32_to_cpu(sb-feature_map) MD_FEATURE_BITMAP_OFFSET) - mddev-bitmap_file == NULL ) { - if (mddev-level != 1 mddev-level != 5 mddev-level != 6 -mddev-level != 10) { - printk(KERN_WARNING md: bitmaps not supported for this level.\n); - return -EINVAL; - } + mddev-bitmap_file == NULL ) mddev-bitmap_offset = (__s32)le32_to_cpu(sb-bitmap_offset); - } + if ((le32_to_cpu(sb-feature_map) MD_FEATURE_RESHAPE_ACTIVE)) { mddev-reshape_position = le64_to_cpu(sb-reshape_position); mddev-delta_disks = le32_to_cpu(sb-delta_disks); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 5] md: Remove the slash from the name of a kmem_cache used by raid5.
SLUB doesn't like slashes as it wants to use the cache name as the name of a directory (or symlink) in sysfs. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-05-07 14:36:01.0 +1000 +++ ./drivers/md/raid5.c2007-05-07 15:08:45.0 +1000 @@ -931,8 +931,8 @@ static int grow_stripes(raid5_conf_t *co struct kmem_cache *sc; int devs = conf-raid_disks; - sprintf(conf-cache_name[0], raid5/%s, mdname(conf-mddev)); - sprintf(conf-cache_name[1], raid5/%s-alt, mdname(conf-mddev)); + sprintf(conf-cache_name[0], raid5-%s, mdname(conf-mddev)); + sprintf(conf-cache_name[1], raid5-%s-alt, mdname(conf-mddev)); conf-active_name = 0; sc = kmem_cache_create(conf-cache_name[conf-active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 5] md: Allow reshape_position for md arrays to be set via sysfs.
reshape_position records how much progress has been made on a reshape (adding drives, changing layout or chunksize). When it is set, the number of drives, layout and chunksize can have two possible values, an old an a new. So allow these different values to be visible, and allow both old and new to be set: Set the old ones first, then the reshape_position, then the new values. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./Documentation/md.txt | 72 + ./drivers/md/md.c | 70 ++- 2 files changed, 107 insertions(+), 35 deletions(-) diff .prev/Documentation/md.txt ./Documentation/md.txt --- .prev/Documentation/md.txt 2007-05-07 15:38:13.0 +1000 +++ ./Documentation/md.txt 2007-05-07 15:49:01.0 +1000 @@ -178,6 +178,21 @@ All md devices contain: The size should be at least PAGE_SIZE (4k) and should be a power of 2. This can only be set while assembling an array + layout + The layout for the array for the particular level. This is + simply a number that is interpretted differently by different + levels. It can be written while assembling an array. + + reshape_position + This is either none or a sector number within the devices of + the array where reshape is up to. If this is set, the three + attributes mentioned above (raid_disks, chunk_size, layout) can + potentially have 2 values, an old and a new value. If these + values differ, reading the attribute returns +new (old) + and writing will effect the 'new' value, leaving the 'old' + unchanged. + component_size For arrays with data redundancy (i.e. not raid0, linear, faulty, multipath), all components must be the same size - or at least @@ -193,11 +208,6 @@ All md devices contain: 1.2 (newer format in varying locations) or none indicating that the kernel isn't managing metadata at all. - layout - The layout for the array for the particular level. This is - simply a number that is interpretted differently by different - levels. It can be written while assembling an array. 
- resync_start The point at which resync should start. If no resync is needed, this will be a very large number. At array creation it will @@ -259,29 +269,6 @@ All md devices contain: like active, but no writes have been seen for a while (safe_mode_delay). - sync_speed_min - sync_speed_max - This are similar to /proc/sys/dev/raid/speed_limit_{min,max} - however they only apply to the particular array. - If no value has been written to these, of if the word 'system' - is written, then the system-wide value is used. If a value, - in kibibytes-per-second is written, then it is used. - When the files are read, they show the currently active value - followed by (local) or (system) depending on whether it is - a locally set or system-wide value. - - sync_completed - This shows the number of sectors that have been completed of - whatever the current sync_action is, followed by the number of - sectors in total that could need to be processed. The two - numbers are separated by a '/' thus effectively showing one - value, a fraction of the process that is complete. - - sync_speed - This shows the current actual speed, in K/sec, of the current - sync_action. It is averaged over the last 30 seconds. - - As component devices are added to an md array, they appear in the 'md' directory as new directories named dev-XXX @@ -412,6 +399,35 @@ also have Note that the numbers are 'bit' numbers, not 'block' numbers. They should be scaled by the bitmap_chunksize. + sync_speed_min + sync_speed_max + This are similar to /proc/sys/dev/raid/speed_limit_{min,max} + however they only apply to the particular array. + If no value has been written to these, of if the word 'system' + is written, then the system-wide value is used. If a value, + in kibibytes-per-second is written, then it is used. + When the files are read, they show the currently active value + followed by (local) or (system) depending on whether it is + a locally set or system-wide value. 
+ + sync_completed + This shows the number of sectors that have been completed of + whatever the current sync_action is, followed by the number of + sectors in total that could need to be processed. The two + numbers are separated by a '/' thus effectively showing one + value, a fraction of the process that is complete. + + sync_speed + This shows the current actual speed, in K/sec, of the current + sync_action. It is averaged over the last 30 seconds. + + suspend_lo + suspend_hi + The two values, given as numbers of sectors, indicate a range + within the array where IO will be blocked. This is currently +
[PATCH 005 of 5] md: Improve partition detection in md array.
md currently uses -media_changed to make sure rescan_partitions is call on md array after they are assembled. However that doesn't happen until the array is opened, which is later than some people would like. So use blkdev_ioctl to do the rescan immediately that the array has been assembled. This means we can remove all the -change infrastructure as it was only used to trigger a partition rescan. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 26 -- ./drivers/md/raid1.c|1 - ./drivers/md/raid5.c|2 -- ./include/linux/raid/md_k.h |1 - 4 files changed, 8 insertions(+), 22 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-08 14:24:00.0 +1000 +++ ./drivers/md/md.c 2007-05-07 17:47:15.0 +1000 @@ -3104,6 +3104,7 @@ static int do_md_run(mddev_t * mddev) struct gendisk *disk; struct mdk_personality *pers; char b[BDEVNAME_SIZE]; + struct block_device *bdev; if (list_empty(mddev-disks)) /* cannot run an array with no devices.. 
*/ @@ -3331,7 +3332,13 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev-thread); md_wakeup_thread(mddev-sync_thread); /* possibly kick off a reshape */ - mddev-changed = 1; + bdev = bdget_disk(mddev-gendisk, 0); + if (bdev) { + bd_set_size(bdev, mddev-array_size 1); + blkdev_ioctl(bdev-bd_inode, NULL, BLKRRPART, 0); + bdput(bdev); + } + md_new_event(mddev); kobject_uevent(mddev-gendisk-kobj, KOBJ_CHANGE); return 0; @@ -3453,7 +3460,6 @@ static int do_md_stop(mddev_t * mddev, i mddev-pers = NULL; set_capacity(disk, 0); - mddev-changed = 1; if (mddev-ro) mddev-ro = 0; @@ -4593,20 +4599,6 @@ static int md_release(struct inode *inod return 0; } -static int md_media_changed(struct gendisk *disk) -{ - mddev_t *mddev = disk-private_data; - - return mddev-changed; -} - -static int md_revalidate(struct gendisk *disk) -{ - mddev_t *mddev = disk-private_data; - - mddev-changed = 0; - return 0; -} static struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -4614,8 +4606,6 @@ static struct block_device_operations md .release= md_release, .ioctl = md_ioctl, .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-05-08 14:24:00.0 +1000 +++ ./drivers/md/raid1.c2007-05-07 17:02:27.0 +1000 @@ -2063,7 +2063,6 @@ static int raid1_resize(mddev_t *mddev, */ mddev-array_size = sectors1; set_capacity(mddev-gendisk, mddev-array_size 1); - mddev-changed = 1; if (mddev-array_size mddev-size mddev-recovery_cp == MaxSector) { mddev-recovery_cp = mddev-size 1; set_bit(MD_RECOVERY_NEEDED, mddev-recovery); diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-05-08 14:24:00.0 +1000 +++ ./drivers/md/raid5.c2007-05-07 17:03:05.0 +1000 @@ -4514,7 +4514,6 @@ static int raid5_resize(mddev_t *mddev, sectors = ~((sector_t)mddev-chunk_size/512 - 1); mddev-array_size = (sectors * 
(mddev-raid_disks-conf-max_degraded))1; set_capacity(mddev-gendisk, mddev-array_size 1); - mddev-changed = 1; if (sectors/2 mddev-size mddev-recovery_cp == MaxSector) { mddev-recovery_cp = mddev-size 1; set_bit(MD_RECOVERY_NEEDED, mddev-recovery); @@ -4649,7 +4648,6 @@ static void end_reshape(raid5_conf_t *co conf-mddev-array_size = conf-mddev-size * (conf-raid_disks - conf-max_degraded); set_capacity(conf-mddev-gendisk, conf-mddev-array_size 1); - conf-mddev-changed = 1; bdev = bdget_disk(conf-mddev-gendisk, 0); if (bdev) { diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2007-05-08 14:24:00.0 +1000 +++ ./include/linux/raid/md_k.h 2007-05-08 14:24:36.0 +1000 @@ -201,7 +201,6 @@ struct mddev_s struct mutexreconfig_mutex; atomic_tactive; - int changed;/* true if we might need to reread partition info */ int degraded; /* whether md should consider * adding a spare
[PATCH 002 of 5] md: Stop using csum_partial for checksum calculation in md.
If CONFIG_NET is not selected, csum_partial is not exported, so md.ko cannot use it. We shouldn't really be using csum_partial anyway as it is an internal-to-networking interface. So replace it with C code to do the same thing. Speed is not crucial here, so something simple and correct is best. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 31 +-- 1 file changed, 29 insertions(+), 2 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-05-07 14:33:31.0 +1000 +++ ./drivers/md/md.c 2007-05-07 14:57:41.0 +1000 @@ -590,14 +590,41 @@ abort: return ret; } + +static u32 md_csum_fold(u32 csum) +{ + csum = (csum 0x) + (csum 16); + return (csum 0x) + (csum 16); +} + static unsigned int calc_sb_csum(mdp_super_t * sb) { + u64 newcsum = 0; + u32 *sb32 = (u32*)sb; + int i; unsigned int disk_csum, csum; disk_csum = sb-sb_csum; sb-sb_csum = 0; - csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + + for (i = 0; i MD_SB_BYTES/4 ; i++) + newcsum += sb32[i]; + csum = (newcsum 0x) + (newcsum32); + + +#ifdef CONFIG_ALPHA + /* This used to use csum_partial, which was wrong for several +* reasons including that different results are returned on +* different architectures. It isn't critical that we get exactly +* the same return value as before (we always csum_fold before +* testing, and that removes any differences). However as we +* know that csum_partial always returned a 16bit value on +* alphas, do a fold to maximise conformity to previous behaviour. 
+*/ + sb-sb_csum = md_csum_fold(disk_csum); +#else sb-sb_csum = disk_csum; +#endif return csum; } @@ -685,7 +712,7 @@ static int super_90_load(mdk_rdev_t *rde if (sb-raid_disks = 0) goto abort; - if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb-sb_csum)) { + if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb-sb_csum)) { printk(KERN_WARNING md: invalid superblock checksum on %s\n, b); goto abort; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Avoid a deadlock when removing a device from an md array via sysfs.
(This patch should go in 2.6.21 as it fixes a recent regression - NB) A device can be removed from an md array via e.g. echo remove /sys/block/md3/md/dev-sde/state This will try to remove the 'dev-sde' subtree which will deadlock since commit e7b0d26a86943370c04d6833c6edba2a72a6e240 With this patch we run the kobject_del via schedule_work so as to avoid the deadlock. Cc: Alan Stern [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 13 - ./include/linux/raid/md_k.h |1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-04-02 17:43:03.0 +1000 +++ ./drivers/md/md.c 2007-04-02 17:38:46.0 +1000 @@ -1389,6 +1389,12 @@ static int bind_rdev_to_array(mdk_rdev_t return err; } +static void delayed_delete(struct work_struct *ws) +{ + mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); + kobject_del(rdev-kobj); +} + static void unbind_rdev_from_array(mdk_rdev_t * rdev) { char b[BDEVNAME_SIZE]; @@ -1401,7 +1407,12 @@ static void unbind_rdev_from_array(mdk_r printk(KERN_INFO md: unbind%s\n, bdevname(rdev-bdev,b)); rdev-mddev = NULL; sysfs_remove_link(rdev-kobj, block); - kobject_del(rdev-kobj); + + /* We need to delay this, otherwise we can deadlock when +* writing to 'remove' to dev/state +*/ + INIT_WORK(rdev-del_work, delayed_delete); + schedule_work(rdev-del_work); } /* diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2007-04-02 17:43:03.0 +1000 +++ ./include/linux/raid/md_k.h 2007-04-02 17:36:32.0 +1000 @@ -104,6 +104,7 @@ struct mdk_rdev_s * for reporting to userspace and storing * in superblock. */ + struct work_struct del_work;/* used for delayed sysfs removal */ }; struct mddev_s - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 3] md: bug fixes for md for 2.6.21
A minor new feature and 2 bug fixes for md suitable for 2.6.21 The minor feature is to make reshape (adding a drive to an array and restriping it) work for raid4. The code is all ready, it just wasn't used. Thanks, NeilBrown [PATCH 001 of 3] md: Allow raid4 arrays to be reshaped. [PATCH 002 of 3] md: Clear the congested_fn when stopping a raid5 [PATCH 003 of 3] md: Convert compile time warnings into runtime warnings. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 3] md: Allow raid4 arrays to be reshaped.
All that is missing is the function pointers in raid4_pers. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |4 1 file changed, 4 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-03-23 11:13:29.0 +1100 +++ ./drivers/md/raid5.c2007-03-23 11:13:29.0 +1100 @@ -4727,6 +4727,10 @@ static struct mdk_personality raid4_pers .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, +#endif .quiesce= raid5_quiesce, }; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 3] md: Clear the congested_fn when stopping a raid5
If this mddev and queue got reused for another array that doesn't register a congested_fn, this function would get called incorrectly. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c|1 + ./drivers/md/raid5.c |3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-03-23 11:13:41.0 +1100 +++ ./drivers/md/md.c 2007-03-23 11:13:41.0 +1100 @@ -3325,6 +3325,7 @@ static int do_md_stop(mddev_t * mddev, i mddev-queue-merge_bvec_fn = NULL; mddev-queue-unplug_fn = NULL; mddev-queue-issue_flush_fn = NULL; + mddev-queue-backing_dev_info.congested_fn = NULL; if (mddev-pers-sync_request) sysfs_remove_group(mddev-kobj, md_redundancy_group); diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-03-23 11:13:29.0 +1100 +++ ./drivers/md/raid5.c2007-03-23 11:13:41.0 +1100 @@ -4269,8 +4269,8 @@ static int run(mddev_t *mddev) mddev-queue-unplug_fn = raid5_unplug_device; mddev-queue-issue_flush_fn = raid5_issue_flush; - mddev-queue-backing_dev_info.congested_fn = raid5_congested; mddev-queue-backing_dev_info.congested_data = mddev; + mddev-queue-backing_dev_info.congested_fn = raid5_congested; mddev-array_size = mddev-size * (conf-previous_raid_disks - conf-max_degraded); @@ -4301,6 +4301,7 @@ static int stop(mddev_t *mddev) mddev-thread = NULL; shrink_stripes(conf); kfree(conf-stripe_hashtbl); + mddev-queue-backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev-queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(mddev-kobj, raid5_attrs_group); kfree(conf-disks); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 3] md: Convert compile time warnings into runtime warnings.
... still not sure why we need this Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c| 41 +++-- ./drivers/md/raid5.c | 12 ++-- 2 files changed, 41 insertions(+), 12 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-03-23 11:13:41.0 +1100 +++ ./drivers/md/md.c 2007-03-23 12:06:34.0 +1100 @@ -1319,6 +1319,7 @@ static int bind_rdev_to_array(mdk_rdev_t char b[BDEVNAME_SIZE]; struct kobject *ko; char *s; + int err; if (rdev-mddev) { MD_BUG(); @@ -1353,20 +1354,29 @@ static int bind_rdev_to_array(mdk_rdev_t while ( (s=strchr(rdev-kobj.k_name, '/')) != NULL) *s = '!'; - list_add(rdev-same_set, mddev-disks); rdev-mddev = mddev; printk(KERN_INFO md: bind%s\n, b); rdev-kobj.parent = mddev-kobj; - kobject_add(rdev-kobj); + if ((err = kobject_add(rdev-kobj))) + goto fail; if (rdev-bdev-bd_part) ko = rdev-bdev-bd_part-kobj; else ko = rdev-bdev-bd_disk-kobj; - sysfs_create_link(rdev-kobj, ko, block); + if ((err = sysfs_create_link(rdev-kobj, ko, block))) { + kobject_del(rdev-kobj); + goto fail; + } + list_add(rdev-same_set, mddev-disks); bd_claim_by_disk(rdev-bdev, rdev, mddev-gendisk); return 0; + + fail: + printk(KERN_WARNING md: failed to register dev-%s for %s\n, + b, mdname(mddev)); + return err; } static void unbind_rdev_from_array(mdk_rdev_t * rdev) @@ -2966,7 +2976,9 @@ static struct kobject *md_probe(dev_t de mddev-kobj.k_name = NULL; snprintf(mddev-kobj.name, KOBJ_NAME_LEN, %s, md); mddev-kobj.ktype = md_ktype; - kobject_register(mddev-kobj); + if (kobject_register(mddev-kobj)) + printk(KERN_WARNING md: cannot register %s/md - name in use\n, + disk-disk_name); return NULL; } @@ -3144,9 +3156,12 @@ static int do_md_run(mddev_t * mddev) bitmap_destroy(mddev); return err; } - if (mddev-pers-sync_request) - sysfs_create_group(mddev-kobj, md_redundancy_group); - else if (mddev-ro == 2) /* auto-readonly not meaningful */ + if (mddev-pers-sync_request) { + if (sysfs_create_group(mddev-kobj, 
md_redundancy_group)) + printk(KERN_WARNING + md: cannot register extra attributes for %s\n, + mdname(mddev)); + } else if (mddev-ro == 2) /* auto-readonly not meaningful */ mddev-ro = 0; atomic_set(mddev-writes_pending,0); @@ -3160,7 +3175,9 @@ static int do_md_run(mddev_t * mddev) if (rdev-raid_disk = 0) { char nm[20]; sprintf(nm, rd%d, rdev-raid_disk); - sysfs_create_link(mddev-kobj, rdev-kobj, nm); + if (sysfs_create_link(mddev-kobj, rdev-kobj, nm)) + printk(md: cannot register %s for %s\n, + nm, mdname(mddev)); } set_bit(MD_RECOVERY_NEEDED, mddev-recovery); @@ -5388,8 +5405,12 @@ static int remove_and_add_spares(mddev_t if (mddev-pers-hot_add_disk(mddev,rdev)) { char nm[20]; sprintf(nm, rd%d, rdev-raid_disk); - sysfs_create_link(mddev-kobj, - rdev-kobj, nm); + if (sysfs_create_link(mddev-kobj, + rdev-kobj, nm)) + printk(KERN_WARNING + md: cannot register + %s for %s\n, + nm, mdname(mddev)); spares++; md_new_event(mddev); } else diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-03-23 11:13:41.0 +1100 +++ ./drivers/md/raid5.c2007-03-23 12:06:00.0 +1100 @@ -4265,7 +4265,10 @@ static int run(mddev_t *mddev) } /* Ok, everything is just fine now */ - sysfs_create_group(mddev-kobj, raid5_attrs_group); + if (sysfs_create_group(mddev-kobj, raid5_attrs_group)) + printk(KERN_WARNING + raid5: failed to create sysfs attributes for %s\n, +
[PATCH] md: Fix for raid6 reshape.
### Comments for Changeset Recent patch for raid6 reshape had a change missing that showed up in subsequent review. Many places in the raid5 code used conf-raid_disks-1 to mean number of data disks. With raid6 that had to be changed to conf-raid_disk - conf-max_degraded or similar. One place was missed. This bug means that if a raid6 reshape were aborted in the middle the recorded position would be wrong. On restart it would either fail (as the position wasn't on an appropriate boundary) or would leave a section of the array unreshaped, causing data corruption. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-03-02 15:47:51.0 +1100 +++ ./drivers/md/raid5.c2007-03-02 15:48:35.0 +1100 @@ -3071,7 +3071,7 @@ static sector_t reshape_request(mddev_t release_stripe(sh); } spin_lock_irq(conf-device_lock); - conf-expand_progress = (sector_nr + i)*(conf-raid_disks-1); + conf-expand_progress = (sector_nr + i) * new_data_disks); spin_unlock_irq(conf-device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 6] md: Assorted fixes and features for md for 2.6.21
Following 6 patches are against 2.6.20 and are suitable for 2.6.21. They are not against -mm because the new plugging makes raid5 not work and so not testable, and there are a few fairly minor intersections between these patches and those patches. There is also a very minor conflict with the hardware-xor patches - one line of context is different. Patch 1 should probably go in -stable - the bug could cause data corruption in a fairly uncommon raid10 configuration, so that one and this intro are Cc:ed to [EMAIL PROTECTED] Thanks, NeilBrown [PATCH 001 of 6] md: Fix raid10 recovery problem. [PATCH 002 of 6] md: RAID6: clean up CPUID and FPU enter/exit code [PATCH 003 of 6] md: Move warning about creating a raid array on partitions of the one device. [PATCH 004 of 6] md: Clean out unplug and other queue function on md shutdown [PATCH 005 of 6] md: Restart a (raid5) reshape that has been aborted due to a read/write error. [PATCH 006 of 6] md: Add support for reshape of a raid6 - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 6] md: Fix raid10 recovery problem.
There are two errors that can lead to recovery problems with raid10 when used in 'far' more (not the default). Due to a '' instead of '=' the wrong block is located which would result in garbage being written to some random location, quite possible outside the range of the device, causing the newly reconstructed device to fail. The device size calculation had some rounding errors (it didn't round when it should) and so recovery would go a few blocks too far which would again cause a write to a random block address and probably a device error. The code for working with device sizes was fairly confused and spread out, so this has been tided up a bit. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid10.c | 38 -- 1 file changed, 20 insertions(+), 18 deletions(-) diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c --- .prev/drivers/md/raid10.c 2007-02-20 17:10:15.0 +1100 +++ ./drivers/md/raid10.c 2007-02-20 17:11:41.0 +1100 @@ -429,7 +429,7 @@ static sector_t raid10_find_virt(conf_t if (dev 0) dev += conf-raid_disks; } else { - while (sector conf-stride) { + while (sector = conf-stride) { sector -= conf-stride; if (dev conf-near_copies) dev += conf-raid_disks - conf-near_copies; @@ -1801,6 +1801,7 @@ static sector_t sync_request(mddev_t *md for (k=0; kconf-copies; k++) if (r10_bio-devs[k].devnum == i) break; + BUG_ON(k == conf-copies); bio = r10_bio-devs[1].bio; bio-bi_next = biolist; biolist = bio; @@ -2021,19 +2022,30 @@ static int run(mddev_t *mddev) if (!conf-tmppage) goto out_free_conf; + conf-mddev = mddev; + conf-raid_disks = mddev-raid_disks; conf-near_copies = nc; conf-far_copies = fc; conf-copies = nc*fc; conf-far_offset = fo; conf-chunk_mask = (sector_t)(mddev-chunk_size9)-1; conf-chunk_shift = ffz(~mddev-chunk_size) - 9; + size = mddev-size (conf-chunk_shift-1); + sector_div(size, fc); + size = size * conf-raid_disks; + sector_div(size, nc); + /* 'size' is now the number of chunks in the array */ + /* calculate used chunks 
per device in 'stride' */ + stride = size * conf-copies; + sector_div(stride, conf-raid_disks); + mddev-size = stride (conf-chunk_shift-1); + if (fo) - conf-stride = 1 conf-chunk_shift; - else { - stride = mddev-size (conf-chunk_shift-1); + stride = 1; + else sector_div(stride, fc); - conf-stride = stride conf-chunk_shift; - } + conf-stride = stride conf-chunk_shift; + conf-r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (!conf-r10bio_pool) { @@ -2063,8 +2075,6 @@ static int run(mddev_t *mddev) disk-head_position = 0; } - conf-raid_disks = mddev-raid_disks; - conf-mddev = mddev; spin_lock_init(conf-device_lock); INIT_LIST_HEAD(conf-retry_list); @@ -2106,16 +2116,8 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - if (conf-far_offset) { - size = mddev-size (conf-chunk_shift-1); - size *= conf-raid_disks; - size = conf-chunk_shift; - sector_div(size, conf-far_copies); - } else - size = conf-stride * conf-raid_disks; - sector_div(size, conf-near_copies); - mddev-array_size = size/2; - mddev-resync_max_sectors = size; + mddev-array_size = size (conf-chunk_shift-1); + mddev-resync_max_sectors = size conf-chunk_shift; mddev-queue-unplug_fn = raid10_unplug; mddev-queue-issue_flush_fn = raid10_issue_flush; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 6] md: Move warning about creating a raid array on partitions of the one device.
md tries to warn the user if they e.g. create a raid1 using two partitions of the same device, as this does not provide true redundancy. However it also warns if a raid0 is created like this, and there is nothing wrong with that. At the place where the warning is currently printer, we don't necessarily know what level the array will be, so move the warning from the point where the device is added to the point where the array is started. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 63 +++--- 1 file changed, 37 insertions(+), 26 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-02-20 17:10:06.0 +1100 +++ ./drivers/md/md.c 2007-02-20 17:12:16.0 +1100 @@ -1296,27 +1296,17 @@ static struct super_type super_types[] = .sync_super = super_1_sync, }, }; - -static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) -{ - struct list_head *tmp; - mdk_rdev_t *rdev; - - ITERATE_RDEV(mddev,rdev,tmp) - if (rdev-bdev-bd_contains == dev-bdev-bd_contains) - return rdev; - - return NULL; -} static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2; ITERATE_RDEV(mddev1,rdev,tmp) - if (match_dev_unit(mddev2, rdev)) - return 1; + ITERATE_RDEV(mddev2, rdev2, tmp2) + if (rdev-bdev-bd_contains == + rdev2-bdev-bd_contains) + return 1; return 0; } @@ -1325,8 +1315,7 @@ static LIST_HEAD(pending_raid_disks); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { - mdk_rdev_t *same_pdev; - char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + char b[BDEVNAME_SIZE]; struct kobject *ko; char *s; @@ -1342,14 +1331,6 @@ static int bind_rdev_to_array(mdk_rdev_t else mddev-size = rdev-size; } - same_pdev = match_dev_unit(mddev, rdev); - if (same_pdev) - printk(KERN_WARNING - %s: WARNING: %s appears to be on the same physical -disk as %s. 
True\n protection against single-disk -failure might be compromised.\n, - mdname(mddev), bdevname(rdev-bdev,b), - bdevname(same_pdev-bdev,b2)); /* Verify rdev-desc_nr is unique. * If it is -1, assign a free number, else @@ -3109,6 +3090,36 @@ static int do_md_run(mddev_t * mddev) return -EINVAL; } + if (pers-sync_request) { + /* Warn if this is a potentially silly +* configuration. +*/ + char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + mdk_rdev_t *rdev2; + struct list_head *tmp2; + int warned = 0; + ITERATE_RDEV(mddev, rdev, tmp) { + ITERATE_RDEV(mddev, rdev2, tmp2) { + if (rdev rdev2 + rdev-bdev-bd_contains == + rdev2-bdev-bd_contains) { + printk(KERN_WARNING + %s: WARNING: %s appears to be + on the same physical disk as + %s.\n, + mdname(mddev), + bdevname(rdev-bdev,b), + bdevname(rdev2-bdev,b2)); + warned = 1; + } + } + } + if (warned) + printk(KERN_WARNING + True protection against single-disk + failure might be compromised.\n); + } + mddev-recovery = 0; mddev-resync_max_sectors = mddev-size 1; /* may be over-ridden by personality */ mddev-barriers_work = 1; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 6] md: Clean out unplug and other queue functions on md shutdown
The mddev and queue might be used for another array which does not set these, so they need to be cleared. Signed-off-by: NeilBrown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |3 +++ 1 file changed, 3 insertions(+) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-02-20 17:13:54.0 +1100 +++ ./drivers/md/md.c 2007-02-20 17:13:08.0 +1100 @@ -3322,6 +3322,9 @@ static int do_md_stop(mddev_t * mddev, i set_disk_ro(disk, 0); blk_queue_make_request(mddev-queue, md_fail_request); mddev-pers-stop(mddev); + mddev-queue-merge_bvec_fn = NULL; + mddev-queue-unplug_fn = NULL; + mddev-queue-issue_flush_fn = NULL; if (mddev-pers-sync_request) sysfs_remove_group(mddev-kobj, md_redundancy_group); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 6] md: RAID6: clean up CPUID and FPU enter/exit code
From: H. Peter Anvin [EMAIL PROTECTED] - Use kernel_fpu_begin() and kernel_fpu_end() - Use boot_cpu_has() for feature testing even in userspace Signed-off-by: H. Peter Anvin [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid6mmx.c | 16 --- ./drivers/md/raid6sse1.c | 17 --- ./drivers/md/raid6sse2.c | 22 +--- ./drivers/md/raid6x86.h | 218 +++ 4 files changed, 32 insertions(+), 241 deletions(-) diff .prev/drivers/md/raid6mmx.c ./drivers/md/raid6mmx.c --- .prev/drivers/md/raid6mmx.c 2007-02-20 17:11:51.0 +1100 +++ ./drivers/md/raid6mmx.c 2007-02-20 17:11:51.0 +1100 @@ -30,14 +30,8 @@ const struct raid6_mmx_constants { static int raid6_have_mmx(void) { -#ifdef __KERNEL__ /* Not really boot_cpu but all_cpus */ return boot_cpu_has(X86_FEATURE_MMX); -#else - /* User space test code */ - u32 features = cpuid_features(); - return ( (features (123)) == (123) ); -#endif } /* @@ -48,13 +42,12 @@ static void raid6_mmx1_gen_syndrome(int u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; - raid6_mmx_save_t sa; z0 = disks - 3; /* Highest data disk */ p = dptr[z0+1]; /* XOR parity */ q = dptr[z0+2]; /* RS syndrome */ - raid6_before_mmx(sa); + kernel_fpu_begin(); asm volatile(movq %0,%%mm0 : : m (raid6_mmx_constants.x1d)); asm volatile(pxor %mm5,%mm5); /* Zero temp */ @@ -78,7 +71,7 @@ static void raid6_mmx1_gen_syndrome(int asm volatile(pxor %mm4,%mm4); } - raid6_after_mmx(sa); + kernel_fpu_end(); } const struct raid6_calls raid6_mmxx1 = { @@ -96,13 +89,12 @@ static void raid6_mmx2_gen_syndrome(int u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; - raid6_mmx_save_t sa; z0 = disks - 3; /* Highest data disk */ p = dptr[z0+1]; /* XOR parity */ q = dptr[z0+2]; /* RS syndrome */ - raid6_before_mmx(sa); + kernel_fpu_begin(); asm volatile(movq %0,%%mm0 : : m (raid6_mmx_constants.x1d)); asm volatile(pxor %mm5,%mm5); /* Zero temp */ @@ -137,7 +129,7 @@ static void raid6_mmx2_gen_syndrome(int asm volatile(movq %%mm6,%0 : =m (q[d+8])); } - 
raid6_after_mmx(sa); + kernel_fpu_end(); } const struct raid6_calls raid6_mmxx2 = { diff .prev/drivers/md/raid6sse1.c ./drivers/md/raid6sse1.c --- .prev/drivers/md/raid6sse1.c2007-02-20 17:11:51.0 +1100 +++ ./drivers/md/raid6sse1.c2007-02-20 17:11:51.0 +1100 @@ -33,16 +33,10 @@ extern const struct raid6_mmx_constants static int raid6_have_sse1_or_mmxext(void) { -#ifdef __KERNEL__ /* Not really boot_cpu but all_cpus */ return boot_cpu_has(X86_FEATURE_MMX) (boot_cpu_has(X86_FEATURE_XMM) || boot_cpu_has(X86_FEATURE_MMXEXT)); -#else - /* User space test code - this incorrectly breaks on some Athlons */ - u32 features = cpuid_features(); - return ( (features (523)) == (523) ); -#endif } /* @@ -53,14 +47,12 @@ static void raid6_sse11_gen_syndrome(int u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; - raid6_mmx_save_t sa; z0 = disks - 3; /* Highest data disk */ p = dptr[z0+1]; /* XOR parity */ q = dptr[z0+2]; /* RS syndrome */ - /* This is really MMX code, not SSE */ - raid6_before_mmx(sa); + kernel_fpu_begin(); asm volatile(movq %0,%%mm0 : : m (raid6_mmx_constants.x1d)); asm volatile(pxor %mm5,%mm5); /* Zero temp */ @@ -94,8 +86,8 @@ static void raid6_sse11_gen_syndrome(int asm volatile(movntq %%mm4,%0 : =m (q[d])); } - raid6_after_mmx(sa); asm volatile(sfence : : : memory); + kernel_fpu_end(); } const struct raid6_calls raid6_sse1x1 = { @@ -113,13 +105,12 @@ static void raid6_sse12_gen_syndrome(int u8 **dptr = (u8 **)ptrs; u8 *p, *q; int d, z, z0; - raid6_mmx_save_t sa; z0 = disks - 3; /* Highest data disk */ p = dptr[z0+1]; /* XOR parity */ q = dptr[z0+2]; /* RS syndrome */ - raid6_before_mmx(sa); + kernel_fpu_begin(); asm volatile(movq %0,%%mm0 : : m (raid6_mmx_constants.x1d)); asm volatile(pxor %mm5,%mm5); /* Zero temp */ @@ -157,8 +148,8 @@ static void raid6_sse12_gen_syndrome(int asm volatile(movntq %%mm6,%0 : =m (q[d+8])); } - raid6_after_mmx(sa); asm volatile(sfence : :: memory); + kernel_fpu_end(); } const struct raid6_calls raid6_sse1x2 = { diff 
.prev/drivers/md/raid6sse2.c ./drivers/md/raid6sse2.c --- .prev/drivers/md/raid6sse2.c2007-02-20 17:11:51.0 +1100 +++
[PATCH 005 of 6] md: Restart a (raid5) reshape that has been aborted due to a read/write error.
An error always aborts any resync/recovery/reshape on the understanding that it will immediately be restarted if that still makes sense. However a reshape currently doesn't get restarted. With this patch it does. To avoid restarting when it is not possible to do work, we call into the personality to check that a reshape is ok, and strengthen raid5_check_reshape to fail if there are too many failed devices. We also break some code out into a separate function: remove_and_add_spares as the indent level for that code was getting crazy. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c| 74 +++ ./drivers/md/raid5.c |2 + 2 files changed, 47 insertions(+), 29 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-02-20 17:13:08.0 +1100 +++ ./drivers/md/md.c 2007-02-20 17:14:35.0 +1100 @@ -5357,6 +5357,44 @@ void md_do_sync(mddev_t *mddev) EXPORT_SYMBOL_GPL(md_do_sync); +static int remove_and_add_spares(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev-raid_disk = 0 + (test_bit(Faulty, rdev-flags) || +! test_bit(In_sync, rdev-flags)) + atomic_read(rdev-nr_pending)==0) { + if (mddev-pers-hot_remove_disk( + mddev, rdev-raid_disk)==0) { + char nm[20]; + sprintf(nm,rd%d, rdev-raid_disk); + sysfs_remove_link(mddev-kobj, nm); + rdev-raid_disk = -1; + } + } + + if (mddev-degraded) { + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev-raid_disk 0 +!test_bit(Faulty, rdev-flags)) { + rdev-recovery_offset = 0; + if (mddev-pers-hot_add_disk(mddev,rdev)) { + char nm[20]; + sprintf(nm, rd%d, rdev-raid_disk); + sysfs_create_link(mddev-kobj, + rdev-kobj, nm); + spares++; + md_new_event(mddev); + } else + break; + } + } + return spares; +} /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. 
@@ -5411,7 +5449,7 @@ void md_check_recovery(mddev_t *mddev) return; if (mddev_trylock(mddev)) { - int spares =0; + int spares = 0; spin_lock_irq(mddev-write_lock); if (mddev-safemode !atomic_read(mddev-writes_pending) @@ -5474,35 +5512,13 @@ void md_check_recovery(mddev_t *mddev) * Spare are also removed and re-added, to allow * the personality to fail the re-add. */ - ITERATE_RDEV(mddev,rdev,rtmp) - if (rdev-raid_disk = 0 - (test_bit(Faulty, rdev-flags) || ! test_bit(In_sync, rdev-flags)) - atomic_read(rdev-nr_pending)==0) { - if (mddev-pers-hot_remove_disk(mddev, rdev-raid_disk)==0) { - char nm[20]; - sprintf(nm,rd%d, rdev-raid_disk); - sysfs_remove_link(mddev-kobj, nm); - rdev-raid_disk = -1; - } - } - - if (mddev-degraded) { - ITERATE_RDEV(mddev,rdev,rtmp) - if (rdev-raid_disk 0 -!test_bit(Faulty, rdev-flags)) { - rdev-recovery_offset = 0; - if (mddev-pers-hot_add_disk(mddev,rdev)) { - char nm[20]; - sprintf(nm, rd%d, rdev-raid_disk); - sysfs_create_link(mddev-kobj, rdev-kobj, nm); - spares++; - md_new_event(mddev); - } else - break; - } - } - if (spares) { + if (mddev-reshape_position != MaxSector) { + if
[PATCH 006 of 6] md: Add support for reshape of a raid6
i.e. one or more drives can be added and the array will re-stripe while on-line. Most of the interesting work was already done for raid5. This just extends it to raid6. mdadm newer than 2.6 is needed for complete safety, however any version of mdadm which support raid5 reshape will do a good enough job in almost all cases (an 'echo repair /sys/block/mdX/md/sync_action' is recommended after a reshape that was aborted and had to be restarted with an such a version of mdadm). Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 157 --- 1 file changed, 124 insertions(+), 33 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-02-20 17:14:35.0 +1100 +++ ./drivers/md/raid5.c2007-02-20 17:14:48.0 +1100 @@ -1050,7 +1050,7 @@ static void compute_parity5(struct strip static void compute_parity6(struct stripe_head *sh, int method) { raid6_conf_t *conf = sh-raid_conf; - int i, pd_idx = sh-pd_idx, qd_idx, d0_idx, disks = conf-raid_disks, count; + int i, pd_idx = sh-pd_idx, qd_idx, d0_idx, disks = sh-disks, count; struct bio *chosen; / FIX THIS: This could be very bad if disks is close to 256 / void *ptrs[disks]; @@ -1131,8 +1131,7 @@ static void compute_parity6(struct strip /* Compute one missing block */ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { - raid6_conf_t *conf = sh-raid_conf; - int i, count, disks = conf-raid_disks; + int i, count, disks = sh-disks; void *ptr[MAX_XOR_BLOCKS], *p; int pd_idx = sh-pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); @@ -1170,8 +1169,7 @@ static void compute_block_1(struct strip /* Compute two missing blocks */ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { - raid6_conf_t *conf = sh-raid_conf; - int i, count, disks = conf-raid_disks; + int i, count, disks = sh-disks; int pd_idx = sh-pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); int d0_idx = raid6_next_disk(qd_idx, disks); @@ -1887,11 
+1885,11 @@ static void handle_stripe5(struct stripe static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh-raid_conf; - int disks = conf-raid_disks; + int disks = sh-disks; struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num[2] = {0, 0}; @@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe clear_bit(STRIPE_DELAYED, sh-state); syncing = test_bit(STRIPE_SYNCING, sh-state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, sh-state); + expanded = test_bit(STRIPE_EXPAND_READY, sh-state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (to_write failed) || (syncing (uptodate disks))) { + if (to_read || non_overwrite || (to_write failed) || + (syncing (uptodate disks)) || expanding) { for (i=disks; i--;) { dev = sh-dev[i]; if (!test_bit(R5_LOCKED, dev-flags) !test_bit(R5_UPTODATE, dev-flags) (dev-toread || (dev-towrite !test_bit(R5_OVERWRITE, dev-flags)) || syncing || +expanding || (failed = 1 (sh-dev[failed_num[0]].toread || to_write)) || (failed = 2 (sh-dev[failed_num[1]].toread || to_write)) ) @@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe } } } + + if (expanded test_bit(STRIPE_EXPANDING, sh-state)) { + /* Need to write out all blocks after computing PQ */ + sh-disks = conf-raid_disks; + sh-pd_idx = stripe_to_pdidx(sh-sector, conf, +conf-raid_disks); + compute_parity6(sh, RECONSTRUCT_WRITE); + for (i = conf-raid_disks ; i-- ; ) { + set_bit(R5_LOCKED, sh-dev[i].flags); + locked++; + set_bit(R5_Wantwrite, sh-dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, sh-state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, sh-state); +
[PATCH] md: Fix potential memalloc deadlock in md
Another md patch suitable for 2.6.20. Thanks, NeilBrown ### Comments for Changeset If a GFP_KERNEL allocation is attempted in md while the mddev_lock is held, it is possible for a deadlock to eventuate. This happens if the array was marked 'clean', and the memalloc triggers a write-out to the md device. For the writeout to succeed, the array must be marked 'dirty', and that requires getting the mddev_lock. So, before attempting a GFP_KERNEL alloction while holding the lock, make sure the array is marked 'dirty' (unless it is currently read-only). Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 29 + ./drivers/md/raid1.c |2 ++ ./drivers/md/raid5.c |3 +++ ./include/linux/raid/md.h |2 +- 4 files changed, 35 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-01-23 11:23:58.0 +1100 +++ ./drivers/md/md.c 2007-01-25 12:47:58.0 +1100 @@ -3564,6 +3564,8 @@ static int get_bitmap_file(mddev_t * mdd char *ptr, *buf = NULL; int err = -ENOMEM; + md_allow_write(mddev); + file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out; @@ -5032,6 +5034,33 @@ void md_write_end(mddev_t *mddev) } } +/* md_allow_write(mddev) + * Calling this ensures that the array is marked 'active' so that writes + * may proceed without blocking. It is important to call this before + * attempting a GFP_KERNEL allocation while holding the mddev lock. + * Must be called with mddev_lock held. 
+ */ +void md_allow_write(mddev_t *mddev) +{ + if (!mddev-pers) + return; + if (mddev-ro) + return; + + spin_lock_irq(mddev-write_lock); + if (mddev-in_sync) { + mddev-in_sync = 0; + set_bit(MD_CHANGE_CLEAN, mddev-flags); + if (mddev-safemode_delay + mddev-safemode == 0) + mddev-safemode = 1; + spin_unlock_irq(mddev-write_lock); + md_update_sb(mddev, 0); + } else + spin_unlock_irq(mddev-write_lock); +} +EXPORT_SYMBOL_GPL(md_allow_write); + static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-01-23 11:23:43.0 +1100 +++ ./drivers/md/raid1.c2007-01-25 12:09:43.0 +1100 @@ -2050,6 +2050,8 @@ static int raid1_reshape(mddev_t *mddev) return -EINVAL; } + md_allow_write(mddev); + raid_disks = mddev-raid_disks + mddev-delta_disks; if (raid_disks conf-raid_disks) { diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-01-23 11:13:44.0 +1100 +++ ./drivers/md/raid5.c2007-01-25 12:18:04.0 +1100 @@ -399,6 +399,8 @@ static int resize_stripes(raid5_conf_t * if (newsize = conf-pool_size) return 0; /* never bother to shrink */ + md_allow_write(conf-mddev); + /* Step 1 */ sc = kmem_cache_create(conf-cache_name[1-conf-active_name], sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), @@ -3195,6 +3197,7 @@ raid5_store_stripe_cache_size(mddev_t *m else break; } + md_allow_write(mddev); while (new conf-max_nr_stripes) { if (grow_one_stripe(conf)) conf-max_nr_stripes++; diff .prev/include/linux/raid/md.h ./include/linux/raid/md.h --- .prev/include/linux/raid/md.h 2007-01-25 12:16:57.0 +1100 +++ ./include/linux/raid/md.h 2007-01-25 12:17:18.0 +1100 @@ -93,7 +93,7 @@ extern int sync_page_io(struct block_dev struct page *page, int rw); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); - +extern void md_allow_write(mddev_t *mddev); #endif /* CONFIG_MD */ #endif - To unsubscribe from this list: send the line unsubscribe 
linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Remove unnecessary printk when raid5 gets an unaligned read.
One more... (sorry about the dribs-and-drabs approach) NeilBrown ### Comments for Changeset raid5_mergeable_bvec tries to ensure that raid5 never sees a read request that does not fit within just one chunk. However as we must always accept a single-page read, that is not always possible. So when in_chunk_boundary fails, it might be unusual, but it is not a problem and printing a message every time is a bad idea. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2007-01-25 15:57:56.0 +1100 +++ ./drivers/md/raid5.c2007-01-25 15:55:43.0 +1100 @@ -2630,7 +2630,7 @@ static int chunk_aligned_read(request_qu mdk_rdev_t *rdev; if (!in_chunk_boundary(mddev, raid_bio)) { - printk(chunk_aligned_read : non aligned\n); + PRINTK(chunk_aligned_read : non aligned\n); return 0; } /* - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 4] md: Introduction - Assorted bugfixes
Following are 4 patches suitable for inclusion in 2.6.20. Thanks, NeilBrown [PATCH 001 of 4] md: Update email address and status for MD in MAINTAINERS. [PATCH 002 of 4] md: Make 'repair' actually work for raid1. [PATCH 003 of 4] md: Make sure the events count in an md array never returns to zero. [PATCH 004 of 4] md: Avoid reading past the end of a bitmap file. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 4] md: Update email address and status for MD in MAINTAINERS.
Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./MAINTAINERS |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff .prev/MAINTAINERS ./MAINTAINERS --- .prev/MAINTAINERS 2007-01-23 11:14:14.0 +1100 +++ ./MAINTAINERS 2007-01-23 11:23:03.0 +1100 @@ -3011,9 +3011,9 @@ SOFTWARE RAID (Multiple Disks) SUPPORT P: Ingo Molnar M: [EMAIL PROTECTED] P: Neil Brown -M: [EMAIL PROTECTED] +M: [EMAIL PROTECTED] L: linux-raid@vger.kernel.org -S: Maintained +S: Supported SOFTWARE SUSPEND: P: Pavel Machek - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 4] md: Avoid reading past the end of a bitmap file.
In most cases we check the size of the bitmap file before reading data from it. However when reading the superblock, we always read the first PAGE_SIZE bytes, which might not always be appropriate. So limit that read to the size of the file if appropriate. Also, we get the count of available bytes wrong in one place, so that too can read past the end of the file. Cc: yang yin [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/bitmap.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff .prev/drivers/md/bitmap.c ./drivers/md/bitmap.c --- .prev/drivers/md/bitmap.c 2007-01-23 11:13:43.0 +1100 +++ ./drivers/md/bitmap.c 2007-01-23 11:24:09.0 +1100 @@ -479,9 +479,12 @@ static int bitmap_read_sb(struct bitmap int err = -EINVAL; /* page 0 is the superblock, read it... */ - if (bitmap-file) - bitmap-sb_page = read_page(bitmap-file, 0, bitmap, PAGE_SIZE); - else { + if (bitmap-file) { + loff_t isize = i_size_read(bitmap-file-f_mapping-host); + int bytes = isize PAGE_SIZE ? PAGE_SIZE : isize; + + bitmap-sb_page = read_page(bitmap-file, 0, bitmap, bytes); + } else { bitmap-sb_page = read_sb_page(bitmap-mddev, bitmap-offset, 0); } if (IS_ERR(bitmap-sb_page)) { @@ -877,7 +880,8 @@ static int bitmap_init_from_disk(struct int count; /* unmap the old page, we're done with it */ if (index == num_pages-1) - count = bytes - index * PAGE_SIZE; + count = bytes + sizeof(bitmap_super_t) + - index * PAGE_SIZE; else count = PAGE_SIZE; if (index == 0) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 4] md: Make 'repair' actually work for raid1.
When 'repair' finds a block that is different one the various parts of the mirror. it is meant to write a chosen good version to the others. However it currently writes out the original data to each. The memcpy to make all the data the same is missing. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c |5 + 1 file changed, 5 insertions(+) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-01-23 11:13:45.0 +1100 +++ ./drivers/md/raid1.c2007-01-23 11:23:43.0 +1100 @@ -1221,6 +1221,11 @@ static void sync_request_write(mddev_t * sbio-bi_sector = r1_bio-sector + conf-mirrors[i].rdev-data_offset; sbio-bi_bdev = conf-mirrors[i].rdev-bdev; + for (j = 0; j vcnt ; j++) + memcpy(page_address(sbio-bi_io_vec[j].bv_page), + page_address(pbio-bi_io_vec[j].bv_page), + PAGE_SIZE); + } } } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 4] md: Make sure the events count in an md array never returns to zero.
Now that we sometimes step the array events count backwards (when transitioning dirty-clean where nothing else interesting has happened - so that we don't need to write to spares all the time), it is possible for the event count to return to zero, which is potentially confusing and triggers and MD_BUG. We could possibly remove the MD_BUG, but is just as easy, and probably safer, to make sure we never return to zero. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2007-01-23 11:13:44.0 +1100 +++ ./drivers/md/md.c 2007-01-23 11:23:58.0 +1100 @@ -1633,7 +1633,8 @@ repeat: * and 'events' is odd, we can roll back to the previous clean state */ if (nospares (mddev-in_sync mddev-recovery_cp == MaxSector) -(mddev-events 1)) +(mddev-events 1) +mddev-events != 1) mddev-events--; else { /* otherwise we have to go forward and ... */ - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Fix a few problems with the interface (sysfs and ioctl) to md.
Following patch is suitable for 2.6.20. It fixes some minor bugs that need to be fix in order to use new functionality in mdadm-2.6. Thanks, NeilBrown ### Comments for Changeset While developing more functionality in mdadm I found some bugs in md... - When we remove a device from an inactive array (write 'remove' to the 'state' sysfs file - see 'state_store') would should not update the superblock information - as we may not have read and processed it all properly yet. - initialise all raid_disk entries to '-1' else the 'slot sysfs file will claim '0' for all devices in an array before the array is started. - all '\n' not to be present at the end of words written to sysfs files - when we use SET_ARRAY_INFO to set the md metadata version, set the flag to say that there is persistant metadata. - allow GET_BITMAP_FILE to be called on an array that hasn't been started yet. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-12-21 16:29:31.0 +1100 +++ ./drivers/md/md.c 2006-12-21 16:29:40.0 +1100 @@ -1795,7 +1795,8 @@ state_store(mdk_rdev_t *rdev, const char else { mddev_t *mddev = rdev-mddev; kick_rdev_from_array(rdev); - md_update_sb(mddev, 1); + if (mddev-pers) + md_update_sb(mddev, 1); md_new_event(mddev); err = 0; } @@ -2007,6 +2008,7 @@ static mdk_rdev_t *md_import_device(dev_ rdev-desc_nr = -1; rdev-saved_raid_disk = -1; + rdev-raid_disk = -1; rdev-flags = 0; rdev-data_offset = 0; rdev-sb_events = 0; @@ -2236,7 +2238,6 @@ static int update_raid_disks(mddev_t *md static ssize_t raid_disks_store(mddev_t *mddev, const char *buf, size_t len) { - /* can only set raid_disks if array is not yet active */ char *e; int rv = 0; unsigned long n = simple_strtoul(buf, e, 10); @@ -2634,7 +2635,7 @@ metadata_store(mddev_t *mddev, const cha return -EINVAL; buf = e+1; minor = simple_strtoul(buf, e, 10); - if (e==buf || *e 
!= '\n') + if (e==buf || (*e *e != '\n') ) return -EINVAL; if (major = sizeof(super_types)/sizeof(super_types[0]) || super_types[major].name == NULL) @@ -3984,6 +3985,7 @@ static int set_array_info(mddev_t * mdde mddev-major_version = info-major_version; mddev-minor_version = info-minor_version; mddev-patch_version = info-patch_version; + mddev-persistent = ! info-not_persistent; return 0; } mddev-major_version = MD_MAJOR_VERSION; @@ -4309,9 +4311,10 @@ static int md_ioctl(struct inode *inode, * Commands querying/configuring an existing array: */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, -* RUN_ARRAY, and SET_BITMAP_FILE are allowed */ +* RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ if (!mddev-raid_disks cmd != ADD_NEW_DISK cmd != STOP_ARRAY -cmd != RUN_ARRAY cmd != SET_BITMAP_FILE) { +cmd != RUN_ARRAY cmd != SET_BITMAP_FILE +cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] md: Don't assume that READ==0 and WRITE==1 - use the names explicitly.
### Comments for Changeset Thanks Jens for alerting me to this. Cc: Jens Axboe [EMAIL PROTECTED] Cc: [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/faulty.c |2 +- ./drivers/md/raid1.c |2 +- ./drivers/md/raid10.c |6 +++--- ./drivers/md/raid5.c | 20 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff .prev/drivers/md/faulty.c ./drivers/md/faulty.c --- .prev/drivers/md/faulty.c 2006-12-12 09:47:58.0 +1100 +++ ./drivers/md/faulty.c 2006-12-12 09:48:10.0 +1100 @@ -173,7 +173,7 @@ static int make_request(request_queue_t conf_t *conf = (conf_t*)mddev-private; int failit = 0; - if (bio-bi_rw 1) { + if (bio_data_dir(bio) == WRITE) { /* write request */ if (atomic_read(conf-counters[WriteAll])) { /* special case - don't decrement, don't generic_make_request, diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c --- .prev/drivers/md/raid10.c 2006-12-12 09:42:11.0 +1100 +++ ./drivers/md/raid10.c 2006-12-12 09:45:02.0 +1100 @@ -1785,7 +1785,7 @@ static sector_t sync_request(mddev_t *md biolist = bio; bio-bi_private = r10_bio; bio-bi_end_io = end_sync_read; - bio-bi_rw = 0; + bio-bi_rw = READ; bio-bi_sector = r10_bio-devs[j].addr + conf-mirrors[d].rdev-data_offset; bio-bi_bdev = conf-mirrors[d].rdev-bdev; @@ -1801,7 +1801,7 @@ static sector_t sync_request(mddev_t *md biolist = bio; bio-bi_private = r10_bio; bio-bi_end_io = end_sync_write; - bio-bi_rw = 1; + bio-bi_rw = WRITE; bio-bi_sector = r10_bio-devs[k].addr + conf-mirrors[i].rdev-data_offset; bio-bi_bdev = conf-mirrors[i].rdev-bdev; @@ -1870,7 +1870,7 @@ static sector_t sync_request(mddev_t *md biolist = bio; bio-bi_private = r10_bio; bio-bi_end_io = end_sync_read; - bio-bi_rw = 0; + bio-bi_rw = READ; bio-bi_sector = r10_bio-devs[i].addr + conf-mirrors[d].rdev-data_offset; bio-bi_bdev = conf-mirrors[d].rdev-bdev; diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2006-12-08 12:07:39.0 +1100 +++ ./drivers/md/raid1.c2006-12-12 09:45:10.0 +1100 
@@ -1736,7 +1736,7 @@ static sector_t sync_request(mddev_t *md /* take from bio_init */ bio-bi_next = NULL; bio-bi_flags |= 1 BIO_UPTODATE; - bio-bi_rw = 0; + bio-bi_rw = READ; bio-bi_vcnt = 0; bio-bi_idx = 0; bio-bi_phys_segments = 0; diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-12-11 09:54:43.0 +1100 +++ ./drivers/md/raid5.c2006-12-12 09:49:53.0 +1100 @@ -1827,16 +1827,16 @@ static void handle_stripe5(struct stripe struct bio *bi; mdk_rdev_t *rdev; if (test_and_clear_bit(R5_Wantwrite, sh-dev[i].flags)) - rw = 1; + rw = WRITE; else if (test_and_clear_bit(R5_Wantread, sh-dev[i].flags)) - rw = 0; + rw = READ; else continue; bi = sh-dev[i].req; bi-bi_rw = rw; - if (rw) + if (rw == WRITE) bi-bi_end_io = raid5_end_write_request; else bi-bi_end_io = raid5_end_read_request; @@ -1872,7 +1872,7 @@ static void handle_stripe5(struct stripe atomic_add(STRIPE_SECTORS, rdev-corrected_errors); generic_make_request(bi); } else { - if (rw == 1) + if (rw == WRITE) set_bit(STRIPE_DEGRADED, sh-state); PRINTK(skip op %ld on disc %d for sector %llu\n, bi-bi_rw, i, (unsigned long long)sh-sector);
[PATCH 000 of 5] md: Assorted minor fixes for mainline
Following are 5 patches for md in 2.6.19-rc6-mm2 that are suitable for 2.6.20. Patch 4 might fix an outstanding bug against md which manifests as an oops early in boot, but I don't have test results yet. NeilBrown [PATCH 001 of 5] md: Remove some old ifdefed-out code from raid5.c [PATCH 002 of 5] md: Return a non-zero error to bi_end_io as appropriate in raid5. [PATCH 003 of 5] md: Assorted md and raid1 one-liners [PATCH 004 of 5] md: Close a race between destroying and recreating an md device. [PATCH 005 of 5] md: Allow mddevs to live a bit longer to avoid a loop with udev. - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 5] md: Close a race between destroying and recreating an md device.
For each md device, we need a gendisk. As that gendisk has a name that gets registered in sysfs, we need to make sure that when an md device is shut down, we don't create it again until the shutdown is complete and the gendisk has been deleted. This patch utilises the disks_mutex to ensure the proper exclusion. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-12-07 15:45:31.0 +1100 +++ ./drivers/md/md.c 2006-12-07 21:01:11.0 +1100 @@ -222,18 +222,36 @@ static inline mddev_t *mddev_get(mddev_t return mddev; } +static DEFINE_MUTEX(disks_mutex); static void mddev_put(mddev_t *mddev) { + /* We need to hold disks_mutex to safely destroy the gendisk +* info before someone else creates a new gendisk with the same +* name, but we don't want to take that mutex just to decrement +* the -active counter. So we first test if this is the last +* reference. If it is, we put things back as they were found +* and take disks_mutex before trying again. 
+*/ if (!atomic_dec_and_lock(mddev-active, all_mddevs_lock)) return; + atomic_inc(mddev-active); + spin_unlock(all_mddevs_lock); + + mutex_lock(disks_mutex); + + if (!atomic_dec_and_lock(mddev-active, all_mddevs_lock)) { + mutex_unlock(disks_mutex); + return; + } list_del(mddev-all_mddevs); spin_unlock(all_mddevs_lock); - del_gendisk(mddev-gendisk); - mddev-gendisk = NULL; + if (mddev-gendisk) + del_gendisk(mddev-gendisk); blk_cleanup_queue(mddev-queue); - mddev-queue = NULL; kobject_unregister(mddev-kobj); + + mutex_unlock(disks_mutex); } static mddev_t * mddev_find(dev_t unit) @@ -2948,7 +2966,6 @@ int mdp_major = 0; static struct kobject *md_probe(dev_t dev, int *part, void *data) { - static DEFINE_MUTEX(disks_mutex); mddev_t *mddev = mddev_find(dev); struct gendisk *disk; int partitioned = (MAJOR(dev) != MD_MAJOR); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 5] md: Assorted md and raid1 one-liners
Fix few bugs that meant that: - superblocks weren't alway written at exactly the right time (this could show up if the array was not written to - writting to the array causes lots of superblock updates and so hides these errors). - restarting device recovery after a clean shutdown (version-1 metadata only) didn't work as intended (or at all). 1/ Ensure superblock is updated when a new device is added. 2/ Remove an inappropriate test on MD_RECOVERY_SYNC in md_do_sync. The body of this if takes one of two branches depending on whether MD_RECOVERY_SYNC is set, so testing it in the clause of the if is wrong. 3/ Flag superblock for updating after a resync/recovery finishes. 4/ If we find the neeed to restart a recovery in the middle (version-1 metadata only) make sure a full recovery (not just as guided by bitmaps) does get done. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c|3 ++- ./drivers/md/raid1.c |1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-12-07 15:33:40.0 +1100 +++ ./drivers/md/md.c 2006-12-07 15:44:53.0 +1100 @@ -3729,6 +3729,7 @@ static int add_new_disk(mddev_t * mddev, if (err) export_rdev(rdev); + md_update_sb(mddev, 1); set_bit(MD_RECOVERY_NEEDED, mddev-recovery); md_wakeup_thread(mddev-thread); return err; @@ -5289,7 +5290,6 @@ void md_do_sync(mddev_t *mddev) mddev-pers-sync_request(mddev, max_sectors, skipped, 1); if (!test_bit(MD_RECOVERY_ERR, mddev-recovery) - test_bit(MD_RECOVERY_SYNC, mddev-recovery) !test_bit(MD_RECOVERY_CHECK, mddev-recovery) mddev-curr_resync 2) { if (test_bit(MD_RECOVERY_SYNC, mddev-recovery)) { @@ -5313,6 +5313,7 @@ void md_do_sync(mddev_t *mddev) rdev-recovery_offset = mddev-curr_resync; } } + set_bit(MD_CHANGE_DEVS, mddev-flags); skip: mddev-curr_resync = 0; diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2006-12-07 15:33:40.0 +1100 +++ ./drivers/md/raid1.c2006-12-07 15:44:53.0 
+1100 @@ -1951,6 +1951,7 @@ static int run(mddev_t *mddev) !test_bit(In_sync, disk-rdev-flags)) { disk-head_position = 0; mddev-degraded++; + conf-fullsync = 1; } } if (mddev-degraded == conf-raid_disks) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 5] md: Return a non-zero error to bi_end_io as appropriate in raid5.
Currently raid5 depends on clearing the BIO_UPTODATE flag to signal an error to higher levels. While this should be sufficient, it is safer to explicitly set the error code as well - less room for confusion. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-12-07 15:33:40.0 +1100 +++ ./drivers/md/raid5.c2006-12-07 15:44:41.0 +1100 @@ -1818,7 +1818,9 @@ static void handle_stripe5(struct stripe return_bi = bi-bi_next; bi-bi_next = NULL; bi-bi_size = 0; - bi-bi_end_io(bi, bytes, 0); + bi-bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, bi-bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2359,7 +2361,9 @@ static void handle_stripe6(struct stripe return_bi = bi-bi_next; bi-bi_next = NULL; bi-bi_size = 0; - bi-bi_end_io(bi, bytes, 0); + bi-bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, bi-bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2859,7 +2863,9 @@ static int make_request(request_queue_t if ( rw == WRITE ) md_write_end(mddev); bi-bi_size = 0; - bi-bi_end_io(bi, bytes, 0); + bi-bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, bi-bi_flags) + ? 0 : -EIO); } return 0; } @@ -3127,7 +3133,9 @@ static int retry_aligned_read(raid5_con int bytes = raid_bio-bi_size; raid_bio-bi_size = 0; - raid_bio-bi_end_io(raid_bio, bytes, 0); + raid_bio-bi_end_io(raid_bio, bytes, + test_bit(BIO_UPTODATE, raid_bio-bi_flags) + ? 0 : -EIO); } if (atomic_dec_and_test(conf-active_aligned_reads)) wake_up(conf-wait_for_stripe); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 4] md: Fix innocuous bug in raid6 stripe_to_pdidx
stripe_to_pdidx finds the index of the parity disk for a given stripe. It assumes raid5 in that it uses disks-1 to determine the number of data disks. This is incorrect for raid6 but fortunately the two usages cancel each other out. The only way that 'data_disks' affects the calculation of pd_idx in raid5_compute_sector is when it is divided into the sector number. But as that sector number is calculated by multiplying in the wrong value of 'data_disks' the division produces the right value. So it is innocuous but needs to be fixed. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-14 10:05:00.0 +1100 +++ ./drivers/md/raid5.c2006-11-14 10:33:41.0 +1100 @@ -1355,8 +1355,10 @@ static int stripe_to_pdidx(sector_t stri int pd_idx, dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); - raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk -+ chunk_offset, disks, disks-1, dd_idx, pd_idx, conf); + raid5_compute_sector(stripe * (disks - conf-max_degraded) +*sectors_per_chunk + chunk_offset, +disks, disks - conf-max_degraded, +dd_idx, pd_idx, conf); return pd_idx; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 4] md: Misc fixes for aligned-read handling.
1/ When aligned requests fail (read error) they need to be retried via the normal method (stripe cache). As we cannot be sure that we can process a single read in one go (we may not be able to allocate all the stripes needed) we store a bio-being-retried and a list of bioes-that-still-need-to-be-retried. When find a bio that needs to be retried, we should add it to the list, not to single-bio... 2/ The cloned bio is being used-after-free (to test BIO_UPTODATE). 3/ We forgot to add rdev-data_offset when submitting a bio for aligned-read 4/ clone_bio calls blk_recount_segments and then we change bi_bdev, so we need to invalidate the segment counts. 5/ We were never incrementing 'scnt' when resubmitting failed aligned requests. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-14 10:34:17.0 +1100 +++ ./drivers/md/raid5.c2006-11-14 10:34:33.0 +1100 @@ -2658,8 +2658,8 @@ static void add_bio_to_retry(struct bio spin_lock_irqsave(conf-device_lock, flags); - bi-bi_next = conf-retry_read_aligned; - conf-retry_read_aligned = bi; + bi-bi_next = conf-retry_read_aligned_list; + conf-retry_read_aligned_list = bi; spin_unlock_irqrestore(conf-device_lock, flags); md_wakeup_thread(conf-mddev-thread); @@ -2698,6 +2698,7 @@ static int raid5_align_endio(struct bio struct bio* raid_bi = bi-bi_private; mddev_t *mddev; raid5_conf_t *conf; + int uptodate = test_bit(BIO_UPTODATE, bi-bi_flags); if (bi-bi_size) return 1; @@ -2706,7 +2707,7 @@ static int raid5_align_endio(struct bio mddev = raid_bi-bi_bdev-bd_disk-queue-queuedata; conf = mddev_to_conf(mddev); - if (!error test_bit(BIO_UPTODATE, bi-bi_flags)) { + if (!error uptodate) { bio_endio(raid_bi, bytes, 0); if (atomic_dec_and_test(conf-active_aligned_reads)) wake_up(conf-wait_for_stripe); @@ -2759,9 +2760,11 @@ static int chunk_aligned_read(request_qu 
rcu_read_lock(); rdev = rcu_dereference(conf-disks[dd_idx].rdev); if (rdev test_bit(In_sync, rdev-flags)) { - align_bi-bi_bdev = rdev-bdev; atomic_inc(rdev-nr_pending); rcu_read_unlock(); + align_bi-bi_bdev = rdev-bdev; + align_bi-bi_flags = ~(1 BIO_SEG_VALID); + align_bi-bi_sector += rdev-data_offset; spin_lock_irq(conf-device_lock); wait_event_lock_irq(conf-wait_for_stripe, @@ -3151,7 +3154,8 @@ static int retry_aligned_read(raid5_con conf); last_sector = raid_bio-bi_sector + (raid_bio-bi_size9); - for (; logical_sector last_sector; logical_sector += STRIPE_SECTORS) { + for (; logical_sector last_sector; +logical_sector += STRIPE_SECTORS, scnt++) { if (scnt raid_bio-bi_hw_segments) /* already done this stripe */ - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 4] md: Fix a couple more bugs in raid5/6 aligned reads
1/ We don't de-reference the rdev when the read completes. This means we need to record the rdev so it is still available in the end_io routine. Fortunately bi_next in the original bio is unused at this point so we can stuff it in there. 2/ We leak a cloned bio if the target rdev is not usable. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |7 +++ 1 file changed, 7 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-14 11:00:51.0 +1100 +++ ./drivers/md/raid5.c2006-11-14 11:06:44.0 +1100 @@ -2699,6 +2699,7 @@ static int raid5_align_endio(struct bio mddev_t *mddev; raid5_conf_t *conf; int uptodate = test_bit(BIO_UPTODATE, bi-bi_flags); + mdk_rdev_t *rdev; if (bi-bi_size) return 1; @@ -2706,6 +2707,10 @@ static int raid5_align_endio(struct bio mddev = raid_bi-bi_bdev-bd_disk-queue-queuedata; conf = mddev_to_conf(mddev); + rdev = (void*)raid_bi-bi_next; + raid_bi-bi_next = NULL; + + rdev_dec_pending(rdev, conf-mddev); if (!error uptodate) { bio_endio(raid_bi, bytes, 0); @@ -2762,6 +2767,7 @@ static int chunk_aligned_read(request_qu if (rdev test_bit(In_sync, rdev-flags)) { atomic_inc(rdev-nr_pending); rcu_read_unlock(); + raid_bio-bi_next = (void*)rdev; align_bi-bi_bdev = rdev-bdev; align_bi-bi_flags = ~(1 BIO_SEG_VALID); align_bi-bi_sector += rdev-data_offset; @@ -2777,6 +2783,7 @@ static int chunk_aligned_read(request_qu return 1; } else { rcu_read_unlock(); + bio_put(align_bi); return 0; } } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 9] md: Fix sizing problem with raid5-reshape and CONFIG_LBD=n
I forgot to cast the size-in-blocks to (loff_t) before shifting up to a size-in-bytes. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-06 11:21:24.0 +1100 +++ ./drivers/md/raid5.c2006-11-06 11:28:51.0 +1100 @@ -3659,7 +3659,7 @@ static void end_reshape(raid5_conf_t *co bdev = bdget_disk(conf-mddev-gendisk, 0); if (bdev) { mutex_lock(bdev-bd_inode-i_mutex); - i_size_write(bdev-bd_inode, conf-mddev-array_size 10); + i_size_write(bdev-bd_inode, (loff_t)conf-mddev-array_size 10); mutex_unlock(bdev-bd_inode-i_mutex); bdput(bdev); } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 008 of 9] md: Allow reads that have bypassed the cache to be retried on failure.
From: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] If a bypass-the-cache read fails, we simply try again through the cache. If it fails again it will trigger normal recovery precedures. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 150 ++- ./include/linux/raid/raid5.h |3 2 files changed, 150 insertions(+), 3 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-06 11:29:13.0 +1100 +++ ./drivers/md/raid5.c2006-11-06 11:29:14.0 +1100 @@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_ if (!test_bit(STRIPE_EXPANDING, sh-state)) { list_add_tail(sh-lru, conf-inactive_list); wake_up(conf-wait_for_stripe); + if (conf-retry_read_aligned) + md_wakeup_thread(conf-mddev-thread); } } } @@ -2645,18 +2647,74 @@ static int in_chunk_boundary(mddev_t *md } /* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +{ + unsigned long flags; + + spin_lock_irqsave(conf-device_lock, flags); + + bi-bi_next = conf-retry_read_aligned; + conf-retry_read_aligned = bi; + + spin_unlock_irqrestore(conf-device_lock, flags); + md_wakeup_thread(conf-mddev-thread); +} + + +static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +{ + struct bio *bi; + + bi = conf-retry_read_aligned; + if (bi) { + conf-retry_read_aligned = NULL; + return bi; + } + bi = conf-retry_read_aligned_list; + if(bi) { + conf-retry_read_aligned = bi-bi_next; + bi-bi_next = NULL; + bi-bi_phys_segments = 1; /* biased count of active stripes */ + bi-bi_hw_segments = 0; /* count of processed stripes */ + } + + return bi; +} + + +/* * The raid5_align_endio should check if the read succeeded and if it * did, call bio_endio on the original bio (having bio_put the new bio * first). * If the read failed.. 
*/ -int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) +int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) { struct bio* raid_bi = bi-bi_private; + mddev_t *mddev; + raid5_conf_t *conf; + if (bi-bi_size) return 1; bio_put(bi); - bio_endio(raid_bi, bytes, error); + + mddev = raid_bi-bi_bdev-bd_disk-queue-queuedata; + conf = mddev_to_conf(mddev); + + if (!error test_bit(BIO_UPTODATE, bi-bi_flags)) { + bio_endio(raid_bi, bytes, 0); + if (atomic_dec_and_test(conf-active_aligned_reads)) + wake_up(conf-wait_for_stripe); + return 0; + } + + + PRINTK(raid5_align_endio : io error...handing IO for a retry\n); + + add_bio_to_retry(raid_bi, conf); return 0; } @@ -2702,6 +2760,14 @@ static int chunk_aligned_read(request_qu align_bi-bi_bdev = rdev-bdev; atomic_inc(rdev-nr_pending); rcu_read_unlock(); + + spin_lock_irq(conf-device_lock); + wait_event_lock_irq(conf-wait_for_stripe, + conf-quiesce == 0, + conf-device_lock, /* nothing */); + atomic_inc(conf-active_aligned_reads); + spin_unlock_irq(conf-device_lock); + generic_make_request(align_bi); return 1; } else { @@ -3050,6 +3116,71 @@ static inline sector_t sync_request(mdde return STRIPE_SECTORS; } +static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there +* may not be enough stripe_heads available. +* We cannot pre-allocate enough stripe_heads as we may need +* more than exist in the cache (if we allow ever large chunks). +* So we do one stripe head at a time and record in +* -bi_hw_segments how many have been done. +* +* We *know* that this entire raid_bio is in one chunk, so +* it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
+*/ + struct stripe_head *sh; + int dd_idx, pd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio-bi_sector ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector( logical_sector, + conf-raid_disks, +
[PATCH 005 of 9] md: Change lifetime rules for 'md' devices.
Currently md devices are created when first opened and remain in existence until the module is unloaded. This isn't a major problem, but it is somewhat ugly. This patch changes the lifetime rules so that an md device will disappear on the last close if it has no state. Locking rules depend on bd_mutex being held in do_open and __blkdev_put, and on setting bd_disk-private_data to 'mddev'. There is room for a race because md_probe is called early in do_open (get_gendisk) to create the mddev. As this isn't protected by bd_mutex, a concurrent call to md_close can destroy that mddev before do_open calls md_open to get a reference on it. md_open and md_close are serialised by md_mutex so the worst that can happen is that md_open finds that the mddev structure doesn't exist after all. In this case bd_disk-private_data will be NULL, and md_open chooses to exit with -EBUSY in this case, which is arguably an appropriate result. The new 'dead' field in mddev is used to track whether it is time to destroy the mddev (if a last-close happens). It is cleared when any state is created (set_array_info) and set when the array is stopped (do_md_stop). mddev_put becomes simpler. It just destroys the mddev when the refcount hits zero. This will normally be the reference held in bd_disk-private_data. 
Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 35 +-- ./include/linux/raid/md_k.h |3 +++ 2 files changed, 28 insertions(+), 10 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-11-06 11:29:12.0 +1100 +++ ./drivers/md/md.c 2006-11-06 11:29:13.0 +1100 @@ -226,13 +226,14 @@ static void mddev_put(mddev_t *mddev) { if (!atomic_dec_and_lock(mddev-active, all_mddevs_lock)) return; - if (!mddev-raid_disks list_empty(mddev-disks)) { - list_del(mddev-all_mddevs); - spin_unlock(all_mddevs_lock); - blk_cleanup_queue(mddev-queue); - kobject_unregister(mddev-kobj); - } else - spin_unlock(all_mddevs_lock); + list_del(mddev-all_mddevs); + spin_unlock(all_mddevs_lock); + + del_gendisk(mddev-gendisk); + mddev-gendisk = NULL; + blk_cleanup_queue(mddev-queue); + mddev-queue = NULL; + kobject_unregister(mddev-kobj); } static mddev_t * mddev_find(dev_t unit) @@ -273,6 +274,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(new-active, 1); spin_lock_init(new-write_lock); init_waitqueue_head(new-sb_wait); + new-dead = 1; new-queue = blk_alloc_queue(GFP_KERNEL); if (!new-queue) { @@ -3360,6 +3362,8 @@ static int do_md_stop(mddev_t * mddev, i mddev-array_size = 0; mddev-size = 0; mddev-raid_disks = 0; + mddev-dead = 1; + mddev-recovery_cp = 0; } else if (mddev-pers) @@ -4292,7 +4296,8 @@ static int md_ioctl(struct inode *inode, printk(KERN_WARNING md: couldn't set array info. %d\n, err); goto abort_unlock; - } + } else + mddev-dead = 0; } goto done_unlock; @@ -4376,6 +4381,8 @@ static int md_ioctl(struct inode *inode, err = -EFAULT; else err = add_new_disk(mddev, info); + if (!err) + mddev-dead = 0; goto done_unlock; } @@ -4422,8 +4429,12 @@ static int md_open(struct inode *inode, * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. 
*/ - mddev_t *mddev = inode-i_bdev-bd_disk-private_data; - int err; + mddev_t *mddev; + int err = -EBUSY; + + mddev = inode-i_bdev-bd_disk-private_data; + if (!mddev) + goto out; if ((err = mutex_lock_interruptible_nested(mddev-reconfig_mutex, 1))) goto out; @@ -4442,6 +4453,10 @@ static int md_release(struct inode *inod mddev_t *mddev = inode-i_bdev-bd_disk-private_data; BUG_ON(!mddev); + if (inode-i_bdev-bd_openers == 0 mddev-dead) { + inode-i_bdev-bd_disk-private_data = NULL; + mddev_put(mddev); + } mddev_put(mddev); return 0; diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2006-11-06 11:21:24.0 +1100 +++ ./include/linux/raid/md_k.h 2006-11-06 11:29:13.0 +1100 @@ -119,6 +119,9 @@ struct mddev_s #define MD_CHANGE_PENDING 2/*
[PATCH 007 of 9] md: Handle bypassing the read cache (assuming nothing fails).
From: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 78 +++ 1 file changed, 78 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-06 11:29:13.0 +1100 +++ ./drivers/md/raid5.c2006-11-06 11:29:13.0 +1100 @@ -2633,6 +2633,84 @@ static int raid5_mergeable_bvec(request_ return max; } + +static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +{ + sector_t sector = bio-bi_sector + get_start_sect(bio-bi_bdev); + unsigned int chunk_sectors = mddev-chunk_size 9; + unsigned int bio_sectors = bio-bi_size 9; + + return chunk_sectors = + ((sector (chunk_sectors - 1)) + bio_sectors); +} + +/* + * The raid5_align_endio should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) +{ + struct bio* raid_bi = bi-bi_private; + if (bi-bi_size) + return 1; + bio_put(bi); + bio_endio(raid_bi, bytes, error); + return 0; +} + +static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) +{ + mddev_t *mddev = q-queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + const unsigned int raid_disks = conf-raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + struct bio* align_bi; + mdk_rdev_t *rdev; + + if (!in_chunk_boundary(mddev, raid_bio)) { + printk(chunk_aligned_read : non aligned\n); + return 0; + } + /* +* use bio_clone to make a copy of the bio +*/ + align_bi = bio_clone(raid_bio, GFP_NOIO); + if (!align_bi) + return 0; + /* +* set bi_end_io to a new function, and set bi_private to the +* original bio. 
+*/ + align_bi-bi_end_io = raid5_align_endio; + align_bi-bi_private = raid_bio; + /* +* compute position +*/ + align_bi-bi_sector = raid5_compute_sector(raid_bio-bi_sector, + raid_disks, + data_disks, + dd_idx, + pd_idx, + conf); + + rcu_read_lock(); + rdev = rcu_dereference(conf-disks[dd_idx].rdev); + if (rdev test_bit(In_sync, rdev-flags)) { + align_bi-bi_bdev = rdev-bdev; + atomic_inc(rdev-nr_pending); + rcu_read_unlock(); + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + return 0; + } +} + + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q-queuedata; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 006 of 9] md: Define raid5_mergeable_bvec
From: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] This will encourage read request to be on only one device, so we will often be able to bypass the cache for read requests. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 24 1 file changed, 24 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-06 11:28:51.0 +1100 +++ ./drivers/md/raid5.c2006-11-06 11:29:13.0 +1100 @@ -2611,6 +2611,28 @@ static int raid5_congested(void *data, i return 0; } +/* We want read requests to align with chunks where possible, + * but write requests don't need to. + */ +static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ + mddev_t *mddev = q-queuedata; + sector_t sector = bio-bi_sector + get_start_sect(bio-bi_bdev); + int max; + unsigned int chunk_sectors = mddev-chunk_size 9; + unsigned int bio_sectors = bio-bi_size 9; + + if (bio_data_dir(bio)) + return biovec-bv_len; /* always allow writes to be mergeable */ + + max = (chunk_sectors - ((sector (chunk_sectors - 1)) + bio_sectors)) 9; + if (max 0) max = 0; + if (max = biovec-bv_len bio_sectors == 0) + return biovec-bv_len; + else + return max; +} + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q-queuedata; @@ -3320,6 +3342,8 @@ static int run(mddev_t *mddev) mddev-array_size = mddev-size * (conf-previous_raid_disks - conf-max_degraded); + blk_queue_merge_bvec(mddev-queue, raid5_mergeable_bvec); + return 0; abort: if (conf) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 9] md: Tidy up device-change notification when an md array is stopped
An md array can be stopped leaving all the setting still in place, or it can torn down and destroyed. set_capacity and other change notifications only happen in the latter case, but should happen in both. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-11-06 11:29:00.0 +1100 +++ ./drivers/md/md.c 2006-11-06 11:29:12.0 +1100 @@ -3314,6 +3314,10 @@ static int do_md_stop(mddev_t * mddev, i module_put(mddev-pers-owner); mddev-pers = NULL; + + set_capacity(disk, 0); + mddev-changed = 1; + if (mddev-ro) mddev-ro = 0; } @@ -,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, i if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; - struct gendisk *disk; + printk(KERN_INFO md: %s stopped.\n, mdname(mddev)); bitmap_destroy(mddev); @@ -3358,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, i mddev-raid_disks = 0; mddev-recovery_cp = 0; - disk = mddev-gendisk; - if (disk) - set_capacity(disk, 0); - mddev-changed = 1; } else if (mddev-pers) printk(KERN_INFO md: %s switched to read-only mode.\n, mdname(mddev)); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 9] md: Change ONLINE/OFFLINE events to a single CHANGE event
It turns out that CHANGE is preferred to ONLINE/OFFLINE for various reasons (not least of which being that udev understands it already). So remove the recently added KOBJ_OFFLINE (no-one is likely to care anyway) and change the ONLINE to a CHANGE event Cc: Kay Sievers [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-11-06 11:21:25.0 +1100 +++ ./drivers/md/md.c 2006-11-06 11:22:14.0 +1100 @@ -3200,7 +3200,7 @@ static int do_md_run(mddev_t * mddev) mddev-changed = 1; md_new_event(mddev); - kobject_uevent(mddev-gendisk-kobj, KOBJ_ONLINE); + kobject_uevent(mddev-gendisk-kobj, KOBJ_CHANGE); return 0; } @@ -3314,7 +3314,6 @@ static int do_md_stop(mddev_t * mddev, i module_put(mddev-pers-owner); mddev-pers = NULL; - kobject_uevent(mddev-gendisk-kobj, KOBJ_OFFLINE); if (mddev-ro) mddev-ro = 0; } - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 009 of 9] md: Enable bypassing cache for reads.
From: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Call the chunk_aligned_read where appropriate. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |5 + 1 file changed, 5 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-11-06 11:29:14.0 +1100 +++ ./drivers/md/raid5.c2006-11-06 11:29:14.0 +1100 @@ -2798,6 +2798,11 @@ static int make_request(request_queue_t disk_stat_inc(mddev-gendisk, ios[rw]); disk_stat_add(mddev-gendisk, sectors[rw], bio_sectors(bi)); + if ( bio_data_dir(bi) == READ +mddev-reshape_position == MaxSector +chunk_aligned_read(q,bi)) + return 0; + logical_sector = bi-bi_sector ~((sector_t)STRIPE_SECTORS-1); last_sector = bi-bi_sector + (bi-bi_size9); bi-bi_next = NULL; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 9] md: Do not freeze md threads for suspend.
From: Rafael J. Wysocki [EMAIL PROTECTED] If there's a swap file on a software RAID, it should be possible to use this file for saving the swsusp's suspend image. Also, this file should be available to the memory management subsystem when memory is being freed before the suspend image is created. For the above reasons it seems that md_threads should not be frozen during the suspend and the appended patch makes this happen, but then there is the question if they don't cause any data to be written to disks after the suspend image has been created, provided that all filesystems are frozen at that time. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-11-06 11:28:44.0 +1100 +++ ./drivers/md/md.c 2006-11-06 11:29:00.0 +1100 @@ -4488,6 +4488,7 @@ static int md_thread(void * arg) * many dirty RAID5 blocks. */ + current-flags |= PF_NOFREEZE; allow_signal(SIGKILL); while (!kthread_should_stop()) { @@ -4504,7 +4505,6 @@ static int md_thread(void * arg) test_bit(THREAD_WAKEUP, thread-flags) || kthread_should_stop(), thread-timeout); - try_to_freeze(); clear_bit(THREAD_WAKEUP, thread-flags); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 6] md: Change lifetime rules for 'md' devices.
Currently md devices are created when first opened and remain in existence until the module is unloaded. This isn't a major problem, but it is somewhat ugly. This patch changes the lifetime rules so that an md device will disappear on the last close if it has no state. Locking rules depend on bd_mutex being held in do_open and __blkdev_put, and on setting bd_disk->private_data to 'mddev'. There is room for a race because md_probe is called early in do_open (get_gendisk) to create the mddev. As this isn't protected by bd_mutex, a concurrent call to md_close can destroy that mddev before do_open calls md_open to get a reference on it. md_open and md_close are serialised by md_mutex so the worst that can happen is that md_open finds that the mddev structure doesn't exist after all. In this case bd_disk->private_data will be NULL, and md_open chooses to exit with -EBUSY in this case, which is arguably an appropriate result. The new 'dead' field in mddev is used to track whether it is time to destroy the mddev (if a last-close happens). It is cleared when any state is created (set_array_info) and set when the array is stopped (do_md_stop). mddev_put becomes simpler. It just destroys the mddev when the refcount hits zero. This will normally be the reference held in bd_disk->private_data. 
cc: [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c | 35 +-- ./include/linux/raid/md_k.h |3 +++ 2 files changed, 28 insertions(+), 10 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-10-31 16:41:02.0 +1100 +++ ./drivers/md/md.c 2006-10-31 16:41:14.0 +1100 @@ -226,13 +226,14 @@ static void mddev_put(mddev_t *mddev) { if (!atomic_dec_and_lock(mddev-active, all_mddevs_lock)) return; - if (!mddev-raid_disks list_empty(mddev-disks)) { - list_del(mddev-all_mddevs); - spin_unlock(all_mddevs_lock); - blk_cleanup_queue(mddev-queue); - kobject_unregister(mddev-kobj); - } else - spin_unlock(all_mddevs_lock); + list_del(mddev-all_mddevs); + spin_unlock(all_mddevs_lock); + + blk_cleanup_queue(mddev-queue); + mddev-queue = NULL; + del_gendisk(mddev-gendisk); + mddev-gendisk = NULL; + kobject_unregister(mddev-kobj); } static mddev_t * mddev_find(dev_t unit) @@ -273,6 +274,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(new-active, 1); spin_lock_init(new-write_lock); init_waitqueue_head(new-sb_wait); + new-dead = 1; new-queue = blk_alloc_queue(GFP_KERNEL); if (!new-queue) { @@ -3362,6 +3364,8 @@ static int do_md_stop(mddev_t * mddev, i disk = mddev-gendisk; if (disk) set_capacity(disk, 0); + mddev-dead = 1; + mddev-changed = 1; } else if (mddev-pers) printk(KERN_INFO md: %s switched to read-only mode.\n, @@ -4293,7 +4297,8 @@ static int md_ioctl(struct inode *inode, printk(KERN_WARNING md: couldn't set array info. %d\n, err); goto abort_unlock; - } + } else + mddev-dead = 0; } goto done_unlock; @@ -4377,6 +4382,8 @@ static int md_ioctl(struct inode *inode, err = -EFAULT; else err = add_new_disk(mddev, info); + if (!err) + mddev-dead = 0; goto done_unlock; } @@ -4423,8 +4430,12 @@ static int md_open(struct inode *inode, * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. 
*/ - mddev_t *mddev = inode-i_bdev-bd_disk-private_data; - int err; + mddev_t *mddev; + int err = -EBUSY; + + mddev = inode-i_bdev-bd_disk-private_data; + if (!mddev) + goto out; if ((err = mutex_lock_interruptible_nested(mddev-reconfig_mutex, 1))) goto out; @@ -4443,6 +4454,10 @@ static int md_release(struct inode *inod mddev_t *mddev = inode-i_bdev-bd_disk-private_data; BUG_ON(!mddev); + if (inode-i_bdev-bd_openers == 0 mddev-dead) { + inode-i_bdev-bd_disk-private_data = NULL; + mddev_put(mddev); + } mddev_put(mddev); return 0; diff .prev/include/linux/raid/md_k.h ./include/linux/raid/md_k.h --- .prev/include/linux/raid/md_k.h 2006-10-31 16:40:51.0 +1100 +++ ./include/linux/raid/md_k.h 2006-10-31
[PATCH 005 of 6] md: Allow reads that have bypassed the cache to be retried on failure.
If a bypass-the-cache read fails, we simply try again through the cache. If it fails again it will trigger normal recovery precedures. cc: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 150 ++- ./include/linux/raid/raid5.h |3 2 files changed, 150 insertions(+), 3 deletions(-) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-10-31 16:41:51.0 +1100 +++ ./drivers/md/raid5.c2006-10-31 16:42:30.0 +1100 @@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_ if (!test_bit(STRIPE_EXPANDING, sh-state)) { list_add_tail(sh-lru, conf-inactive_list); wake_up(conf-wait_for_stripe); + if (conf-retry_read_aligned) + md_wakeup_thread(conf-mddev-thread); } } } @@ -2645,18 +2647,74 @@ static int in_chunk_boundary(mddev_t *md } /* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +{ + unsigned long flags; + + spin_lock_irqsave(conf-device_lock, flags); + + bi-bi_next = conf-retry_read_aligned; + conf-retry_read_aligned = bi; + + spin_unlock_irqrestore(conf-device_lock, flags); + md_wakeup_thread(conf-mddev-thread); +} + + +static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +{ + struct bio *bi; + + bi = conf-retry_read_aligned; + if (bi) { + conf-retry_read_aligned = NULL; + return bi; + } + bi = conf-retry_read_aligned_list; + if(bi) { + conf-retry_read_aligned = bi-bi_next; + bi-bi_next = NULL; + bi-bi_phys_segments = 1; /* biased count of active stripes */ + bi-bi_hw_segments = 0; /* count of processed stripes */ + } + + return bi; +} + + +/* * The raid5_align_endio should check if the read succeeded and if it * did, call bio_endio on the original bio (having bio_put the new bio * first). * If the read failed.. 
*/ -int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) +int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) { struct bio* raid_bi = bi-bi_private; + mddev_t *mddev; + raid5_conf_t *conf; + if (bi-bi_size) return 1; bio_put(bi); - bio_endio(raid_bi, bytes, error); + + mddev = raid_bi-bi_bdev-bd_disk-queue-queuedata; + conf = mddev_to_conf(mddev); + + if (!error test_bit(BIO_UPTODATE, bi-bi_flags)) { + bio_endio(raid_bi, bytes, 0); + if (atomic_dec_and_test(conf-active_aligned_reads)) + wake_up(conf-wait_for_stripe); + return 0; + } + + + PRINTK(raid5_align_endio : io error...handing IO for a retry\n); + + add_bio_to_retry(raid_bi, conf); return 0; } @@ -2702,6 +2760,14 @@ static int chunk_aligned_read(request_qu align_bi-bi_bdev = rdev-bdev; atomic_inc(rdev-nr_pending); rcu_read_unlock(); + + spin_lock_irq(conf-device_lock); + wait_event_lock_irq(conf-wait_for_stripe, + conf-quiesce == 0, + conf-device_lock, /* nothing */); + atomic_inc(conf-active_aligned_reads); + spin_unlock_irq(conf-device_lock); + generic_make_request(align_bi); return 1; } else { @@ -3050,6 +3116,71 @@ static inline sector_t sync_request(mdde return STRIPE_SECTORS; } +static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there +* may not be enough stripe_heads available. +* We cannot pre-allocate enough stripe_heads as we may need +* more than exist in the cache (if we allow ever large chunks). +* So we do one stripe head at a time and record in +* -bi_hw_segments how many have been done. +* +* We *know* that this entire raid_bio is in one chunk, so +* it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
+*/ + struct stripe_head *sh; + int dd_idx, pd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio-bi_sector ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector( logical_sector, + conf-raid_disks, +
[PATCH 003 of 6] md: Define raid5_mergeable_bvec
This will encourage read request to be on only one device, so we will often be able to bypass the cache for read requests. cc: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 24 1 file changed, 24 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-10-31 16:40:57.0 +1100 +++ ./drivers/md/raid5.c2006-10-31 16:41:26.0 +1100 @@ -2611,6 +2611,28 @@ static int raid5_congested(void *data, i return 0; } +/* We want read requests to align with chunks where possible, + * but write requests don't need to. + */ +static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ + mddev_t *mddev = q-queuedata; + sector_t sector = bio-bi_sector + get_start_sect(bio-bi_bdev); + int max; + unsigned int chunk_sectors = mddev-chunk_size 9; + unsigned int bio_sectors = bio-bi_size 9; + + if (bio_data_dir(bio)) + return biovec-bv_len; /* always allow writes to be mergeable */ + + max = (chunk_sectors - ((sector (chunk_sectors - 1)) + bio_sectors)) 9; + if (max 0) max = 0; + if (max = biovec-bv_len bio_sectors == 0) + return biovec-bv_len; + else + return max; +} + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q-queuedata; @@ -3320,6 +3342,8 @@ static int run(mddev_t *mddev) mddev-array_size = mddev-size * (conf-previous_raid_disks - conf-max_degraded); + blk_queue_merge_bvec(mddev-queue, raid5_mergeable_bvec); + return 0; abort: if (conf) { - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 006 of 6] md: Enable bypassing cache for reads.
Call the chunk_aligned_read where appropriate. cc: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c |5 + 1 file changed, 5 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-10-31 16:42:30.0 +1100 +++ ./drivers/md/raid5.c2006-10-31 16:47:53.0 +1100 @@ -2798,6 +2798,11 @@ static int make_request(request_queue_t disk_stat_inc(mddev-gendisk, ios[rw]); disk_stat_add(mddev-gendisk, sectors[rw], bio_sectors(bi)); + if ( bio_data_dir(bi) == READ +mddev-reshape_position == MaxSector +chunk_aligned_read(q,bi)) + return 0; + logical_sector = bi-bi_sector ~((sector_t)STRIPE_SECTORS-1); last_sector = bi-bi_sector + (bi-bi_size9); bi-bi_next = NULL; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 6] md: Handle bypassing the read cache (assuming nothing fails).
cc: Raz Ben-Jehuda(caro) [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid5.c | 78 +++ 1 file changed, 78 insertions(+) diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c --- .prev/drivers/md/raid5.c2006-10-31 16:41:26.0 +1100 +++ ./drivers/md/raid5.c2006-10-31 16:41:51.0 +1100 @@ -2633,6 +2633,84 @@ static int raid5_mergeable_bvec(request_ return max; } + +static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +{ + sector_t sector = bio-bi_sector + get_start_sect(bio-bi_bdev); + unsigned int chunk_sectors = mddev-chunk_size 9; + unsigned int bio_sectors = bio-bi_size 9; + + return chunk_sectors = + ((sector (chunk_sectors - 1)) + bio_sectors); +} + +/* + * The raid5_align_endio should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +int raid5_align_endio(struct bio *bi, unsigned int bytes , int error) +{ + struct bio* raid_bi = bi-bi_private; + if (bi-bi_size) + return 1; + bio_put(bi); + bio_endio(raid_bi, bytes, error); + return 0; +} + +static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) +{ + mddev_t *mddev = q-queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + const unsigned int raid_disks = conf-raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + struct bio* align_bi; + mdk_rdev_t *rdev; + + if (!in_chunk_boundary(mddev, raid_bio)) { + printk(chunk_aligned_read : non aligned\n); + return 0; + } + /* +* use bio_clone to make a copy of the bio +*/ + align_bi = bio_clone(raid_bio, GFP_NOIO); + if (!align_bi) + return 0; + /* +* set bi_end_io to a new function, and set bi_private to the +* original bio. 
+*/ + align_bi-bi_end_io = raid5_align_endio; + align_bi-bi_private = raid_bio; + /* +* compute position +*/ + align_bi-bi_sector = raid5_compute_sector(raid_bio-bi_sector, + raid_disks, + data_disks, + dd_idx, + pd_idx, + conf); + + rcu_read_lock(); + rdev = rcu_dereference(conf-disks[dd_idx].rdev); + if (rdev test_bit(In_sync, rdev-flags)) { + align_bi-bi_bdev = rdev-bdev; + atomic_inc(rdev-nr_pending); + rcu_read_unlock(); + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + return 0; + } +} + + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q-queuedata; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 000 of 4] md: assorted bugfixes - one serious.
I ran my regression test suite on md for the first time in a while (obviously too long :-(). Found two bugs resulting in patches 1 and 3. Others are fixes for more subtle issues. All patches are suitable for 2.6.19, patch 1 is quite serious and should go in 2.6.18.2. Thanks, NeilBrown [PATCH 001 of 4] md: Fix bug where spares don't always get rebuilt properly when they become live. [PATCH 002 of 4] md: Simplify checking of available size when resizing an array [PATCH 003 of 4] md: Fix up maintenance of ->degraded in multipath. [PATCH 004 of 4] md: Fix printk format warnings, seen on powerpc64: - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 4] md: Fix bug where spares don't always get rebuilt properly when they become live.
If saved_raid_disk is >= 0, then the device could be a device that is already in sync that is being re-added. So we need to default this value to -1. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |1 + 1 file changed, 1 insertion(+) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-10-23 16:34:55.0 +1000 +++ ./drivers/md/md.c 2006-10-23 16:35:05.0 +1000 @@ -2003,6 +2003,7 @@ static mdk_rdev_t *md_import_device(dev_ kobject_init(rdev-kobj); rdev-desc_nr = -1; + rdev-saved_raid_disk = -1; rdev-flags = 0; rdev-data_offset = 0; rdev-sb_events = 0; - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 003 of 4] md: Fix up maintenance of ->degraded in multipath.
A recent fix which made sure ->degraded was initialised properly exposed a second bug: ->degraded wasn't being updated when drives failed or were hot-added. Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/multipath.c |2 ++ 1 file changed, 2 insertions(+) diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c --- .prev/drivers/md/multipath.c2006-10-23 16:34:54.0 +1000 +++ ./drivers/md/multipath.c2006-10-23 16:35:38.0 +1000 @@ -277,6 +277,7 @@ static void multipath_error (mddev_t *md set_bit(Faulty, rdev-flags); set_bit(MD_CHANGE_DEVS, mddev-flags); conf-working_disks--; + mddev-degraded++; printk(KERN_ALERT multipath: IO failure on %s, disabling IO path. \n Operation continuing on %d IO paths.\n, @@ -336,6 +337,7 @@ static int multipath_add_disk(mddev_t *m blk_queue_max_sectors(mddev-queue, PAGE_SIZE9); conf-working_disks++; + mddev-degraded--; rdev-raid_disk = path; set_bit(In_sync, rdev-flags); rcu_assign_pointer(p-rdev, rdev); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 002 of 4] md: Simplify checking of available size when resizing an array
When mdadm --grow --size=xxx is used to resize an array (use more or less of each device), we check the new size against the available space in each device. We already have that number recorded in rdev->size, so calculating it is pointless (and wrong in one obscure case). Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/md.c |7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff .prev/drivers/md/md.c ./drivers/md/md.c --- .prev/drivers/md/md.c 2006-10-23 16:35:05.0 +1000 +++ ./drivers/md/md.c 2006-10-23 16:35:21.0 +1000 @@ -4047,11 +4047,8 @@ static int update_size(mddev_t *mddev, u return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - if (rdev-sb_offset rdev-data_offset) - avail = (rdev-sb_offset*2) - rdev-data_offset; - else - avail = get_capacity(rdev-bdev-bd_disk) - - rdev-data_offset; + avail = rdev-size * 2; + if (fit (size == 0 || size avail/2)) size = avail/2; if (avail ((sector_t)size 1)) - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 004 of 4] md: Fix printk format warnings, seen on powerpc64:
From: Randy Dunlap [EMAIL PROTECTED] drivers/md/raid1.c:1479: warning: long long unsigned int format, long unsigned int arg (arg 4) drivers/md/raid10.c:1475: warning: long long unsigned int format, long unsigned int arg (arg 4) Signed-off-by: Randy Dunlap [EMAIL PROTECTED] Signed-off-by: Neil Brown [EMAIL PROTECTED] ### Diffstat output ./drivers/md/raid1.c |4 ++-- ./drivers/md/raid10.c |4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2006-10-23 16:36:08.0 +1000 +++ ./drivers/md/raid1.c2006-10-23 16:36:08.0 +1000 @@ -1474,8 +1474,8 @@ static void fix_read_error(conf_t *conf, raid1:%s: read error corrected (%d sectors at %llu on %s)\n, mdname(mddev), s, - (unsigned long long)sect + - rdev-data_offset, + (unsigned long long)(sect + + rdev-data_offset), bdevname(rdev-bdev, b)); } } diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c --- .prev/drivers/md/raid10.c 2006-10-23 16:34:54.0 +1000 +++ ./drivers/md/raid10.c 2006-10-23 16:36:08.0 +1000 @@ -1470,8 +1470,8 @@ static void fix_read_error(conf_t *conf, raid10:%s: read error corrected (%d sectors at %llu on %s)\n, mdname(mddev), s, - (unsigned long long)sect+ - rdev-data_offset, + (unsigned long long)(sect+ + rdev-data_offset), bdevname(rdev-bdev, b)); rdev_dec_pending(rdev, mddev); - To unsubscribe from this list: send the line unsubscribe linux-raid in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html