> The patch below does not apply to the 3.6-stable tree.
> If someone wants it applied there, or to any other stable or longterm
> tree, then please email the backport, including the original git commit
> id to <[email protected]>.

> From e7c0c3fa29280d62aa5e11101a674bb3064bd791 Mon Sep 17 00:00:00 2001

Here's my manually fixed up version, as posted to linux-raid
a week ago.  neilb wrote:

> Yes, that looks right.
> 
> NeilBrown

Which is perhaps not the most ringing endorsement, but
I think can be taken as an Acked-By:

A large chunk of it is indentation, so I've included the -b version for
human review, then (below a scissors line) the actual patch to feed to
"git am --scissors".

---- diff -b: do not apply; real patch after scissors line ----
commit 0fbb6ffcf9485ac94c57759849ba415352c131da
Author: NeilBrown <[email protected]>
Date:   Thu Nov 22 14:42:49 2012 +1100

    md/raid10: close race that lose writes lost when replacement completes.
    
    When a replacement operation completes there is a small window
    when the original device is marked 'faulty' and the replacement
    still looks like a replacement.  The faulty should be removed and
    the replacement moved in place very quickly, bit it isn't instant.
    
    So the code write out to the array must handle the possibility that
    the only working device for some slot in the replacement - but it
    doesn't.  If the primary device is faulty it just gives up.  This
    can lead to corruption.
    
    So make the code more robust: if either  the primary or the
    replacement is present and working, write to them.  Only when
    neither are present do we give up.
    
    This bug has been present since replacement was introduced in
    3.3, so it is suitable for any -stable kernel since then.
    
    Reported-by: "George Spelvin" <[email protected]>
    Cc: [email protected]
    Signed-off-by: NeilBrown <[email protected]>

diff -b --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a48c215..06a359b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1287,18 +1287,21 @@ retry_write:
                        blocked_rdev = rrdev;
                        break;
                }
+               if (rdev && (test_bit(Faulty, &rdev->flags)
+                            || test_bit(Unmerged, &rdev->flags)))
+                       rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)
                              || test_bit(Unmerged, &rrdev->flags)))
                        rrdev = NULL;
 
                r10_bio->devs[i].bio = NULL;
                r10_bio->devs[i].repl_bio = NULL;
-               if (!rdev || test_bit(Faulty, &rdev->flags) ||
-                   test_bit(Unmerged, &rdev->flags)) {
+
+               if (!rdev && !rrdev) {
                        set_bit(R10BIO_Degraded, &r10_bio->state);
                        continue;
                }
-               if (test_bit(WriteErrorSeen, &rdev->flags)) {
+               if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
                        sector_t first_bad;
                        sector_t dev_sector = r10_bio->devs[i].addr;
                        int bad_sectors;
@@ -1340,8 +1343,10 @@ retry_write:
                                        max_sectors = good_sectors;
                        }
                }
+               if (rdev) {
                        r10_bio->devs[i].bio = bio;
                        atomic_inc(&rdev->nr_pending);
+               }
                if (rrdev) {
                        r10_bio->devs[i].repl_bio = bio;
                        atomic_inc(&rrdev->nr_pending);
@@ -1397,18 +1402,17 @@ retry_write:
        for (i = 0; i < conf->copies; i++) {
                struct bio *mbio;
                int d = r10_bio->devs[i].devnum;
-               if (!r10_bio->devs[i].bio)
-                       continue;
 
+               if (r10_bio->devs[i].bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].rdev;
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
                        md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
                                    max_sectors);
                        r10_bio->devs[i].bio = mbio;
 
-               mbio->bi_sector = (r10_bio->devs[i].addr+
-                                  choose_data_offset(r10_bio,
-                                                     conf->mirrors[d].rdev));
-               mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
+                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                                          choose_data_offset(r10_bio, rdev));
+                       mbio->bi_bdev = rdev->bdev;
                        mbio->bi_end_io = raid10_end_write_request;
                        mbio->bi_rw = WRITE | do_sync | do_fua;
                        mbio->bi_private = r10_bio;
@@ -1420,24 +1424,23 @@ retry_write:
                        spin_unlock_irqrestore(&conf->device_lock, flags);
                        if (!mddev_check_plugged(mddev))
                                md_wakeup_thread(mddev->thread);
+               }
 
-               if (!r10_bio->devs[i].repl_bio)
-                       continue;
-
+               if (r10_bio->devs[i].repl_bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].replacement;
+                       if (rdev == NULL) {
+                               /* Replacement just got moved to main 'rdev' */
+                               smp_mb();
+                               rdev = conf->mirrors[d].rdev;
+                       }
                        mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
                        md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
                                    max_sectors);
                        r10_bio->devs[i].repl_bio = mbio;
 
-               /* We are actively writing to the original device
-                * so it cannot disappear, so the replacement cannot
-                * become NULL here
-                */
                        mbio->bi_sector = (r10_bio->devs[i].addr +
-                                  choose_data_offset(
-                                          r10_bio,
-                                          conf->mirrors[d].replacement));
-               mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+                                          choose_data_offset(r10_bio, rdev));
+                       mbio->bi_bdev = rdev->bdev;
                        mbio->bi_end_io = raid10_end_write_request;
                        mbio->bi_rw = WRITE | do_sync | do_fua;
                        mbio->bi_private = r10_bio;
@@ -1450,6 +1453,7 @@ retry_write:
                        if (!mddev_check_plugged(mddev))
                                md_wakeup_thread(mddev->thread);
                }
+       }
 
        /* Don't remove the bias on 'remaining' (one_write_done) until
         * after checking if we need to go around again.


----8<----8<----
commit 0fbb6ffcf9485ac94c57759849ba415352c131da
Author: NeilBrown <[email protected]>
Date:   Thu Nov 22 14:42:49 2012 +1100

    md/raid10: close race that lose writes lost when replacement completes.
    
    When a replacement operation completes there is a small window
    when the original device is marked 'faulty' and the replacement
    still looks like a replacement.  The faulty should be removed and
    the replacement moved in place very quickly, bit it isn't instant.
    
    So the code write out to the array must handle the possibility that
    the only working device for some slot in the replacement - but it
    doesn't.  If the primary device is faulty it just gives up.  This
    can lead to corruption.
    
    So make the code more robust: if either  the primary or the
    replacement is present and working, write to them.  Only when
    neither are present do we give up.
    
    This bug has been present since replacement was introduced in
    3.3, so it is suitable for any -stable kernel since then.
    
    Reported-by: George Spelvin <[email protected]>
    Cc: [email protected]
    Signed-off-by: NeilBrown <[email protected]>
    Signed-off-by: George Spelvin <[email protected]>

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a48c215..06a359b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1287,18 +1287,21 @@ retry_write:
                        blocked_rdev = rrdev;
                        break;
                }
+               if (rdev && (test_bit(Faulty, &rdev->flags)
+                            || test_bit(Unmerged, &rdev->flags)))
+                       rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)
                              || test_bit(Unmerged, &rrdev->flags)))
                        rrdev = NULL;
 
                r10_bio->devs[i].bio = NULL;
                r10_bio->devs[i].repl_bio = NULL;
-               if (!rdev || test_bit(Faulty, &rdev->flags) ||
-                   test_bit(Unmerged, &rdev->flags)) {
+
+               if (!rdev && !rrdev) {
                        set_bit(R10BIO_Degraded, &r10_bio->state);
                        continue;
                }
-               if (test_bit(WriteErrorSeen, &rdev->flags)) {
+               if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
                        sector_t first_bad;
                        sector_t dev_sector = r10_bio->devs[i].addr;
                        int bad_sectors;
@@ -1340,8 +1343,10 @@ retry_write:
                                        max_sectors = good_sectors;
                        }
                }
-               r10_bio->devs[i].bio = bio;
-               atomic_inc(&rdev->nr_pending);
+               if (rdev) {
+                       r10_bio->devs[i].bio = bio;
+                       atomic_inc(&rdev->nr_pending);
+               }
                if (rrdev) {
                        r10_bio->devs[i].repl_bio = bio;
                        atomic_inc(&rrdev->nr_pending);
@@ -1397,58 +1402,57 @@ retry_write:
        for (i = 0; i < conf->copies; i++) {
                struct bio *mbio;
                int d = r10_bio->devs[i].devnum;
-               if (!r10_bio->devs[i].bio)
-                       continue;
-
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-                           max_sectors);
-               r10_bio->devs[i].bio = mbio;
 
-               mbio->bi_sector = (r10_bio->devs[i].addr+
-                                  choose_data_offset(r10_bio,
-                                                     conf->mirrors[d].rdev));
-               mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
-               mbio->bi_end_io = raid10_end_write_request;
-               mbio->bi_rw = WRITE | do_sync | do_fua;
-               mbio->bi_private = r10_bio;
-
-               atomic_inc(&r10_bio->remaining);
-               spin_lock_irqsave(&conf->device_lock, flags);
-               bio_list_add(&conf->pending_bio_list, mbio);
-               conf->pending_count++;
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (!mddev_check_plugged(mddev))
-                       md_wakeup_thread(mddev->thread);
-
-               if (!r10_bio->devs[i].repl_bio)
-                       continue;
+               if (r10_bio->devs[i].bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].rdev;
+                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+                                   max_sectors);
+                       r10_bio->devs[i].bio = mbio;
+
+                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                                          choose_data_offset(r10_bio, rdev));
+                       mbio->bi_bdev = rdev->bdev;
+                       mbio->bi_end_io = raid10_end_write_request;
+                       mbio->bi_rw = WRITE | do_sync | do_fua;
+                       mbio->bi_private = r10_bio;
 
-               mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-               md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
-                           max_sectors);
-               r10_bio->devs[i].repl_bio = mbio;
+                       atomic_inc(&r10_bio->remaining);
+                       spin_lock_irqsave(&conf->device_lock, flags);
+                       bio_list_add(&conf->pending_bio_list, mbio);
+                       conf->pending_count++;
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
+                       if (!mddev_check_plugged(mddev))
+                               md_wakeup_thread(mddev->thread);
+               }
 
-               /* We are actively writing to the original device
-                * so it cannot disappear, so the replacement cannot
-                * become NULL here
-                */
-               mbio->bi_sector = (r10_bio->devs[i].addr +
-                                  choose_data_offset(
-                                          r10_bio,
-                                          conf->mirrors[d].replacement));
-               mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
-               mbio->bi_end_io = raid10_end_write_request;
-               mbio->bi_rw = WRITE | do_sync | do_fua;
-               mbio->bi_private = r10_bio;
+               if (r10_bio->devs[i].repl_bio) {
+                       struct md_rdev *rdev = conf->mirrors[d].replacement;
+                       if (rdev == NULL) {
+                               /* Replacement just got moved to main 'rdev' */
+                               smp_mb();
+                               rdev = conf->mirrors[d].rdev;
+                       }
+                       mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                       md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+                                   max_sectors);
+                       r10_bio->devs[i].repl_bio = mbio;
+
+                       mbio->bi_sector = (r10_bio->devs[i].addr +
+                                          choose_data_offset(r10_bio, rdev));
+                       mbio->bi_bdev = rdev->bdev;
+                       mbio->bi_end_io = raid10_end_write_request;
+                       mbio->bi_rw = WRITE | do_sync | do_fua;
+                       mbio->bi_private = r10_bio;
 
-               atomic_inc(&r10_bio->remaining);
-               spin_lock_irqsave(&conf->device_lock, flags);
-               bio_list_add(&conf->pending_bio_list, mbio);
-               conf->pending_count++;
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (!mddev_check_plugged(mddev))
-                       md_wakeup_thread(mddev->thread);
+                       atomic_inc(&r10_bio->remaining);
+                       spin_lock_irqsave(&conf->device_lock, flags);
+                       bio_list_add(&conf->pending_bio_list, mbio);
+                       conf->pending_count++;
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
+                       if (!mddev_check_plugged(mddev))
+                               md_wakeup_thread(mddev->thread);
+               }
        }
 
        /* Don't remove the bias on 'remaining' (one_write_done) until
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to