dm raid1: fix EIO after log failure

Linux Kernel Mailing List Thu, 07 Feb 2008 20:06:07 -0800

Gitweb:     
http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b80aa7a0c268d3ae0c472f648af1e3e4a359765c
Commit:     b80aa7a0c268d3ae0c472f648af1e3e4a359765c
Parent:     8f0205b798f926e2745de5fdebf0a8605c621de6
Author:     Jonathan Brassow <[EMAIL PROTECTED]>
AuthorDate: Fri Feb 8 02:11:35 2008 +0000
Committer:  Alasdair G Kergon <[EMAIL PROTECTED]>
CommitDate: Fri Feb 8 02:11:35 2008 +0000


    dm raid1: fix EIO after log failure
    
    This patch adds the ability to requeue write I/O to
    core device-mapper when there is a log device failure.
    
    If a write to the log produces an error, the pending writes are
    put on the "failures" list.  Since the log is marked as failed,
    they will stay on the failures list until a suspend happens.
    
    Suspends come in two phases, presuspend and postsuspend.  We must
    make sure that all the writes on the failures list are requeued
    in the presuspend phase (a requirement of dm core).  This means
    that recovery must be complete (because writes may be delayed
    behind it) and the failures list must be requeued before we
    return from presuspend.
    
    The mechanisms to ensure recovery is complete (or stopped) were
    already in place, but needed to be moved from postsuspend to
    presuspend.  We rely on 'flush_workqueue' to ensure that the
    mirror thread is complete and therefore, has requeued all writes
    in the failures list.
    
    Because we are using flush_workqueue, we must ensure that no
    additional 'queue_work' calls will produce additional I/O
    that we need to requeue (because once we return from
    presuspend, we are unable to do anything about it).  'queue_work'
    is called in response to the following functions:
    - complete_resync_work = NA, recovery is stopped
    - rh_dec (mirror_end_io) = NA, only calls 'queue_work' if it
                               is ready to recover the region
                               (recovery is stopped) or it needs
                               to clear the region in the log*
                               **this doesn't get called while
                               suspending**
    - rh_recovery_end = NA, recovery is stopped
    - rh_recovery_start = NA, recovery is stopped
    - write_callback = 1) Writes w/o failures simply call
                       bio_endio -> mirror_end_io -> rh_dec
                       (see rh_dec above)
                       2) Writes with failures are put on
                       the failures list and queue_work is
                       called**
                       ** write_callbacks don't happen
                       during suspend **
    - do_failures = NA, 'queue_work' not called if suspending
    - add_mirror (initialization) = NA, only done on mirror creation
    - queue_bio = NA, 1) delayed I/O scheduled before flush_workqueue
                  is called.  2) No more I/Os are being issued.
                  3) Re-attempted READs can still be handled.
                  (Write completions are handled through rh_dec/
                  write_callback - mentioned above - and do not
                  use queue_bio.)
    
    Signed-off-by: Jonathan Brassow <[EMAIL PROTECTED]>
    Signed-off-by: Alasdair G Kergon <[EMAIL PROTECTED]>
---
 drivers/md/dm-raid1.c |  101 +++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9978b9f..ec6d675 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -146,6 +146,7 @@ struct mirror_set {
        region_t nr_regions;
        int in_sync;
        int log_failure;
+       atomic_t suspend;
 
        atomic_t default_mirror;        /* Default mirror */
 
@@ -372,6 +373,16 @@ static void complete_resync_work(struct region *reg, int success)
        struct region_hash *rh = reg->rh;
 
        rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+       /*
+        * Dispatch the bios before we call 'wake_up_all'.
+        * This is important because if we are suspending,
+        * we want to know that recovery is complete and
+        * the work queue is flushed.  If we wake_up_all
+        * before we dispatch_bios (queue bios and call wake()),
+        * then we risk suspending before the work queue
+        * has been properly flushed.
+        */
        dispatch_bios(rh->ms, &reg->delayed_bios);
        if (atomic_dec_and_test(&rh->recovery_in_flight))
                wake_up_all(&_kmirrord_recovery_stopped);
@@ -1069,11 +1080,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        /*
         * Dispatch io.
         */
-       if (unlikely(ms->log_failure))
+       if (unlikely(ms->log_failure)) {
+               spin_lock_irq(&ms->lock);
+               bio_list_merge(&ms->failures, &sync);
+               spin_unlock_irq(&ms->lock);
+       } else
                while ((bio = bio_list_pop(&sync)))
-                       bio_endio(bio, -EIO);
-       else while ((bio = bio_list_pop(&sync)))
-               do_write(ms, bio);
+                       do_write(ms, bio);
 
        while ((bio = bio_list_pop(&recover)))
                rh_delay(&ms->rh, bio);
@@ -1091,8 +1104,46 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
        if (!failures->head)
                return;
 
-       while ((bio = bio_list_pop(failures)))
-               __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+       if (!ms->log_failure) {
+               while ((bio = bio_list_pop(failures)))
+                       __bio_mark_nosync(ms, bio, bio->bi_size, 0);
+               return;
+       }
+
+       /*
+        * If the log has failed, unattempted writes are being
+        * put on the failures list.  We can't issue those writes
+        * until a log has been marked, so we must store them.
+        *
+        * If a 'noflush' suspend is in progress, we can requeue
+        * the I/O's to the core.  This give userspace a chance
+        * to reconfigure the mirror, at which point the core
+        * will reissue the writes.  If the 'noflush' flag is
+        * not set, we have no choice but to return errors.
+        *
+        * Some writes on the failures list may have been
+        * submitted before the log failure and represent a
+        * failure to write to one of the devices.  It is ok
+        * for us to treat them the same and requeue them
+        * as well.
+        */
+       if (dm_noflush_suspending(ms->ti)) {
+               while ((bio = bio_list_pop(failures)))
+                       bio_endio(bio, DM_ENDIO_REQUEUE);
+               return;
+       }
+
+       if (atomic_read(&ms->suspend)) {
+               while ((bio = bio_list_pop(failures)))
+                       bio_endio(bio, -EIO);
+               return;
+       }
+
+       spin_lock_irq(&ms->lock);
+       bio_list_merge(&ms->failures, failures);
+       spin_unlock_irq(&ms->lock);
+
+       wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
@@ -1176,6 +1227,8 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
        ms->nr_mirrors = nr_mirrors;
        ms->nr_regions = dm_sector_div_up(ti->len, region_size);
        ms->in_sync = 0;
+       ms->log_failure = 0;
+       atomic_set(&ms->suspend, 0);
        atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
        ms->io_client = dm_io_client_create(DM_IO_PAGES);
@@ -1511,26 +1564,51 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
        return 0;
 }
 
-static void mirror_postsuspend(struct dm_target *ti)
+static void mirror_presuspend(struct dm_target *ti)
 {
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        struct dirty_log *log = ms->rh.log;
 
+       atomic_set(&ms->suspend, 1);
+
+       /*
+        * We must finish up all the work that we've
+        * generated (i.e. recovery work).
+        */
        rh_stop_recovery(&ms->rh);
 
-       /* Wait for all I/O we generated to complete */
        wait_event(_kmirrord_recovery_stopped,
                   !atomic_read(&ms->rh.recovery_in_flight));
 
+       if (log->type->presuspend && log->type->presuspend(log))
+               /* FIXME: need better error handling */
+               DMWARN("log presuspend failed");
+
+       /*
+        * Now that recovery is complete/stopped and the
+        * delayed bios are queued, we need to wait for
+        * the worker thread to complete.  This way,
+        * we know that all of our I/O has been pushed.
+        */
+       flush_workqueue(ms->kmirrord_wq);
+}
+
+static void mirror_postsuspend(struct dm_target *ti)
+{
+       struct mirror_set *ms = ti->private;
+       struct dirty_log *log = ms->rh.log;
+
        if (log->type->postsuspend && log->type->postsuspend(log))
                /* FIXME: need better error handling */
-               DMWARN("log suspend failed");
+               DMWARN("log postsuspend failed");
 }
 
 static void mirror_resume(struct dm_target *ti)
 {
-       struct mirror_set *ms = (struct mirror_set *) ti->private;
+       struct mirror_set *ms = ti->private;
        struct dirty_log *log = ms->rh.log;
+
+       atomic_set(&ms->suspend, 0);
        if (log->type->resume && log->type->resume(log))
                /* FIXME: need better error handling */
                DMWARN("log resume failed");
@@ -1564,7 +1642,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
                DMEMIT("%d", ms->nr_mirrors);
                for (m = 0; m < ms->nr_mirrors; m++)
                        DMEMIT(" %s %llu", ms->mirror[m].dev->name,
-                               (unsigned long long)ms->mirror[m].offset);
+                              (unsigned long long)ms->mirror[m].offset);
 
                if (ms->features & DM_RAID1_HANDLE_ERRORS)
                        DMEMIT(" 1 handle_errors");
@@ -1581,6 +1659,7 @@ static struct target_type mirror_target = {
        .dtr     = mirror_dtr,
        .map     = mirror_map,
        .end_io  = mirror_end_io,
+       .presuspend = mirror_presuspend,
        .postsuspend = mirror_postsuspend,
        .resume  = mirror_resume,
        .status  = mirror_status,
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

dm raid1: fix EIO after log failure

Reply via email to