From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to pass off write operations to
raid5_do_soft_block_ops (which can be run from a workqueue).  The operations
moved are reconstruct-writes and read-modify-writes formerly handled by
compute_parity5.

Changelog:
* moved raid5_do_soft_block_ops changes into a separate patch
* changed handle_write_operations5 to only initiate write operations, which
prevents new writes from being requested while the current one is in flight
* all blocks undergoing a write are now marked locked and !uptodate at the
beginning of the write operation
* blocks undergoing a read-modify-write need a request flag to distinguish
them from blocks that are locked for reading. Reconstruct-writes still use
the R5_LOCKED bit to select blocks for the operation
* integrated the work queue Kconfig option

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/Kconfig         |   21 +++++
 drivers/md/raid5.c         |  192 ++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/raid5.h |    3 +
 3 files changed, 190 insertions(+), 26 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed..2a16b3b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -162,6 +162,27 @@ config MD_RAID5_RESHAPE
          There should be enough spares already present to make the new
          array workable.
 
+config MD_RAID456_WORKQUEUE
+       depends on MD_RAID456
+       bool "Offload raid work to a workqueue from raid5d"
+       ---help---
+         This option enables raid work (block copy and xor operations)
+         to run in a workqueue.  If your platform has a high context
+         switch penalty say N.  If you are using hardware offload or
+         are running on an SMP platform say Y.
+
+         If unsure, say Y.
+
+config MD_RAID456_WORKQUEUE_MULTITHREAD
+       depends on MD_RAID456_WORKQUEUE && SMP
+       bool "Enable multi-threaded raid processing"
+       default y
+       ---help---
+         This option controls whether the raid workqueue will be multi-
+         threaded or single threaded.
+
+         If unsure, say Y.
+
 config MD_MULTIPATH
        tristate "Multipath I/O support"
        depends on BLK_DEV_MD
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8fde62b..e39d248 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -222,6 +222,8 @@ static void init_stripe(struct stripe_he
 
        BUG_ON(atomic_read(&sh->count) != 0);
        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+       BUG_ON(sh->ops.state);
+       BUG_ON(sh->ops.pending);
        
        CHECK_DEVLOCK();
        PRINTK("init_stripe called, stripe %llu\n", 
@@ -331,6 +333,9 @@ static int grow_one_stripe(raid5_conf_t 
        memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
        sh->raid_conf = conf;
        spin_lock_init(&sh->lock);
+       #ifdef CONFIG_MD_RAID456_WORKQUEUE
+       INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
+       #endif
 
        if (grow_buffers(sh, conf->raid_disks)) {
                shrink_buffers(sh, conf->raid_disks);
@@ -1266,7 +1271,72 @@ static void compute_block_2(struct strip
        }
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw)
+{
+       int i, pd_idx = sh->pd_idx, disks = sh->disks;
+       int locked=0; /* blocks locked by this call; returned to caller */
+
+       if (rcw == 0) { /* reconstruct-write path */
+               /* skip the drain operation on an expand */
+               if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) {
+                       set_bit(STRIPE_OP_RCW, &sh->state);
+                       set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state);
+                       for (i=disks ; i-- ;) {
+                               set_bit(R5_LOCKED, &sh->dev[i].flags);
+                               locked++; /* NOTE: pd_idx counted again below */
+                       }
+               } else { /* enter stage 1 of reconstruct write operation */
+                       set_bit(STRIPE_OP_RCW, &sh->state);
+                       set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+                       for (i=disks ; i-- ;) {
+                               struct r5dev *dev = &sh->dev[i];
+
+                               if (dev->towrite) {
+                                       set_bit(R5_LOCKED, &dev->flags);
+                                       clear_bit(R5_UPTODATE, &dev->flags);
+                                       locked++;
+                               }
+                       }
+               }
+       } else {
+               /* enter stage 1 of read modify write operation */
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+
+               set_bit(STRIPE_OP_RMW, &sh->state);
+               set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+               for (i=disks ; i-- ;) {
+                       struct r5dev *dev = &sh->dev[i];
+                       if (i==pd_idx)
+                               continue; /* parity is locked below */
+
+                       /* For a read-modify write there may be blocks that are
+                        * locked for reading while others are ready to be written
+                        * so we distinguish these blocks by the RMWReq bit
+                        */
+                       if (dev->towrite &&
+                           test_bit(R5_UPTODATE, &dev->flags)) {
+                               set_bit(R5_RMWReq, &dev->flags);
+                               set_bit(R5_LOCKED, &dev->flags);
+                               clear_bit(R5_UPTODATE, &dev->flags);
+                               locked++;
+                       }
+               }
+       }
+
+       /* keep the parity disk locked while asynchronous operations
+        * are in flight
+        */
+       set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+       clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+       locked++; /* count the parity block */
+       sh->ops.pending++; /* one more operation requested */
 
+       PRINTK("%s: stripe %llu locked: %d op_state: %lx\n",
+               __FUNCTION__, (unsigned long long)sh->sector,
+               locked, sh->ops.state);
+
+       return locked;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -1664,7 +1734,6 @@ static void raid5_do_soft_block_ops(void
  *    schedule a write of some buffers
  *    return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
  *
@@ -1679,13 +1748,13 @@ static void handle_stripe5(struct stripe
        int i;
        int syncing, expanding, expanded;
        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-       int non_overwrite = 0;
+       int non_overwrite=0, write_complete=0;
        int failed_num=0;
        struct r5dev *dev;
 
-       PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-               (unsigned long long)sh->sector, atomic_read(&sh->count),
-               sh->pd_idx);
+       PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d\n",
+              (unsigned long long)sh->sector, sh->state, 
atomic_read(&sh->count),
+              sh->pd_idx);
 
        spin_lock(&sh->lock);
        clear_bit(STRIPE_HANDLE, &sh->state);
@@ -1926,8 +1995,56 @@ #endif
                set_bit(STRIPE_HANDLE, &sh->state);
        }
 
-       /* now to consider writing and what else, if anything should be read */
-       if (to_write) {
+       /* Now we check to see if any write operations have recently
+        * completed
+        */
+       if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+               test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+               clear_bit(STRIPE_OP_RCW, &sh->state);
+               clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+               write_complete++;
+       }
+
+       if (test_bit(STRIPE_OP_RMW, &sh->state) &&
+               test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) {
+               clear_bit(STRIPE_OP_RMW, &sh->state);
+               clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state);
+               BUG_ON(++write_complete > 1);
+               for (i=disks; i--;)
+                       clear_bit(R5_RMWReq, &sh->dev[i].flags);
+       }
+
+       /* All the 'written' buffers and the parity block are ready to be
+        * written back to disk
+        */
+       if (write_complete) {
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+               for (i=disks; i--;) {
+                       dev = &sh->dev[i];
+                       if (test_bit(R5_LOCKED, &dev->flags) &&
+                               (i == sh->pd_idx || dev->written)) {
+                               PRINTK("Writing block %d\n", i);
+                               set_bit(R5_Wantwrite, &dev->flags);
+                               if (!test_bit(R5_Insync, &dev->flags)
+                                   || (i==sh->pd_idx && failed == 0))
+                                       set_bit(STRIPE_INSYNC, &sh->state);
+                       }
+               }
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                       atomic_dec(&conf->preread_active_stripes);
+                       if (atomic_read(&conf->preread_active_stripes) < 
IO_THRESHOLD)
+                               md_wakeup_thread(conf->mddev->thread);
+               }
+       }
+
+       /* 1/ Now to consider new write requests and what else, if anything should be read
+        * 2/ Check operations clobber the parity block so do not start new writes while
+        *    a check is in flight
+        * 3/ Write operations do not stack
+        */
+       if (to_write && !test_bit(STRIPE_OP_RCW, &sh->state) &&
+               !test_bit(STRIPE_OP_RMW, &sh->state) &&
+               !test_bit(STRIPE_OP_CHECK, &sh->state)) {
                int rmw=0, rcw=0;
                for (i=disks ; i--;) {
                        /* would I have to read this buffer for 
read_modify_write */
@@ -2000,25 +2117,8 @@ #endif
                        }
                /* now if nothing is locked, and if we have enough data, we can 
start a write request */
                if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-                   !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-                       PRINTK("Computing parity...\n");
-                       compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : 
READ_MODIFY_WRITE);
-                       /* now every locked buffer is ready to be written */
-                       for (i=disks; i--;)
-                               if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-                                       PRINTK("Writing block %d\n", i);
-                                       locked++;
-                                       set_bit(R5_Wantwrite, 
&sh->dev[i].flags);
-                                       if (!test_bit(R5_Insync, 
&sh->dev[i].flags)
-                                           || (i==sh->pd_idx && failed == 0))
-                                               set_bit(STRIPE_INSYNC, 
&sh->state);
-                               }
-                       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state)) {
-                               atomic_dec(&conf->preread_active_stripes);
-                               if (atomic_read(&conf->preread_active_stripes) 
< IO_THRESHOLD)
-                                       md_wakeup_thread(conf->mddev->thread);
-                       }
-               }
+                   !test_bit(STRIPE_BIT_DELAY, &sh->state))
+                       locked += handle_write_operations5(sh, rcw);
        }
 
        /* maybe we need to check and possibly fix the parity for this stripe
@@ -2150,8 +2250,17 @@ #endif
                        }
        }
 
+       queue_raid_work(sh);
+
        spin_unlock(&sh->lock);
 
+       #ifndef CONFIG_MD_RAID456_WORKQUEUE
+       while (test_bit(STRIPE_OP_QUEUED, &sh->state)) {
+               PRINTK("run do_block_ops\n");
+               conf->do_block_ops(sh);
+       }
+       #endif
+
        while ((bi=return_bi)) {
                int bytes = bi->bi_size;
 
@@ -3439,6 +3548,30 @@ static int run(mddev_t *mddev)
                if (!conf->spare_page)
                        goto abort;
        }
+
+       #ifdef CONFIG_MD_RAID456_WORKQUEUE /* per-array block-ops workqueue */
+       sprintf(conf->workqueue_name, "%s_raid5_ops",
+               mddev->gendisk->disk_name);
+
+       #ifdef CONFIG_MD_RAID456_WORKQUEUE_MULTITHREAD /* name as in Kconfig */
+       if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name))
+                                    == NULL)
+               goto abort;
+       #else /* !CONFIG_MD_RAID456_WORKQUEUE_MULTITHREAD: one worker thread */
+       if ((conf->block_ops_queue = create_singlethread_workqueue(
+                                       conf->workqueue_name)) == NULL)
+               goto abort;
+       #endif /* CONFIG_MD_RAID456_WORKQUEUE_MULTITHREAD */
+       #endif /* CONFIG_MD_RAID456_WORKQUEUE */
+
+       /* To Do:
+        * 1/ Offload to asynchronous copy / xor engines
+        * 2/ Automated selection of optimal do_block_ops
+        *      routine similar to the xor template selection
+        */
+       conf->do_block_ops = raid5_do_soft_block_ops;
+
+
        spin_lock_init(&conf->device_lock);
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
@@ -3598,6 +3731,10 @@ abort:
                safe_put_page(conf->spare_page);
                kfree(conf->disks);
                kfree(conf->stripe_hashtbl);
+               #ifdef CONFIG_MD_RAID456_WORKQUEUE
+               if (conf->do_block_ops)
+                       destroy_workqueue(conf->block_ops_queue);
+               #endif
                kfree(conf);
        }
        mddev->private = NULL;
@@ -3618,6 +3755,9 @@ static int stop(mddev_t *mddev)
        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
        sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
        kfree(conf->disks);
+       #ifdef CONFIG_MD_RAID456_WORKQUEUE
+       destroy_workqueue(conf->block_ops_queue);
+       #endif
        kfree(conf);
        mddev->private = NULL;
        return 0;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index c8a315b..31ae55c 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -3,6 +3,7 @@ #define _RAID5_H
 
 #include <linux/raid/md.h>
 #include <linux/raid/xor.h>
+#include <linux/workqueue.h>
 
 /*
  *
@@ -333,6 +334,7 @@ struct raid5_private_data {
        atomic_t                preread_active_stripes; /* stripes with 
scheduled io */
 
        atomic_t                reshape_stripes; /* stripes with pending writes 
for reshape */
+
        #ifdef CONFIG_MD_RAID456_WORKQUEUE
        struct workqueue_struct *block_ops_queue;
        #endif
@@ -376,6 +378,7 @@ struct raid5_private_data {
 typedef struct raid5_private_data raid5_conf_t;
 
 #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
+
 /* must be called under the stripe lock */
 static inline void queue_raid_work(struct stripe_head *sh)
 {
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to