Introduce struct stripe_cache_policy as an interface for supporting multiple
caching policies.  It defines a set of methods that are called as cache
events occur.  See the definition of stripe_cache_policy in
include/linux/raid/raid5.h.  This patch does not add any new caching
policies; it only moves the existing write-through code behind the new
interface and invokes it through struct stripe_cache_policy methods.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c         |  644 +++++++++++++++++++++++++-------------------
 include/linux/raid/raid5.h |   82 +++++-
 2 files changed, 446 insertions(+), 280 deletions(-)
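
For quick reference, the interface introduced here condenses to the method
table below (per-method kernel-doc elided; the raid5.h hunk at the end of
the patch is authoritative):

	struct stripe_cache_policy {
		int (*release_stripe)(struct raid5_private_data *conf,
				      struct stripe_head *sh, int handle);
		void (*complete_postxor_action)(void *stripe_head_ref);
		void (*submit_pending_writes)(struct stripe_head *sh,
					      struct stripe_head_state *s);
		void (*handle_new_writes)(struct stripe_head *sh,
					  struct stripe_head_state *s);
		struct bio *(*handle_completed_writes)(struct stripe_head *sh,
						struct stripe_head_state *s);
		void (*raid5d)(mddev_t *mddev, struct raid5_private_data *conf);
		void (*init)(struct raid5_private_data *conf);
		void (*unplug_device)(struct raid5_private_data *conf);
		/* policy-private state, kept in unions so that future
		 * policies can overlay their own fields
		 */
		union { struct list_head delayed_list; };
		union { atomic_t preread_active_stripes; };
	};

A policy is selected once in run() and initializes its private state via
->init():

	conf->cache_policy = &raid5_cache_policy_write_through;
	...
	conf->cache_policy->init(conf);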

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 684552a..3b32a19 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -112,11 +112,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
        if (atomic_dec_and_test(&sh->count)) {
                BUG_ON(!list_empty(&sh->lru));
                BUG_ON(atomic_read(&conf->active_stripes)==0);
+               if (conf->cache_policy->release_stripe(conf, sh,
+                                               test_bit(STRIPE_HANDLE, &sh->state)))
+                       return; /* stripe was moved to a cache policy specific queue */
+
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                       if (test_bit(STRIPE_DELAYED, &sh->state)) {
-                               list_add_tail(&sh->lru, &conf->delayed_list);
-                               blk_plug_device(conf->mddev->queue);
-                       } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                       if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
                                   sh->bm_seq - conf->seq_write > 0) {
                                list_add_tail(&sh->lru, &conf->bitmap_list);
                                blk_plug_device(conf->mddev->queue);
@@ -125,23 +126,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                                list_add_tail(&sh->lru, &conf->handle_list);
                        }
                        md_wakeup_thread(conf->mddev->thread);
-               } else {
-                       BUG_ON(sh->ops.pending);
-                       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                               atomic_dec(&conf->preread_active_stripes);
-                               if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-                                       md_wakeup_thread(conf->mddev->thread);
-                       }
-                       atomic_dec(&conf->active_stripes);
-                       if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-                               list_add_tail(&sh->lru, &conf->inactive_list);
-                               wake_up(&conf->wait_for_stripe);
-                               if (conf->retry_read_aligned)
-                                       md_wakeup_thread(conf->mddev->thread);
-                       }
-               }
+               } else
+                       BUG();
        }
 }
+
 static void release_stripe(struct stripe_head *sh)
 {
        raid5_conf_t *conf = sh->raid_conf;
@@ -724,39 +713,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
        return tx;
 }
 
-static void ops_complete_postxor(void *stripe_head_ref)
-{
-       struct stripe_head *sh = stripe_head_ref;
-
-       PRINTK("%s: stripe %llu\n", __FUNCTION__,
-               (unsigned long long)sh->sector);
-
-       set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-       set_bit(STRIPE_HANDLE, &sh->state);
-       release_stripe(sh);
-}
-
-static void ops_complete_write(void *stripe_head_ref)
-{
-       struct stripe_head *sh = stripe_head_ref;
-       int disks = sh->disks, i, pd_idx = sh->pd_idx;
-
-       PRINTK("%s: stripe %llu\n", __FUNCTION__,
-               (unsigned long long)sh->sector);
-
-       for (i=disks ; i-- ;) {
-               struct r5dev *dev = &sh->dev[i];
-               if (dev->written || i == pd_idx)
-                       set_bit(R5_UPTODATE, &dev->flags);
-       }
-
-       set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-       set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
-       set_bit(STRIPE_HANDLE, &sh->state);
-       release_stripe(sh);
-}
-
 static void
 ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -764,6 +720,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
        int disks = sh->disks;
        struct page *xor_srcs[disks];
 
+       raid5_conf_t *conf = sh->raid_conf;
        int count = 0, pd_idx = sh->pd_idx, i;
        struct page *xor_dest;
        int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -792,9 +749,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                }
        }
 
-       /* check whether this postxor is part of a write */
-       callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
-               ops_complete_write : ops_complete_postxor;
+       /* take cache policy specific action upon completion of the postxor */
+       callback = conf->cache_policy->complete_postxor_action;
 
        /* 1/ if we prexor'd then the dest is reused as a source
         * 2/ if we did not prexor then we are redoing the parity
@@ -1683,7 +1639,8 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
        }
 }
 
-static int handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static int
+raid5_wt_cache_handle_parity_updates(struct stripe_head *sh, int rcw, int expand)
 {
        int i, pd_idx = sh->pd_idx, disks = sh->disks;
        int locked=0;
@@ -1847,6 +1804,327 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
        return pd_idx;
 }
 
+static int
+raid5_wt_cache_release_stripe(raid5_conf_t *conf, struct stripe_head *sh,
+       int handle)
+{
+       struct stripe_cache_policy *cp = conf->cache_policy;
+
+       PRINTK("%s: stripe %llu\n", __FUNCTION__,
+               (unsigned long long)sh->sector);
+
+       if (handle && test_bit(STRIPE_DELAYED, &sh->state)) {
+               list_add_tail(&sh->lru, &cp->delayed_list);
+               blk_plug_device(conf->mddev->queue);
+               return 1;
+       } else if (!handle) {
+               BUG_ON(sh->ops.pending);
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                       atomic_dec(&cp->preread_active_stripes);
+                       if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD)
+                               md_wakeup_thread(conf->mddev->thread);
+               }
+               atomic_dec(&conf->active_stripes);
+               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+                       list_add_tail(&sh->lru, &conf->inactive_list);
+                       wake_up(&conf->wait_for_stripe);
+                       if (conf->retry_read_aligned)
+                               md_wakeup_thread(conf->mddev->thread);
+               }
+               return 1;
+       }
+
+       return 0;
+}
+
+static void raid5_wt_cache_complete_postxor_action(void *stripe_head_ref)
+{
+       struct stripe_head *sh = stripe_head_ref;
+
+       PRINTK("%s: stripe %llu\n", __FUNCTION__,
+               (unsigned long long)sh->sector);
+
+       set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
+       /* leaving prexor set until postxor is done allows us to distinguish
+        * a rmw from a rcw during biodrain
+        */
+       if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete)) {
+               int i;
+               for (i=sh->disks; i--;)
+                       clear_bit(R5_Wantprexor, &sh->dev[i].flags);
+
+               clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+               clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
+               clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+       }
+
+       if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
+               int disks = sh->disks, i, pd_idx = sh->pd_idx;
+
+               for (i=disks ; i-- ;) {
+                       struct r5dev *dev = &sh->dev[i];
+                       if (dev->written || i == pd_idx)
+                               set_bit(R5_UPTODATE, &dev->flags);
+               }
+
+               set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+       }
+
+       set_bit(STRIPE_HANDLE, &sh->state);
+       release_stripe(sh);
+}
+
+static struct bio *
+raid5_wt_cache_handle_completed_writes(struct stripe_head *sh,
+       struct stripe_head_state *s)
+{
+       struct bio *return_bi = NULL;
+
+       /* might be able to return some write requests if the parity block
+        * is safe, or on a failed drive
+        */
+       struct r5dev *dev = &sh->dev[sh->pd_idx];
+       if ( s->written &&
+            ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
+               test_bit(R5_UPTODATE, &dev->flags))
+              || (s->failed == 1 && s->failed_num == sh->pd_idx))
+           ) {
+           raid5_conf_t *conf = sh->raid_conf;
+           int i;
+           /* any written block on an uptodate or failed drive can be returned.
+            * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
+            * never LOCKED, so we don't need to test 'failed' directly.
+            */
+           for (i=sh->disks; i--; )
+               if (sh->dev[i].written) {
+                   dev = &sh->dev[i];
+                   if (!test_bit(R5_LOCKED, &dev->flags) &&
+                        test_bit(R5_UPTODATE, &dev->flags) ) {
+                       /* We can return any write requests */
+                           struct bio *wbi, *wbi2;
+                           int bitmap_end = 0;
+                           PRINTK("%s: Return write for disc %d\n",
+                               __FUNCTION__, i);
+                           spin_lock_irq(&conf->device_lock);
+                           wbi = dev->written;
+                           dev->written = NULL;
+                           while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                                   wbi2 = r5_next_bio(wbi, dev->sector);
+                                   if (--wbi->bi_phys_segments == 0) {
+                                           md_write_end(conf->mddev);
+                                           wbi->bi_next = return_bi;
+                                           return_bi = wbi;
+                                   }
+                                   wbi = wbi2;
+                           }
+                           if (dev->towrite == NULL)
+                                   bitmap_end = 1;
+                           spin_unlock_irq(&conf->device_lock);
+                           if (bitmap_end)
+                                   bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+                                                   STRIPE_SECTORS,
+                                                   !test_bit(STRIPE_DEGRADED, &sh->state), 0);
+                   }
+               }
+       }
+
+       return return_bi;
+}
+
+static void
+raid5_wt_cache_submit_pending_writes(struct stripe_head *sh,
+       struct stripe_head_state *s)
+{
+       /* if only POSTXOR is set then this is an 'expand' postxor */
+       if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
+               test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+               raid5_conf_t *conf = sh->raid_conf;
+               struct stripe_cache_policy *cp = conf->cache_policy;
+               int i;
+
+               PRINTK("%s: stripe %llu\n", __FUNCTION__,
+                       (unsigned long long)sh->sector);
+
+               /* All the 'written' buffers and the parity block are ready to be
+                * written back to disk
+                */
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+               for (i=sh->disks; i--;) {
+                       struct r5dev *dev = &sh->dev[i];
+                       if (test_bit(R5_LOCKED, &dev->flags) &&
+                               (i == sh->pd_idx || dev->written)) {
+                               PRINTK("Writing block %d\n", i);
+                               set_bit(R5_Wantwrite, &dev->flags);
+                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                                       sh->ops.count++;
+                               if (!test_bit(R5_Insync, &dev->flags)
+                                   || (i==sh->pd_idx && s->failed == 0))
+                                       set_bit(STRIPE_INSYNC, &sh->state);
+                       }
+               }
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                       atomic_dec(&cp->preread_active_stripes);
+                       if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD)
+                               md_wakeup_thread(conf->mddev->thread);
+               }
+
+               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
+               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
+
+               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
+               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+       }
+
+}
+
+static void
+raid5_wt_cache_handle_new_writes(struct stripe_head *sh, struct stripe_head_state *s)
+{
+       /* 1/ Check operations clobber the parity block so do not start new writes while
+        *    a check is in flight
+        * 2/ Write operations do not stack
+        */
+       if (s->to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
+               !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+               int rmw=0, rcw=0, disks = sh->disks, i;
+               struct r5dev *dev;
+               for (i=disks ; i--;) {
+                       /* would I have to read this buffer for read_modify_write */
+                       dev = &sh->dev[i];
+                       if ((dev->towrite || i == sh->pd_idx) &&
+                           (!test_bit(R5_LOCKED, &dev->flags) 
+                                   ) &&
+                           !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
+                               if (test_bit(R5_Insync, &dev->flags)
+/*                                 && !(!mddev->insync && i == sh->pd_idx) */
+                                       )
+                                       rmw++;
+                               else rmw += 2*disks;  /* cannot read it */
+                       }
+                       /* Would I have to read this buffer for reconstruct_write */
+                       if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+                           (!test_bit(R5_LOCKED, &dev->flags) 
+                                   ) &&
+                           !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
+                               if (test_bit(R5_Insync, &dev->flags)) rcw++;
+                               else rcw += 2*disks;
+                       }
+               }
+               PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
+                       (unsigned long long)sh->sector, rmw, rcw);
+               set_bit(STRIPE_HANDLE, &sh->state);
+               if (rmw < rcw && rmw > 0)
+                       /* prefer read-modify-write, but need to get some data */
+                       for (i=disks; i--;) {
+                               dev = &sh->dev[i];
+                               if ((dev->towrite || i == sh->pd_idx) &&
+                                   !test_bit(R5_LOCKED, &dev->flags) &&
+                                   !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
+                                   test_bit(R5_Insync, &dev->flags)) {
+                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                                       {
+                                               PRINTK("Read_old block %d for r-m-w\n", i);
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                                                       sh->ops.count++;
+                                               s->locked++;
+                                       } else {
+                                               set_bit(STRIPE_DELAYED, &sh->state);
+                                               set_bit(STRIPE_HANDLE, &sh->state);
+                                       }
+                               }
+                       }
+               if (rcw <= rmw && rcw > 0)
+                       /* want reconstruct write, but need to get some data */
+                       for (i=disks; i--;) {
+                               dev = &sh->dev[i];
+                               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+                                   !test_bit(R5_LOCKED, &dev->flags) &&
+                                   !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
+                                   test_bit(R5_Insync, &dev->flags)) {
+                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                                       {
+                                               PRINTK("Read_old block %d for Reconstruct\n", i);
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                                                       sh->ops.count++;
+                                               s->locked++;
+                                       } else {
+                                               set_bit(STRIPE_DELAYED, &sh->state);
+                                               set_bit(STRIPE_HANDLE, &sh->state);
+                                       }
+                               }
+                       }
+               /* now if nothing is locked, and if we have enough data, we can start a write request */
+               /* since handle_stripe can be called at any time we need to handle the case
+                * where a compute block operation has been submitted and then a subsequent
+                * call wants to start a write request.  raid5_run_ops only handles the case where
+                * compute block and postxor are requested simultaneously.  If this
+                * is not the case then new writes need to be held off until the compute
+                * completes.
+                */
+               if ((s->req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
+                       (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+                       !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+                       s->locked += raid5_wt_cache_handle_parity_updates(sh, rcw == 0, 0);
+
+       }
+}
+
+static void raid5_wt_cache_activate_delayed(raid5_conf_t *conf)
+{
+       struct stripe_cache_policy *cp = conf->cache_policy;
+       if (atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD) {
+               while (!list_empty(&cp->delayed_list)) {
+                       struct list_head *l = cp->delayed_list.next;
+                       struct stripe_head *sh;
+                       sh = list_entry(l, struct stripe_head, lru);
+                       list_del_init(l);
+                       clear_bit(STRIPE_DELAYED, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                               atomic_inc(&cp->preread_active_stripes);
+                       list_add_tail(&sh->lru, &conf->handle_list);
+               }
+       }
+}
+
+static void raid5_wt_cache_raid5d(mddev_t *mddev, raid5_conf_t *conf)
+{
+       struct stripe_cache_policy *cp = conf->cache_policy;
+
+       if (list_empty(&conf->handle_list) &&
+           atomic_read(&cp->preread_active_stripes) < IO_THRESHOLD &&
+           !blk_queue_plugged(mddev->queue) &&
+           !list_empty(&cp->delayed_list))
+               raid5_wt_cache_activate_delayed(conf);
+}
+
+static void raid5_wt_cache_init(raid5_conf_t *conf)
+{
+       atomic_set(&conf->cache_policy->preread_active_stripes, 0);
+       INIT_LIST_HEAD(&conf->cache_policy->delayed_list);
+}
+
+static void raid5_wt_cache_unplug_device(raid5_conf_t *conf)
+{
+       raid5_wt_cache_activate_delayed(conf);
+}
+
+static struct stripe_cache_policy raid5_cache_policy_write_through = {
+       .release_stripe = raid5_wt_cache_release_stripe,
+       .complete_postxor_action = raid5_wt_cache_complete_postxor_action,
+       .submit_pending_writes = raid5_wt_cache_submit_pending_writes,
+       .handle_new_writes = raid5_wt_cache_handle_new_writes,
+       .handle_completed_writes = raid5_wt_cache_handle_completed_writes,
+       .raid5d = raid5_wt_cache_raid5d,
+       .init = raid5_wt_cache_init,
+       .unplug_device = raid5_wt_cache_unplug_device,
+};
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1944,12 +2222,13 @@ static void handle_stripe5(struct stripe_head *sh)
        }
        rcu_read_unlock();
 
+       /* do we need to request a biofill operation? */
        if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
                sh->ops.count++;
 
-       PRINTK("locked=%d uptodate=%d to_read=%d"
+       PRINTK("locked=%d dirty=%d uptodate=%d to_read=%d"
                " to_write=%d to_fill=%d failed=%d failed_num=%d\n",
-               s.locked, s.uptodate, s.to_read, s.to_write, s.to_fill,
+               s.locked, s.dirty, s.uptodate, s.to_read, s.to_write, s.to_fill,
                s.failed, s.failed_num);
        /* check if the array has lost two devices and, if so, some requests might
         * need to be failed
@@ -2035,50 +2314,8 @@ static void handle_stripe5(struct stripe_head *sh)
                s.syncing = 0;
        }
 
-       /* might be able to return some write requests if the parity block
-        * is safe, or on a failed drive
-        */
-       dev = &sh->dev[sh->pd_idx];
-       if ( s.written &&
-            ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-               test_bit(R5_UPTODATE, &dev->flags))
-              || (s.failed == 1 && s.failed_num == sh->pd_idx))
-           ) {
-           /* any written block on an uptodate or failed drive can be returned.
-            * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
-            * never LOCKED, so we don't need to test 'failed' directly.
-            */
-           for (i=disks; i--; )
-               if (sh->dev[i].written) {
-                   dev = &sh->dev[i];
-                   if (!test_bit(R5_LOCKED, &dev->flags) &&
-                        test_bit(R5_UPTODATE, &dev->flags) ) {
-                       /* We can return any write requests */
-                           struct bio *wbi, *wbi2;
-                           int bitmap_end = 0;
-                           PRINTK("Return write for disc %d\n", i);
-                           spin_lock_irq(&conf->device_lock);
-                           wbi = dev->written;
-                           dev->written = NULL;
-                           while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-                                   wbi2 = r5_next_bio(wbi, dev->sector);
-                                   if (--wbi->bi_phys_segments == 0) {
-                                           md_write_end(conf->mddev);
-                                           wbi->bi_next = return_bi;
-                                           return_bi = wbi;
-                                   }
-                                   wbi = wbi2;
-                           }
-                           if (dev->towrite == NULL)
-                                   bitmap_end = 1;
-                           spin_unlock_irq(&conf->device_lock);
-                           if (bitmap_end)
-                                   bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-                                                   STRIPE_SECTORS,
-                                                   !test_bit(STRIPE_DEGRADED, &sh->state), 0);
-                   }
-               }
-       }
+       /* handle the completion of writes to the backing disks */
+       return_bi = conf->cache_policy->handle_completed_writes(sh, &s);
 
        /* Now we might consider reading some blocks, either to check/generate
         * parity, or to satisfy requests
@@ -2135,7 +2372,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                         * 3/ We hold off parity block re-reads until check
                                         * operations have quiesced.
                                         */
-                                       if ((s.uptodate == disks-1) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+                                       if (((s.uptodate == disks-1) && !s.dirty) &&
+                                               !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
                                                set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
                                                set_bit(R5_Wantcompute, &dev->flags);
                                                sh->ops.target = i;
@@ -2148,7 +2386,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                                 */
                                                s.uptodate++;
                                                break; /* uptodate + compute == disks */
-                                       } else if ((s.uptodate < disks-1) && test_bit(R5_Insync, &dev->flags)) {
+                                       } else if (((s.uptodate < disks-1) || s.dirty) &&
+                                                       test_bit(R5_Insync, &dev->flags)) {
                                                /* Note: we hold off compute operations while checks are in flight,
                                                 * but we still prefer 'compute' over 'read' hence we only read if
                                                 * (uptodate < disks-1)
@@ -2167,158 +2406,20 @@ static void handle_stripe5(struct stripe_head *sh)
                set_bit(STRIPE_HANDLE, &sh->state);
        }
 
-       /* Now we check to see if any write operations have recently
-        * completed
-        */
-
-       /* leave prexor set until postxor is done, allows us to distinguish
-        * a rmw from a rcw during biodrain
-        */
-       if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
-               test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-               clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
-               clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
-               clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
-               for (i=disks; i--;)
-                       clear_bit(R5_Wantprexor, &sh->dev[i].flags);
-       }
-
-       /* if only POSTXOR is set then this is an 'expand' postxor */
-       if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
-               test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
-               clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
+       /* Now we check to see if any blocks are ready to be written to disk */
+       conf->cache_policy->submit_pending_writes(sh, &s);
 
-               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-               clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-
-               /* All the 'written' buffers and the parity block are ready to be
-                * written back to disk
-                */
-               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
-               for (i=disks; i--;) {
-                       dev = &sh->dev[i];
-                       if (test_bit(R5_LOCKED, &dev->flags) &&
-                               (i == sh->pd_idx || dev->written)) {
-                               PRINTK("Writing block %d\n", i);
-                               set_bit(R5_Wantwrite, &dev->flags);
-                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                                       sh->ops.count++;
-                               if (!test_bit(R5_Insync, &dev->flags)
-                                   || (i==sh->pd_idx && s.failed == 0))
-                                       set_bit(STRIPE_INSYNC, &sh->state);
-                       }
-               }
-               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                       atomic_dec(&conf->preread_active_stripes);
-                       if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
-       }
-
-       /* 1/ Now to consider new write requests and what else, if anything should be read
-        * 2/ Check operations clobber the parity block so do not start new writes while
-        *    a check is in flight
-        * 3/ Write operations do not stack
-        */
-       if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-               !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
-               int rmw=0, rcw=0;
-               for (i=disks ; i--;) {
-                       /* would I have to read this buffer for read_modify_write */
-                       dev = &sh->dev[i];
-                       if ((dev->towrite || i == sh->pd_idx) &&
-                           (!test_bit(R5_LOCKED, &dev->flags) 
-                                   ) &&
-                           !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
-                               if (test_bit(R5_Insync, &dev->flags)
-/*                                 && !(!mddev->insync && i == sh->pd_idx) */
-                                       )
-                                       rmw++;
-                               else rmw += 2*disks;  /* cannot read it */
-                       }
-                       /* Would I have to read this buffer for reconstruct_write */
-                           if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-                           (!test_bit(R5_LOCKED, &dev->flags)
-                                   ) &&
-                           !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) {
-                               if (test_bit(R5_Insync, &dev->flags)) rcw++;
-                               else rcw += 2*disks;
-                       }
-               }
-               PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
-                       (unsigned long long)sh->sector, rmw, rcw);
-               set_bit(STRIPE_HANDLE, &sh->state);
-               if (rmw < rcw && rmw > 0)
-                       /* prefer read-modify-write, but need to get some data */
-                       for (i=disks; i--;) {
-                               dev = &sh->dev[i];
-                               if ((dev->towrite || i == sh->pd_idx) &&
-                                   !test_bit(R5_LOCKED, &dev->flags) &&
-                                   !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
-                                   test_bit(R5_Insync, &dev->flags)) {
-                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-                                       {
-                                               PRINTK("Read_old block %d for r-m-w\n", i);
-                                               set_bit(R5_LOCKED, &dev->flags);
-                                               set_bit(R5_Wantread, &dev->flags);
-                                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                                                       sh->ops.count++;
-                                               s.locked++;
-                                       } else {
-                                               set_bit(STRIPE_DELAYED, &sh->state);
-                                               set_bit(STRIPE_HANDLE, &sh->state);
-                                       }
-                               }
-                       }
-               if (rcw <= rmw && rcw > 0)
-                       /* want reconstruct write, but need to get some data */
-                       for (i=disks; i--;) {
-                               dev = &sh->dev[i];
-                               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-                                   !test_bit(R5_LOCKED, &dev->flags) &&
-                                   !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) &&
-                                   test_bit(R5_Insync, &dev->flags)) {
-                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-                                       {
-                                               PRINTK("Read_old block %d for Reconstruct\n", i);
-                                               set_bit(R5_LOCKED, &dev->flags);
-                                               set_bit(R5_Wantread, &dev->flags);
-                                               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                                                       sh->ops.count++;
-                                               s.locked++;
-                                       } else {
-                                               set_bit(STRIPE_DELAYED, &sh->state);
-                                               set_bit(STRIPE_HANDLE, &sh->state);
-                                       }
-                               }
-                       }
-               /* now if nothing is locked, and if we have enough data, we can start a write request */
-               /* since handle_stripe can be called at any time we need to handle the case
-                * where a compute block operation has been submitted and then a subsequent
-                * call wants to start a write request.  raid5_run_ops only handles the case where
-                * compute block and postxor are requested simultaneously.  If this
-                * is not the case then new writes need to be held off until the compute
-                * completes.
-                */
-               if ((s.req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
-                       (s.locked == 0 && (rcw == 0 ||rmw == 0) &&
-                       !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-                       s.locked += handle_write_operations5(sh, rcw == 0, 0);
-       }
+       /* Now to consider new write requests and what else, if anything should be read */
+       conf->cache_policy->handle_new_writes(sh, &s);
 
        /* 1/ Maybe we need to check and possibly fix the parity for this stripe.
         *    Any reads will already have been scheduled, so we just see if enough data
         *    is available.
         * 2/ Hold off parity checks while parity dependent operations are in flight
-        *    (conflicting writes are protected by the 'locked' variable)
+        *    (conflicting writes are protected by the 'locked' and 'dirty' variables)
         */
-       if ((s.syncing && s.locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+       if ((s.syncing && s.locked == 0 && s.dirty == 0 &&
+               !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
                !test_bit(STRIPE_INSYNC, &sh->state)) ||
                test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
                test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
@@ -2451,7 +2552,7 @@ static void handle_stripe5(struct stripe_head *sh)
                /* Need to write out all blocks after computing parity */
                sh->disks = conf->raid_disks;
                sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-               s.locked += handle_write_operations5(sh, 0, 1);
+               s.locked += raid5_wt_cache_handle_parity_updates(sh, 0, 1);
        } else if (s.expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
                atomic_dec(&conf->reshape_stripes);
@@ -2885,8 +2986,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                        set_bit(STRIPE_INSYNC, &sh->state);
 
                        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                               atomic_dec(&conf->preread_active_stripes);
-                               if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+                               atomic_dec(&conf->cache_policy->preread_active_stripes);
+                               if (atomic_read(&conf->cache_policy->preread_active_stripes)
+                                       < IO_THRESHOLD)
                                        md_wakeup_thread(conf->mddev->thread);
                        }
                }
@@ -3164,22 +3266,6 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 
 
 
-static void raid5_activate_delayed(raid5_conf_t *conf)
-{
-       if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
-               while (!list_empty(&conf->delayed_list)) {
-                       struct list_head *l = conf->delayed_list.next;
-                       struct stripe_head *sh;
-                       sh = list_entry(l, struct stripe_head, lru);
-                       list_del_init(l);
-                       clear_bit(STRIPE_DELAYED, &sh->state);
-                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-                               atomic_inc(&conf->preread_active_stripes);
-                       list_add_tail(&sh->lru, &conf->handle_list);
-               }
-       }
-}
-
 static void activate_bit_delay(raid5_conf_t *conf)
 {
        /* device_lock is held */
@@ -3222,14 +3308,17 @@ static void raid5_unplug_device(request_queue_t *q)
 {
        mddev_t *mddev = q->queuedata;
        raid5_conf_t *conf = mddev_to_conf(mddev);
+       struct stripe_cache_policy *cp = conf->cache_policy;
        unsigned long flags;
 
        spin_lock_irqsave(&conf->device_lock, flags);
 
        if (blk_remove_plug(q)) {
                conf->seq_flush++;
-               raid5_activate_delayed(conf);
+               if (cp->unplug_device)
+                       cp->unplug_device(conf);
        }
+
        md_wakeup_thread(mddev->thread);
 
        spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -3944,11 +4033,8 @@ static void raid5d (mddev_t *mddev)
                        activate_bit_delay(conf);
                }
 
-               if (list_empty(&conf->handle_list) &&
-                   atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-                   !blk_queue_plugged(mddev->queue) &&
-                   !list_empty(&conf->delayed_list))
-                       raid5_activate_delayed(conf);
+               if (conf->cache_policy->raid5d)
+                       conf->cache_policy->raid5d(mddev, conf);
 
                while ((bio = remove_bio_from_retry(conf))) {
                        int ok;
@@ -4150,16 +4236,22 @@ static int run(mddev_t *mddev)
                if (!conf->spare_page)
                        goto abort;
        }
+
+       #ifdef CONFIG_RAID5_CACHE_POLICY_WRITE_BACK
+       conf->cache_policy = &raid5_cache_policy_write_back;
+       #else
+       conf->cache_policy = &raid5_cache_policy_write_through;
+       #endif
+
        spin_lock_init(&conf->device_lock);
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
-       INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
        INIT_LIST_HEAD(&conf->inactive_list);
        atomic_set(&conf->active_stripes, 0);
-       atomic_set(&conf->preread_active_stripes, 0);
        atomic_set(&conf->active_aligned_reads, 0);
+       conf->cache_policy->init(conf);
 
        PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 54e2aa2..f00da23 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -224,8 +224,8 @@ struct stripe_head_state {
 #define STRIPE_HANDLE          2
 #define        STRIPE_SYNCING          3
 #define        STRIPE_INSYNC           4
-#define        STRIPE_PREREAD_ACTIVE   5
-#define        STRIPE_DELAYED          6
+#define        STRIPE_PREREAD_ACTIVE   5 /* wt cache state */
+#define        STRIPE_DELAYED          6 /* wt cache state */
 #define        STRIPE_DEGRADED         7
 #define        STRIPE_BIT_DELAY        8
 #define        STRIPE_EXPANDING        9
@@ -276,6 +276,81 @@ struct disk_info {
        mdk_rdev_t      *rdev;
 };
 
+/**
+ * struct stripe_cache_policy - handle writethrough/writeback caching
+ * @post_run_biodrain:
+ *  wb: allows writes to be signalled complete once
+ *      they are in the stripe cache
+ *  wt: NULL
+ * @release_stripe:
+ *  wb: transition inactive stripes with pending data to a dirty list
+ *      rather than the inactive list
+ *  wt: handle delayed stripes and issue pre-read actions.
+ * @submit_pending_writes:
+ *  wb: only writeback when STRIPE_EVICT is set
+ *  wt: always writethrough after postxor completes
+ */
+
+/* wt = write through
+ * wb = write back
+ */
+struct stripe_cache_policy {
+       /* release_stripe - returns '1' if stripe was moved to cache-private list
+        *  else '0'
+        * [ called from __release_stripe under spin_lock_irq(&conf->device_lock) ]
+        * wt: catch 'delayed' stripes and poke the 'preread' state machine
+        * if necessary
+        */
+       int (*release_stripe)(struct raid5_private_data *conf,
+               struct stripe_head *sh, int handle);
+       /* complete_postxor_action
+        * wt: check if this is the end of a rcw/rmw write request and set
+        * the state bits accordingly.  set 'handle' and release.
+        */
+       void (*complete_postxor_action)(void *stripe_head_ref);
+       /* submit_pending_writes
+        * [ called from handle_stripe under spin_lock(&sh->lock) ]
+        * wt: check if 'biodrain' and 'postxor' are complete and schedule writes
+        * to the backing disks
+        */
+       void (*submit_pending_writes)(struct stripe_head *sh,
+               struct stripe_head_state *s);
+       /* handle_new_writes
+        * [ called from handle_stripe under spin_lock(&sh->lock) ]
+        * wt: schedule reads to prepare for a rcw or rmw operation.  once preread
+        * data is available lock the blocks and schedule '[prexor]+biodrain+postxor'
+        */
+       void (*handle_new_writes)(struct stripe_head *sh,
+               struct stripe_head_state *s);
+       /* handle_completed_writes
+        * [ called from handle_stripe under spin_lock(&sh->lock) ]
+        * wt: call bi_end_io on all written blocks and perform general md/bitmap
+        * post write housekeeping.
+        */
+       struct bio *(*handle_completed_writes)(struct stripe_head *sh,
+               struct stripe_head_state *s);
+       /* raid5d
+        * wt: check for stripes that can be taken off the delayed list
+        */
+       void (*raid5d)(mddev_t *mddev, struct raid5_private_data *conf);
+       /* init
+        * wt: initialize 'delayed_list' and 'preread_active_stripes'
+        * wb: initialize 'dirty_list' and 'dirty_stripes'
+        */
+       void (*init)(struct raid5_private_data *conf);
+       /* unplug_device
+        * [ called from raid5_unplug_device under spin_lock_irqsave(&conf->device_lock) ]
+        * wt: activate stripes on the delayed list
+        */
+       void (*unplug_device)(struct raid5_private_data *conf);
+       union {
+               struct list_head delayed_list; /* wt: stripes that have plugged requests */
+       };
+       union {
+               atomic_t preread_active_stripes;
+       };
+};
+
 struct raid5_private_data {
        struct hlist_head       *stripe_hashtbl;
        mddev_t                 *mddev;
@@ -284,6 +359,7 @@ struct raid5_private_data {
        int                     max_degraded;
        int                     raid_disks;
        int                     max_nr_stripes;
+       struct stripe_cache_policy *cache_policy;
 
        /* used during an expand */
        sector_t                expand_progress;        /* MaxSector when no expand happening */
@@ -293,11 +369,9 @@ struct raid5_private_data {
        int                     previous_raid_disks;
 
        struct list_head        handle_list; /* stripes needing handling */
-       struct list_head        delayed_list; /* stripes that have plugged requests */
        struct list_head        bitmap_list; /* stripes delaying awaiting bitmap update */
        struct bio              *retry_read_aligned; /* currently retrying aligned bios   */
        struct bio              *retry_read_aligned_list; /* aligned bios retry list  */
-       atomic_t                preread_active_stripes; /* stripes with scheduled io */
        atomic_t                active_aligned_reads;

        atomic_t                reshape_stripes; /* stripes with pending writes for reshape */
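
[ Not part of the patch: to illustrate the interface, a minimal sketch of
how a write-back policy's ->release_stripe() might park dirty stripes.  It
assumes a 'dirty_list' union member (per the ->init() kernel-doc above) and
a hypothetical stripe_has_pending_data() helper, and it elides the preread
and expand bookkeeping that the write-through version performs. ]

	static int raid5_wb_cache_release_stripe(raid5_conf_t *conf,
		struct stripe_head *sh, int handle)
	{
		struct stripe_cache_policy *cp = conf->cache_policy;

		if (!handle) {
			atomic_dec(&conf->active_stripes);
			if (stripe_has_pending_data(sh)) {
				/* keep dirty data in the cache rather than
				 * releasing the stripe to the inactive list
				 */
				list_add_tail(&sh->lru, &cp->dirty_list);
			} else {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
			}
			return 1; /* handled; skip the generic path */
		}

		return 0; /* let the generic STRIPE_HANDLE path run */
	}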