From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_COMPUTE_BLK in sh->ops.pending to request
servicing from raid5_run_ops.  It also sets the R5_Wantcompute flag on the
block being computed, so that other parts of handle_stripe can treat that
block as if it were already up to date and submit dependent operations.
raid5_run_ops guarantees that the compute operation completes before any
dependent operation starts.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  125 +++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2390657..279a30c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh)
        int i;
        int syncing, expanding, expanded;
        int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-       int non_overwrite = 0;
+       int compute=0, req_compute=0, non_overwrite=0;
        int failed_num=0;
        struct r5dev *dev;
        unsigned long pending=0;
@@ -2032,8 +2032,8 @@ static void handle_stripe5(struct stripe_head *sh)
                /* now count some things */
                if (test_bit(R5_LOCKED, &dev->flags)) locked++;
                if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+               if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
-               
                if (dev->toread) to_read++;
                if (dev->towrite) {
                        to_write++;
@@ -2188,31 +2188,82 @@ static void handle_stripe5(struct stripe_head *sh)
         * parity, or to satisfy requests
         * or to load a block that is being partially written.
         */
-       if (to_read || non_overwrite || (syncing && (uptodate < disks)) || 
expanding) {
-               for (i=disks; i--;) {
-                       dev = &sh->dev[i];
-                       if (!test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
-                           (dev->toread ||
-                            (dev->towrite && !test_bit(R5_OVERWRITE, 
&dev->flags)) ||
-                            syncing ||
-                            expanding ||
-                            (failed && (sh->dev[failed_num].toread ||
-                                        (sh->dev[failed_num].towrite && 
!test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
-                                   )
-                               ) {
-                               /* we would like to get this block, possibly
-                                * by computing it, but we might not be able to
+       if (to_read || non_overwrite || (syncing && (uptodate + compute < 
disks)) || expanding ||
+               test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+
+               /* Clear completed compute operations.  Parity recovery
+                * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is 
handled
+                * later on in this routine
+                */
+               if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+                       !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+                       clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+                       clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+                       clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+               }
+
+               /* look for blocks to read/compute, skip this if a compute
+                * is already in flight, or if the stripe contents are in the
+                * midst of changing due to a write
+                */
+               if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+                       !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+                       !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+                       for (i=disks; i--;) {
+                               dev = &sh->dev[i];
+
+                               /* don't schedule compute operations or reads on
+                                * the parity block while a check is in flight
                                 */
-                               if (uptodate == disks-1) {
-                                       PRINTK("Computing block %d\n", i);
-                                       compute_block(sh, i);
-                                       uptodate++;
-                               } else if (test_bit(R5_Insync, &dev->flags)) {
-                                       set_bit(R5_LOCKED, &dev->flags);
-                                       set_bit(R5_Wantread, &dev->flags);
-                                       locked++;
-                                       PRINTK("Reading block %d (sync=%d)\n", 
-                                               i, syncing);
+                               if ((i == sh->pd_idx) && 
test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+                                       continue;
+
+                               if (!test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
+                                    (dev->toread ||
+                                    (dev->towrite && !test_bit(R5_OVERWRITE, 
&dev->flags)) ||
+                                    syncing ||
+                                    expanding ||
+                                    (failed && (sh->dev[failed_num].toread ||
+                                                (sh->dev[failed_num].towrite &&
+                                                       !test_bit(R5_OVERWRITE, 
&sh->dev[failed_num].flags))))
+                                           )
+                                       ) {
+                                       /* 1/ We would like to get this block, 
possibly
+                                        * by computing it, but we might not be 
able to.
+                                        *
+                                        * 2/ Since parity check operations 
potentially
+                                        * make the parity block !uptodate it 
will need
+                                        * to be refreshed before any compute 
operations
+                                        * on data disks are scheduled.
+                                        *
+                                        * 3/ We hold off parity block re-reads 
until check
+                                        * operations have quiesced.
+                                        */
+                                       if ((uptodate == disks-1) && 
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+                                               set_bit(STRIPE_OP_COMPUTE_BLK, 
&sh->ops.pending);
+                                               set_bit(R5_Wantcompute, 
&dev->flags);
+                                               sh->ops.target = i;
+                                               BUG_ON(req_compute++);
+                                               sh->ops.count++;
+                                               /* Careful: from this point on 
'uptodate' is in the eye of
+                                                * raid5_run_ops which services 
'compute' operations before
+                                                * writes. R5_Wantcompute flags 
a block that will be R5_UPTODATE
+                                                * by the time it is needed for 
a subsequent operation.
+                                                */
+                                               uptodate++;
+                                       } else if ((uptodate < disks-1) && 
test_bit(R5_Insync, &dev->flags)) {
+                                               /* Note: we hold off compute 
operations while checks are in flight,
+                                                * but we still prefer 
'compute' over 'read' hence we only read if
+                                                * (uptodate < disks-1)
+                                                */
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, 
&dev->flags);
+                                               if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                                                       sh->ops.count++;
+                                               locked++;
+                                               PRINTK("Reading block %d 
(sync=%d)\n",
+                                                       i, syncing);
+                                       }
                                }
                        }
                }
@@ -2287,7 +2338,7 @@ static void handle_stripe5(struct stripe_head *sh)
                        if ((dev->towrite || i == sh->pd_idx) &&
                            (!test_bit(R5_LOCKED, &dev->flags) 
                                    ) &&
-                           !test_bit(R5_UPTODATE, &dev->flags)) {
+                           !(test_bit(R5_UPTODATE, &dev->flags) || 
test_bit(R5_Wantcompute, &dev->flags))) {
                                if (test_bit(R5_Insync, &dev->flags)
 /*                                 && !(!mddev->insync && i == sh->pd_idx) */
                                        )
@@ -2298,7 +2349,7 @@ static void handle_stripe5(struct stripe_head *sh)
                        if (!test_bit(R5_OVERWRITE, &dev->flags) && i != 
sh->pd_idx &&
                            (!test_bit(R5_LOCKED, &dev->flags) 
                                    ) &&
-                           !test_bit(R5_UPTODATE, &dev->flags)) {
+                           !(test_bit(R5_UPTODATE, &dev->flags) || 
test_bit(R5_Wantcompute, &dev->flags))) {
                                if (test_bit(R5_Insync, &dev->flags)) rcw++;
                                else rcw += 2*disks;
                        }
@@ -2311,7 +2362,8 @@ static void handle_stripe5(struct stripe_head *sh)
                        for (i=disks; i--;) {
                                dev = &sh->dev[i];
                                if ((dev->towrite || i == sh->pd_idx) &&
-                                   !test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
+                                   !test_bit(R5_LOCKED, &dev->flags) &&
+                                   !(test_bit(R5_UPTODATE, &dev->flags) || 
test_bit(R5_Wantcompute, &dev->flags)) &&
                                    test_bit(R5_Insync, &dev->flags)) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state))
                                        {
@@ -2330,7 +2382,8 @@ static void handle_stripe5(struct stripe_head *sh)
                        for (i=disks; i--;) {
                                dev = &sh->dev[i];
                                if (!test_bit(R5_OVERWRITE, &dev->flags) && i 
!= sh->pd_idx &&
-                                   !test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
+                                   !test_bit(R5_LOCKED, &dev->flags) &&
+                                   !(test_bit(R5_UPTODATE, &dev->flags) || 
test_bit(R5_Wantcompute, &dev->flags)) &&
                                    test_bit(R5_Insync, &dev->flags)) {
                                        if (test_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state))
                                        {
@@ -2345,8 +2398,16 @@ static void handle_stripe5(struct stripe_head *sh)
                                }
                        }
                /* now if nothing is locked, and if we have enough data, we can 
start a write request */
-               if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-                   !test_bit(STRIPE_BIT_DELAY, &sh->state))
+               /* since handle_stripe can be called at any time we need to 
handle the case
+                * where a compute block operation has been submitted and then 
a subsequent
+                * call wants to start a write request.  raid5_run_ops only 
handles the case where
+                * compute block and postxor are requested simultaneously.  If 
this
+                * is not the case then new writes need to be held off until 
the compute
+                * completes.
+                */
+               if ((req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, 
&sh->ops.pending)) &&
+                       (locked == 0 && (rcw == 0 ||rmw == 0) &&
+                       !test_bit(STRIPE_BIT_DELAY, &sh->state)))
                        locked += handle_write_operations5(sh, rcw, 0);
        }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to