CVSROOT:        /cvs/cluster
Module name:    cluster
Branch:         RHEL4
Changes by:     [EMAIL PROTECTED]       2007-09-26 03:15:41

Modified files:
        cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
                            dm-cmirror-server.c 

Log message:
        Bug 291521: Cluster mirror can become out-of-sync if nominal I/O 
overl...
        
        Another touch-up for this bug.
        
        Bad news:
        Because a node can cache the state of a region indefinitely (especially 
for
        blocks that are used a lot - e.g. a journaling area of a file system), 
we must
        deny writes to any region of the mirror that is not yet recovered.  
This is only
        the case with cluster mirroring.  This means poor performance of 
nominal I/O
        during recovery - probably really bad performance.  However, this is 
absolutely
        necessary for mirror reliability.
        
        Good news:
        The time I spent coding different fixes for this bug wasn't a complete 
waste.
        I've been able to reuse some of that code to optimize the recovery 
process.
        Now, rather than going through the mirror from front to back, it skips 
ahead to
        recover regions that have pending writes.  Bottom line: performance 
will be bad
        during recovery, but it will be better than RHEL4.5.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.52&r2=1.1.2.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c        2007/09/21 
20:07:37     1.1.2.52
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c        2007/09/26 
03:15:40     1.1.2.53
@@ -142,6 +142,7 @@
        lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
        lc->recovering_region = (uint64_t)-1;
+       lc->recovering_next = (uint64_t)-1;
        lc->sync_search = 0;
        log->context = lc;
        return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h        2007/04/10 
07:12:24     1.1.2.14
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h        2007/09/26 
03:15:40     1.1.2.15
@@ -98,6 +98,7 @@
        uint32_t *clean_bits;
        uint32_t *sync_bits;
        uint64_t recovering_region;
+       uint64_t recovering_next;
 
        int sync_pass;          /* number of passes attempting to resync */
        int sync_search;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c        2007/09/21 
20:07:37     1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c        2007/09/26 
03:15:40     1.1.2.39
@@ -446,18 +446,25 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request 
*lr)
 {
-       uint64_t high, low;
+       lr->u.lr_int_rtn = 0;
 
-       high = lc->sync_search + 10;
-       low = (lc->recovering_region != (uint64_t)-1) ?
-               lc->recovering_region :
-               lc->sync_search;
-       if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
-               DMDEBUG("Remote recovery conflict: %Lu/%s",
-                       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+       if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
+               return 0;
+
+       /*
+        * If the region hasn't been recovered yet,
+        * we need to block the write
+        */
+       if (!log_test_bit(lc->sync_bits, lr->u.lr_region) ||
+           (lc->recovering_region == lr->u.lr_region)) {
                lr->u.lr_int_rtn = 1;
-       } else
-               lr->u.lr_int_rtn = 0;
+
+               /* Try to make this region a priority */
+               if ((lr->u.lr_region != lc->recovering_region) &&
+                   (lc->recovering_next == (uint64_t)-1))
+                       lc->recovering_next = lr->u.lr_region;
+               return 0;
+       }
 
        return 0;
 }
@@ -542,7 +549,9 @@
        }
 
        if (!find_ru_by_region(lc, lr->u.lr_region)) {
-               log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+               /* Only clear the region if it is also in sync */
+               if (log_test_bit(lc->sync_bits, lr->u.lr_region))
+                       log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
        } else if (check_bug) {
                DMERR("Multiple marks exist on a region being recovered: 
%Lu/%s",
                      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -608,26 +617,45 @@
                }
        }
 
-       *region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-                                         lc->region_count,
-                                         lc->sync_search);
-       if (find_ru_by_region(lc, *region)) {
-               DMDEBUG("Recovery blocked by outstanding write on region 
%Lu/%s",
-                       *region, lc->uuid + (strlen(lc->uuid) - 8));
-               return 0;
-       }
+       DMDEBUG("Priority recovery region: %Lu/%s",
+               lc->recovering_next, lc->uuid + (strlen(lc->uuid) - 8));
 
-       if (*region >= lc->region_count)
-               return 0;
+       if ((lc->recovering_next != (uint64_t)-1) &&
+           (!log_test_bit(lc->sync_bits, lc->recovering_next))) {
+               new = mempool_alloc(region_user_pool, GFP_NOFS);
+               if (!new)
+                       return -ENOMEM;
+               *region = lc->recovering_region = lc->recovering_next;
+               DMDEBUG("Preempting normal recovery work for preferred 
region...");
+       } else {
+               *region = ext2_find_next_zero_bit((unsigned long *) 
lc->sync_bits,
+                                                 lc->region_count,
+                                                 lc->sync_search);
+               if (find_ru_by_region(lc, *region)) {
+                       /*
+                        * We disallow writes to regions that have not yet been
+                        * recovered via is_remote_recovering(), so this should
+                        * not happen.
+                        */
+                       DMERR("Recovery blocked by outstanding write on region 
%Lu/%s",
+                             *region, lc->uuid + (strlen(lc->uuid) - 8));
+                       BUG();
+                       return 0;
+               }
 
-       new = mempool_alloc(region_user_pool, GFP_NOFS);
-       if (!new)
-               return -ENOMEM;
+               if (*region >= lc->region_count)
+                       return 0;
 
-       lc->sync_search = *region + 1;
+               new = mempool_alloc(region_user_pool, GFP_NOFS);
+               if (!new)
+                       return -ENOMEM;
 
-       lc->recovering_region = *region;
+               lc->sync_search = *region + 1;
+
+               lc->recovering_region = *region;
+       }
 
+       lc->recovering_next = (uint64_t)-1;
        lr->u.lr_int_rtn = 1; /* Assigning work */
        new->ru_nodeid = who;
        new->ru_region = *region;

Reply via email to