CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: [EMAIL PROTECTED] 2007-09-26 03:15:41
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h
dm-cmirror-server.c
Log message:
Bug 291521: Cluster mirror can become out-of-sync if nominal I/O
overl...
Another touch-up for this bug.
Bad news:
Because a node can cache the state of a region indefinitely (especially
for
blocks that are used alot - e.g. a journaling area of a file system),
we must
deny writes to any region of the mirror that is not yet recovered.
This is only
the case with cluster mirroring. This means poor performance of
nominal I/O
during recovery - probably really bad performance. However, this is
absolutely
necessary for mirror reliability.
Good news:
The time I spent coding different fixes for this bug weren't a complete
waste.
I've been able to reuse some of that code to optimize the recovery
process.
Now, rather than going through the mirror from front to back, it skips
ahead to
recover regions that have pending writes. Bottom line: performance
will be bad
during recovery, but it will be better than RHEL4.5.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.52&r2=1.1.2.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/09/21
20:07:37 1.1.2.52
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/09/26
03:15:40 1.1.2.53
@@ -142,6 +142,7 @@
lc->sync_count = (sync == NOSYNC) ? region_count : 0;
lc->recovering_region = (uint64_t)-1;
+ lc->recovering_next = (uint64_t)-1;
lc->sync_search = 0;
log->context = lc;
return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/04/10
07:12:24 1.1.2.14
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/09/26
03:15:40 1.1.2.15
@@ -98,6 +98,7 @@
uint32_t *clean_bits;
uint32_t *sync_bits;
uint64_t recovering_region;
+ uint64_t recovering_next;
int sync_pass; /* number of passes attempting to resync */
int sync_search;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/09/21
20:07:37 1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/09/26
03:15:40 1.1.2.39
@@ -446,18 +446,25 @@
static int server_is_remote_recovering(struct log_c *lc, struct log_request
*lr)
{
- uint64_t high, low;
+ lr->u.lr_int_rtn = 0;
- high = lc->sync_search + 10;
- low = (lc->recovering_region != (uint64_t)-1) ?
- lc->recovering_region :
- lc->sync_search;
- if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
- DMDEBUG("Remote recovery conflict: %Lu/%s",
- lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+ if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
+ return 0;
+
+ /*
+ * If the region hasn't been recovered yet,
+ * we need to block the write
+ */
+ if (!log_test_bit(lc->sync_bits, lr->u.lr_region) ||
+ (lc->recovering_region == lr->u.lr_region)) {
lr->u.lr_int_rtn = 1;
- } else
- lr->u.lr_int_rtn = 0;
+
+ /* Try to make this region a priority */
+ if ((lr->u.lr_region != lc->recovering_region) &&
+ (lc->recovering_next == (uint64_t)-1))
+ lc->recovering_next = lr->u.lr_region;
+ return 0;
+ }
return 0;
}
@@ -542,7 +549,9 @@
}
if (!find_ru_by_region(lc, lr->u.lr_region)) {
- log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+ /* Only clear the region if it is also in sync */
+ if (log_test_bit(lc->sync_bits, lr->u.lr_region))
+ log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
} else if (check_bug) {
DMERR("Multiple marks exist on a region being recovered:
%Lu/%s",
lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -608,26 +617,45 @@
}
}
- *region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
- lc->region_count,
- lc->sync_search);
- if (find_ru_by_region(lc, *region)) {
- DMDEBUG("Recovery blocked by outstanding write on region
%Lu/%s",
- *region, lc->uuid + (strlen(lc->uuid) - 8));
- return 0;
- }
+ DMDEBUG("Priority recovery region: %Lu/%s",
+ lc->recovering_next, lc->uuid + (strlen(lc->uuid) - 8));
- if (*region >= lc->region_count)
- return 0;
+ if ((lc->recovering_next != (uint64_t)-1) &&
+ (!log_test_bit(lc->sync_bits, lc->recovering_next))) {
+ new = mempool_alloc(region_user_pool, GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+ *region = lc->recovering_region = lc->recovering_next;
+ DMDEBUG("Preempting normal recovery work for preferred
region...");
+ } else {
+ *region = ext2_find_next_zero_bit((unsigned long *)
lc->sync_bits,
+ lc->region_count,
+ lc->sync_search);
+ if (find_ru_by_region(lc, *region)) {
+ /*
+ * We disallow writes to regions that have not yet been
+ * recovered via is_remote_recovering(), so this should
+ * not happen.
+ */
+ DMERR("Recovery blocked by outstanding write on region
%Lu/%s",
+ *region, lc->uuid + (strlen(lc->uuid) - 8));
+ BUG();
+ return 0;
+ }
- new = mempool_alloc(region_user_pool, GFP_NOFS);
- if (!new)
- return -ENOMEM;
+ if (*region >= lc->region_count)
+ return 0;
- lc->sync_search = *region + 1;
+ new = mempool_alloc(region_user_pool, GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
- lc->recovering_region = *region;
+ lc->sync_search = *region + 1;
+
+ lc->recovering_region = *region;
+ }
+ lc->recovering_next = (uint64_t)-1;
lr->u.lr_int_rtn = 1; /* Assigning work */
new->ru_nodeid = who;
new->ru_region = *region;