In a certain rare scenario, the checkpoint service throws away the
current checkpoint database.
One example: in a cluster of 3 nodes A, B, and C, nodes A and C are
killed and node B then syncs. After this completes, node C is started
and node B again begins resyncing, but during this sync process node A
starts up.
This results in node B no longer believing it is required to sync its
current database contents. The abort called on node B throws away all
checkpoints in the system, but since node B is no longer the lowest node
id in the system, it believes it doesn't have to sync.
The design change is that once a node has been declared responsible
for synchronization, no abort or configuration change will ever alter
the fact that the node is still responsible for synchronization.
Regards
-steve
Index: exec/ckpt.c
===================================================================
--- exec/ckpt.c (revision 1658)
+++ exec/ckpt.c (working copy)
@@ -94,6 +94,8 @@
};
enum sync_state {
+ SYNC_STATE_NOT_STARTED,
+ SYNC_STATE_STARTED,
SYNC_STATE_GLOBALID,
SYNC_STATE_CHECKPOINT,
SYNC_STATE_REFCOUNT
@@ -375,7 +377,7 @@
static mar_uint32_t global_ckpt_id = 0;
-static enum sync_state my_sync_state;
+static enum sync_state my_sync_state = SYNC_STATE_NOT_STARTED;
static enum iteration_state my_iteration_state;
@@ -766,6 +768,8 @@
mar_refcount_set_t refcount_set[PROCESSOR_COUNT_MAX] __attribute__((aligned(8)));
};
+static int first_configuration = 1;
+
/*
* Implementation
*/
@@ -778,8 +782,18 @@
{
unsigned int i, j;
unsigned int lowest_nodeid;
- static int first_configuration = 1;
+ memcpy (&my_saved_ring_id, ring_id,
+ sizeof (struct memb_ring_id));
+ if (configuration_type != TOTEM_CONFIGURATION_REGULAR) {
+ return;
+ }
+ if (my_sync_state != SYNC_STATE_NOT_STARTED) {
+ return;
+ }
+
+ my_sync_state = SYNC_STATE_STARTED;
+
my_should_sync = 0;
/*
@@ -787,29 +801,25 @@
*/
lowest_nodeid = 0xffffffff;
- if (configuration_type == TOTEM_CONFIGURATION_REGULAR) {
- for (i = 0; i < my_old_member_list_entries; i++) {
- for (j = 0; j < member_list_entries; j++) {
- if (my_old_member_list[i] == member_list[j]) {
- if (lowest_nodeid > member_list[j]) {
- lowest_nodeid = member_list[j];
- }
+ for (i = 0; i < my_old_member_list_entries; i++) {
+ for (j = 0; j < member_list_entries; j++) {
+ if (my_old_member_list[i] == member_list[j]) {
+ if (lowest_nodeid > member_list[j]) {
+ lowest_nodeid = member_list[j];
}
}
}
- memcpy (&my_saved_ring_id, ring_id,
- sizeof (struct memb_ring_id));
- memcpy (my_old_member_list, member_list,
- sizeof (unsigned int) * member_list_entries);
- my_old_member_list_entries = member_list_entries;
+ }
+ memcpy (my_old_member_list, member_list,
+ sizeof (unsigned int) * member_list_entries);
+ my_old_member_list_entries = member_list_entries;
- if ((first_configuration) ||
- (lowest_nodeid == totempg_my_nodeid_get())) {
+ if ((first_configuration) ||
+ (lowest_nodeid == totempg_my_nodeid_get())) {
- my_should_sync = 1;
- }
- first_configuration = 0;
+ my_should_sync = 1;
}
+ first_configuration = 0;
}
static struct checkpoint *checkpoint_find (
@@ -3691,6 +3701,9 @@
}
}
break;
+
+ default:
+ assert (0);
}
LEAVE();
@@ -3711,7 +3724,7 @@
list_init (&sync_checkpoint_list_head);
- my_sync_state = SYNC_STATE_CHECKPOINT;
+ my_sync_state = SYNC_STATE_NOT_STARTED;
LEAVE();
}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais