In a certain rare scenario, the checkpoint service throws away the
current checkpoint database.

An example of when this occurs: in a cluster of 3 nodes A, B, and C,
nodes A and C are killed and then node B syncs.  After this completes,
node C is started and node B begins resyncing again, but during this
sync process node A starts up.

This results in node B no longer believing it is required to sync its
current database contents.  The abort called on node B throws away all
checkpoints in the system, but since node B no longer has the lowest
node id in the system, it believes it doesn't have to sync.

The design change is that once a node has been declared responsible
for synchronization, any aborts or configuration changes will never
change the fact that the node is still responsible for synchronization.

Regards
-steve
Index: exec/ckpt.c
===================================================================
--- exec/ckpt.c	(revision 1658)
+++ exec/ckpt.c	(working copy)
@@ -94,6 +94,8 @@
 };
 
 enum sync_state {
+	SYNC_STATE_NOT_STARTED,
+	SYNC_STATE_STARTED,
 	SYNC_STATE_GLOBALID,
 	SYNC_STATE_CHECKPOINT,
 	SYNC_STATE_REFCOUNT
@@ -375,7 +377,7 @@
 
 static mar_uint32_t global_ckpt_id = 0;
 
-static enum sync_state my_sync_state;
+static enum sync_state my_sync_state = SYNC_STATE_NOT_STARTED;
 
 static enum iteration_state my_iteration_state;
 
@@ -766,6 +768,8 @@
 	mar_refcount_set_t refcount_set[PROCESSOR_COUNT_MAX] __attribute__((aligned(8)));
 };
 
+static int first_configuration = 1;
+
 /*
  * Implementation
  */
@@ -778,8 +782,18 @@
 {
 	unsigned int i, j;
 	unsigned int lowest_nodeid;
-	static int first_configuration = 1;
 
+	memcpy (&my_saved_ring_id, ring_id,
+		sizeof (struct memb_ring_id));
+       if (configuration_type != TOTEM_CONFIGURATION_REGULAR) {
+                return;
+        }
+        if (my_sync_state != SYNC_STATE_NOT_STARTED) {
+                return;
+        }
+
+	my_sync_state = SYNC_STATE_STARTED;
+
 	my_should_sync = 0;
 
 	/*
@@ -787,29 +801,25 @@
 	 */
 	lowest_nodeid = 0xffffffff;
 
-	if (configuration_type == TOTEM_CONFIGURATION_REGULAR) {
-		for (i = 0; i < my_old_member_list_entries; i++) {
-			for (j = 0; j < member_list_entries; j++) {
-				if (my_old_member_list[i] == member_list[j]) {
-					if (lowest_nodeid > member_list[j]) {
-						lowest_nodeid = member_list[j];
-					}
+	for (i = 0; i < my_old_member_list_entries; i++) {
+		for (j = 0; j < member_list_entries; j++) {
+			if (my_old_member_list[i] == member_list[j]) {
+				if (lowest_nodeid > member_list[j]) {
+					lowest_nodeid = member_list[j];
 				}
 			}
 		}
-		memcpy (&my_saved_ring_id, ring_id,
-			sizeof (struct memb_ring_id));
-		memcpy (my_old_member_list, member_list,
-			sizeof (unsigned int) * member_list_entries);
-		my_old_member_list_entries = member_list_entries;
+	}
+	memcpy (my_old_member_list, member_list,
+		sizeof (unsigned int) * member_list_entries);
+	my_old_member_list_entries = member_list_entries;
 
-		if ((first_configuration) ||
-			(lowest_nodeid == totempg_my_nodeid_get())) {
+	if ((first_configuration) ||
+		(lowest_nodeid == totempg_my_nodeid_get())) {
 
-			my_should_sync = 1;
-		}
-		first_configuration = 0;
+		my_should_sync = 1;
 	}
+	first_configuration = 0;
 }
 
 static struct checkpoint *checkpoint_find (
@@ -3691,6 +3701,9 @@
 			}
 		}
 		break;
+
+	default:
+		assert (0);
 	}
 
 	LEAVE();
@@ -3711,7 +3724,7 @@
 
 	list_init (&sync_checkpoint_list_head);
 
-	my_sync_state = SYNC_STATE_CHECKPOINT;
+	my_sync_state = SYNC_STATE_NOT_STARTED;
 
 	LEAVE();
 }
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to