During some testing of new development work in a package that uses the
checkpoint service, it was found that sometimes the global id used to
keep track of unlinked checkpoints was not synchronized.
This scenario would occur when a checkpoint was created on a node and
then unlinked and closed. Each node would now have a global id of 1.
If a new node was started at this point, that new node would have a
global id of 0. The state machine would then lose synchronization,
causing all kinds of bad behavior.
Prior to this patch, the checkpoint global id was only synchronized if
checkpoints existed in the system. This patch now sends a "fake
checkpoint" which triggers the synchronization of the global id (which
occurs every time a checkpoint is synchronized).
This patch does not break protocol compatibility.
Regards
-steve
Index: exec/ckpt.c
===================================================================
--- exec/ckpt.c (revision 1547)
+++ exec/ckpt.c (working copy)
@@ -65,6 +65,8 @@
#include "totempg.h"
#include "print.h"
+#define GLOBALID_CHECKPOINT_NAME "global_checkpoint_name_do_not_use_in_an_application"
+
#define CKPT_MAX_SECTION_DATA_SEND (1024*400)
enum ckpt_message_req_types {
@@ -92,6 +94,7 @@
};
enum sync_state {
+ SYNC_STATE_GLOBALID,
SYNC_STATE_CHECKPOINT,
SYNC_STATE_REFCOUNT
};
@@ -768,6 +771,7 @@
{
unsigned int i, j;
unsigned int lowest_nodeid;
+ static int first_configuration = 1;
my_should_sync = 0;
@@ -792,9 +796,12 @@
sizeof (unsigned int) * member_list_entries);
my_old_member_list_entries = member_list_entries;
- if (lowest_nodeid == totempg_my_nodeid_get()) {
+ if ((first_configuration) ||
+ (lowest_nodeid == totempg_my_nodeid_get())) {
+
my_should_sync = 1;
}
+ first_configuration = 0;
}
}
@@ -3319,14 +3326,14 @@
list_init (ckpt_list_head);
}
-static inline void sync_checkpoints_enter (void)
+static inline void sync_gloalid_enter (void)
{
struct checkpoint *checkpoint;
ENTER();
- my_sync_state = SYNC_STATE_CHECKPOINT;
- my_iteration_state = ITERATION_STATE_CHECKPOINT;
+ my_sync_state = SYNC_STATE_GLOBALID;
+
my_iteration_state_checkpoint_list = checkpoint_list_head.next;
checkpoint = list_entry (checkpoint_list_head.next, struct checkpoint,
@@ -3336,6 +3343,16 @@
LEAVE();
}
+/* Switch the sync state machine over to the checkpoint phase. */
+static inline void sync_checkpoints_enter (void)
+{
+	ENTER();
+
+	my_iteration_state = ITERATION_STATE_CHECKPOINT;
+	my_sync_state = SYNC_STATE_CHECKPOINT;
+	LEAVE();
+}
+
static inline void sync_refcounts_enter (void)
{
my_sync_state = SYNC_STATE_REFCOUNT;
@@ -3347,7 +3364,7 @@
{
ENTER();
- sync_checkpoints_enter();
+ sync_gloalid_enter();
LEAVE();
}
@@ -3387,6 +3404,19 @@
return (totempg_groups_mcast_joined (openais_group_handle, &iovec, 1, TOTEMPG_AGREED));
}
+/* Pass a placeholder checkpoint carrying global_ckpt_id to sync_checkpoint_transmit(); returns its result. */
+static int sync_checkpoint_globalid_transmit (void)
+{
+	struct checkpoint checkpoint;
+
+	/* Zero all fields first: only name and ckpt_id are filled in below. */
+	memset (&checkpoint, 0, sizeof (checkpoint));
+	strcpy ((char *)checkpoint.name.value, GLOBALID_CHECKPOINT_NAME);
+	checkpoint.name.length = strlen (GLOBALID_CHECKPOINT_NAME);
+	checkpoint.ckpt_id = global_ckpt_id;
+
+	return (sync_checkpoint_transmit(&checkpoint));
+}
static int sync_checkpoint_section_transmit (
struct checkpoint *checkpoint,
struct checkpoint_section *checkpoint_section)
@@ -3566,6 +3596,20 @@
continue_processing = 0;
switch (my_sync_state) {
+ case SYNC_STATE_GLOBALID:
+ done_queueing = 1;
+ continue_processing = 1;
+ if (my_should_sync) {
+ res = sync_checkpoint_globalid_transmit ();
+ if (res != 0) {
+ done_queueing = 0;
+ }
+ }
+ if (done_queueing) {
+ sync_checkpoints_enter ();
+ }
+ break;
+
case SYNC_STATE_CHECKPOINT:
done_queueing = 1;
continue_processing = 1;
@@ -3643,6 +3687,22 @@
return;
}
+	/*
+	 * Consume the global_ckpt_id placeholder checkpoint; only a full-length
+	 * name match qualifies (a prefix or zero-length name must not match).
+	 */
+	if ((req_exec_ckpt_sync_checkpoint->checkpoint_name.length ==
+		strlen (GLOBALID_CHECKPOINT_NAME)) &&
+		(memcmp (&req_exec_ckpt_sync_checkpoint->checkpoint_name.value,
+		GLOBALID_CHECKPOINT_NAME,
+		strlen (GLOBALID_CHECKPOINT_NAME)) == 0)) {
+		if (req_exec_ckpt_sync_checkpoint->ckpt_id >= global_ckpt_id) {
+			global_ckpt_id = req_exec_ckpt_sync_checkpoint->ckpt_id + 1;
+		}
+		LEAVE();
+		return;
+	}
+
checkpoint = checkpoint_find_specific (
&sync_checkpoint_list_head,
&req_exec_ckpt_sync_checkpoint->checkpoint_name,
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais