Hi,

In my cluster implementation, communication with a node can be lost while that
node is performing a graceful failover, i.e. a node-leave arrives during a
node-failover, and aisexec then hits an assert. Unfortunately, I cannot avoid
the connectivity loss, so I've changed AIS to handle this case.

Attached is the patch. Please let me know if you have any feedback.

The change is: ensure that any remaining instantiated components of the
leaving/failing-over node are set to terminating (i.e. skip the stepwise,
per-instantiation-level termination), then invoke amf_comp_cleanup_completed()
for all of the node's terminating components (since the node has left, no
CLEANUP COMPLETED message will be received). All of the node's components should
then be UNINSTANTIATED, which will trigger workload reassignment. The cluster
should then be in a stable state to continue the node_leave operation.
Also, in acsm_enter_activating_standby(), prevent the failing-over node from
assigning the active workload to a node that's leaving.

Thanks,
Tim
diff --git a/exec/amf.h b/exec/amf.h
index 1001ec2..105b1d5 100644
--- a/exec/amf.h
+++ b/exec/amf.h
@@ -877,6 +877,7 @@ extern SaAmfReadinessStateT amf_su_get_saAmfSUReadinessState (
 	struct amf_su *su);
 extern int amf_su_are_all_comps_in_su (struct amf_su *su,
 	SaAmfPresenceStateT state);
+extern void amf_su_node_left_while_failing_over (struct amf_su *su);
 
 /* Event methods */
 /**
diff --git a/exec/amfnode.c b/exec/amfnode.c
index e210aad..ba18a0d 100644
--- a/exec/amfnode.c
+++ b/exec/amfnode.c
@@ -203,11 +203,38 @@ LOGSYS_DECLARE_SUBSYS ("AMF", LOG_INFO)
  * Internal (static) utility functions
  *****************************************************************************/
 
+static void foreach_su_hosted_by_node (	/* run foreach_fn on each SU of node */
+	struct amf_node *node,			/* node whose name is matched */
+	void (*foreach_fn) (struct amf_su *su))	/* callback; asserted non-NULL */
+{	/* walks amf_cluster: every application, each of its SGs, each SG's SUs */
+	struct amf_application *app;
+	struct amf_sg *sg;
+	struct amf_su *su;
+
+	assert (foreach_fn != NULL);
+	for (app = amf_cluster->application_head; app != NULL; app = app->next) {
+		for (sg = app->sg_head; sg != NULL; sg = sg->next) {
+			for (su = sg->su_head; su != NULL; su = su->next) {
+				if (name_match(&node->name, &su->saAmfSUHostedByNode)) {
+					foreach_fn (su);	/* SU's hosting-node name matched */
+				}
+			}
+		}
+	}
+}
+
 static void node_acsm_enter_leaving_spontaneously(struct amf_node *node)
 {
 	ENTER("'%s'", node->name.value);
 	node->saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
 	node->nodeid = 0;
+
+	/* node left mid graceful failover: finish termination/cleanup of its SUs */
+	if (node->acsm_state == NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER) {
+		log_printf (LOG_LEVEL_NOTICE, "Node '%s' left while failing over",
+			node->name.value);
+		foreach_su_hosted_by_node(node, amf_su_node_left_while_failing_over);
+	}
 }
 
 static void node_acsm_enter_failing_over (struct amf_node *node)
@@ -365,6 +392,7 @@ void amf_node_leave (struct amf_node *node)
 		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
 		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
 		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
+		case NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER:
 			node_acsm_enter_leaving_spontaneously(node);    
 			node_acsm_enter_failing_over (node);
 			break;
diff --git a/exec/amfsg.c b/exec/amfsg.c
index f835355..6b3c7d9 100644
--- a/exec/amfsg.c
+++ b/exec/amfsg.c
@@ -772,8 +772,10 @@ static void acsm_enter_activating_standby (struct amf_sg *sg)
 	while (*sis != NULL) {
 		si_assignment = (*sis)->assigned_sis;
 		while (si_assignment != NULL) {
-			if (si_assignment->saAmfSISUHAState == 
-				SA_AMF_HA_STANDBY) {
+			if (si_assignment->saAmfSISUHAState ==
+				SA_AMF_HA_STANDBY &&
+				si_assignment->su->saAmfSUOperState !=
+				SA_AMF_OPERATIONAL_DISABLED) {
 				si_assignment->requested_ha_state = 
 					SA_AMF_HA_ACTIVE;
 				amf_si_ha_state_assume (
diff --git a/exec/amfsu.c b/exec/amfsu.c
index 0157c88..f4b0c49 100644
--- a/exec/amfsu.c
+++ b/exec/amfsu.c
@@ -1800,3 +1800,40 @@ int amf_su_are_all_comps_in_su (struct amf_su *su,
 	return all_comps_in_su_are_set;
 }
 
+/**
+ * Cleans up an SU whose node left while failing over: terminates the remaining
+ * component levels, then completes cleanup locally for TERMINATING components
+ * (we won't receive the CLC_CLEANUP_COMPLETED msgs from the failed node).
+ * @param su SU to clean up; termination is assumed to have started, i.e. comps
+ * above su->current_comp_instantiation_level are TERMINATING or UNINSTANTIATED
+ */ 
+void amf_su_node_left_while_failing_over (struct amf_su *su)
+{
+	amf_comp_t *comp;
+	int curr_level, lowest_level;	/* NOTE(review): logged with %u below */
+
+	/* Cleanup was running stepwise from the highest instantiation level down.
+	 * Terminate every level below the current one. NOTE(review): pre-decrement
+	 * also visits lowest_level - 1 on the last pass -- confirm that is benign */
+	curr_level = su->current_comp_instantiation_level;
+	lowest_level = su_lowest_comp_instantiation_level_set (su);
+
+	ENTER ("Terminating '%s' component levels %u thru %u",
+			su->name.value, curr_level, lowest_level);
+
+	while (curr_level >= lowest_level) {
+		terminate_all_components_in_level (su, --curr_level);
+	}
+
+	/* Complete cleanup locally for each TERMINATING component; by now every
+	 * component is expected to be UNINSTANTIATED, TERM_FAILED or TERMINATING.
+	 * Finishing the last one is expected to trigger workload reassignment */
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_TERMINATING) {
+			amf_comp_cleanup_completed (comp);
+		}
+	}
+
+	/* sanity check: aborts unless the comps are uninstantiated or term-failed */
+	assert(are_all_comps_in_level_uninst_or_term_failed (su));
+}
_______________________________________________
Openais mailing list
Openais@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to