When a process pauses for longer than the token timeout, the other
processors in the system form a new ring.  The paused processor then
eventually reschedules and processes the pending membership multicast
messages in its kernel queues.  This wreaks havoc on the membership of
the other nodes.

While a proper kernel shouldn't pause for long periods, it's a reality
that many kernels still have long periods of spinlocking without
scheduling and no proper preemption.

This patch resolves the scenario by creating a timer which records a
time stamp at an interval that is the token timeout / 5.  Then if a
process executes the membership algorithm by receiving a join message,
the current time is retrieved and compared to the timestamp.  If they
differ by more than token timeout / 2, it is assumed the process
couldn't schedule (because it couldn't trigger the timer callbacks via
poll) and calls totemnet to flush any pending multicasts in the file
descriptor responsible for receiving multicast messages.  This results
in the old membership messages being thrown away allowing the new
membership to form properly.

This can be tested by ctrl-z'ing a corosync process in an 8 node cluster.
Then use fg to bring it into the foreground.  Pre-patch, this results in
bad news; post-patch, it prints a notice and proceeds properly.

Regards
-steve

Index: exec/totemnet.c
===================================================================
--- exec/totemnet.c	(revision 2289)
+++ exec/totemnet.c	(working copy)
@@ -2075,3 +2075,57 @@
 error_exit:
 	return (res);
 }
+
+extern int totemnet_recv_mcast_empty (
+	hdb_handle_t handle)
+{
+	struct totemnet_instance *instance;
+	unsigned int res;
+	struct sockaddr_storage system_from;
+	struct msghdr msg_recv;
+	struct pollfd ufd;
+	int nfds;
+	int msg_processed = 0;
+
+	res = hdb_handle_get (&totemnet_instance_database, handle,
+		(void *)&instance);
+	if (res != 0) {
+		goto error_exit;
+	}
+
+	/*
+	 * Receive datagram
+	 */
+	msg_recv.msg_name = &system_from;
+	msg_recv.msg_namelen = sizeof (struct sockaddr_storage);
+	msg_recv.msg_iov = &instance->totemnet_iov_recv_flush;
+	msg_recv.msg_iovlen = 1;
+#if !defined(COROSYNC_SOLARIS)
+	msg_recv.msg_control = 0;
+	msg_recv.msg_controllen = 0;
+	msg_recv.msg_flags = 0;
+#else
+	msg_recv.msg_accrights = NULL;
+	msg_recv.msg_accrightslen = 0;
+#endif
+
+	do {
+		ufd.fd = instance->totemnet_sockets.mcast_recv;
+		ufd.events = POLLIN;
+		nfds = poll (&ufd, 1, 0);
+		if (nfds == 1 && ufd.revents & POLLIN) {
+			res = recvmsg (instance->totemnet_sockets.mcast_recv, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT);
+			if (res != -1) {
+				msg_processed = 1;
+			}
+		}
+	} while (nfds == 1);
+
+	hdb_handle_put (&totemnet_instance_database, handle);
+
+	return (msg_processed);
+
+error_exit:
+	return (res);
+}
+
Index: exec/totemnet.h
===================================================================
--- exec/totemnet.h	(revision 2289)
+++ exec/totemnet.h	(working copy)
@@ -109,4 +109,7 @@
 	hdb_handle_t handle,
 	unsigned int type);
 
+extern int totemnet_recv_mcast_empty (
+	hdb_handle_t handle);
+
 #endif /* TOTEMNET_H_DEFINED */
Index: exec/totemrrp.c
===================================================================
--- exec/totemrrp.c	(revision 2289)
+++ exec/totemrrp.c	(working copy)
@@ -163,6 +163,9 @@
 
 	void (*ring_reenable) (
 		struct totemrrp_instance *instance);
+
+	int (*mcast_recv_empty) (
+		struct totemrrp_instance *instance);
 };
 
 struct totemrrp_instance {
@@ -284,6 +287,9 @@
 static void none_ring_reenable (
 	struct totemrrp_instance *instance);
 
+static int none_mcast_recv_empty (
+	struct totemrrp_instance *instance);
+
 /*
  * Passive Replication Forward Declerations
  */
@@ -342,6 +348,9 @@
 static void passive_ring_reenable (
 	struct totemrrp_instance *instance);
 
+static int passive_mcast_recv_empty (
+	struct totemrrp_instance *instance);
+
 /*
  * Active Replication Forward Definitions
  */
@@ -400,6 +409,9 @@
 static void active_ring_reenable (
 	struct totemrrp_instance *instance);
 
+static int active_mcast_recv_empty (
+	struct totemrrp_instance *instance);
+
 static void active_timer_expired_token_start (
 	struct active_instance *active_instance);
 
@@ -425,7 +437,8 @@
 	.iface_check		= none_iface_check,
 	.processor_count_set	= none_processor_count_set,
 	.token_target_set	= none_token_target_set,
-	.ring_reenable		= none_ring_reenable
+	.ring_reenable		= none_ring_reenable,
+	.mcast_recv_empty	= none_mcast_recv_empty
 };
 
 struct rrp_algo passive_algo = {
@@ -441,7 +454,8 @@
 	.iface_check		= passive_iface_check,
 	.processor_count_set	= passive_processor_count_set,
 	.token_target_set	= passive_token_target_set,
-	.ring_reenable		= passive_ring_reenable
+	.ring_reenable		= passive_ring_reenable,
+	.mcast_recv_empty	= passive_mcast_recv_empty
 };
 
 struct rrp_algo active_algo = {
@@ -457,7 +471,8 @@
 	.iface_check		= active_iface_check,
 	.processor_count_set	= active_processor_count_set,
 	.token_target_set	= active_token_target_set,
-	.ring_reenable		= active_ring_reenable
+	.ring_reenable		= active_ring_reenable,
+	.mcast_recv_empty	= active_mcast_recv_empty
 };
 
 struct rrp_algo *rrp_algos[] = {
@@ -579,6 +594,16 @@
 	 */
 }
 
+static int none_mcast_recv_empty (
+	struct totemrrp_instance *instance)
+{
+	int res;
+
+	res = totemnet_recv_mcast_empty (instance->net_handles[0]);
+
+	return (res);
+}
+
 /*
  * Passive Replication Implementation
  */
@@ -908,6 +933,23 @@
 	totemnet_token_target_set (instance->net_handles[iface_no], token_target);
 }
 
+static int passive_mcast_recv_empty (
+	struct totemrrp_instance *instance)
+{
+	int res;
+	int msgs_emptied = 0;
+	int i;
+
+	for (i = 0; i < instance->interface_count; i++) {
+		res = totemnet_recv_mcast_empty (instance->net_handles[i]);
+		if (res == 1) {
+			msgs_emptied = 1;
+		}
+	}
+
+	return (msgs_emptied);
+}
+
 static void passive_ring_reenable (
 	struct totemrrp_instance *instance)
 {
@@ -1262,6 +1304,23 @@
 	totemnet_token_target_set (instance->net_handles[iface_no], token_target);
 }
 
+static int active_mcast_recv_empty (
+	struct totemrrp_instance *instance)
+{
+	int res;
+	int msgs_emptied = 0;
+	int i;
+
+	for (i = 0; i < instance->interface_count; i++) {
+		res = totemnet_recv_mcast_empty (instance->net_handles[i]);
+		if (res == 1) {
+			msgs_emptied = 1;
+		}
+	}
+
+	return (msgs_emptied);
+}
+
 static void active_ring_reenable (
 	struct totemrrp_instance *instance)
 {
@@ -1762,3 +1821,25 @@
 error_exit:
 	return (res);
 }
+
+extern int totemrrp_mcast_recv_empty (
+        hdb_handle_t handle)
+{
+	struct totemrrp_instance *instance;
+	int res;
+
+	res = hdb_handle_get (&totemrrp_instance_database, handle,
+		(void *)&instance);
+	if (res != 0) {
+		res = ENOENT;
+		goto error_exit;
+	}
+
+	res = instance->rrp_algo->mcast_recv_empty (instance);
+
+	hdb_handle_put (&totemrrp_instance_database, handle);
+
+error_exit:
+	return (res);
+}
+
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c	(revision 2289)
+++ exec/totemsrp.c	(working copy)
@@ -409,6 +409,8 @@
 	/*
 	 * Timers
 	 */
+	poll_timer_handle timer_pause_timeout;
+
 	poll_timer_handle timer_orf_token_timeout;
 
 	poll_timer_handle timer_orf_token_retransmit_timeout;
@@ -501,6 +503,8 @@
 	unsigned int my_pbl;
 
 	unsigned int my_cbl;
+
+	struct timeval pause_timestamp;
 };
 
 struct message_handlers {
@@ -596,6 +600,7 @@
 	struct memb_merge_detect *out);
 static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in);
 static void timer_function_orf_token_timeout (void *data);
+static void timer_function_pause_timeout (void *data);
 static void timer_function_heartbeat_timeout (void *data);
 static void timer_function_token_retransmit_timeout (void *data);
 static void timer_function_token_hold_retransmit_timeout (void *data);
@@ -684,6 +689,25 @@
 	return (0);
 }
 
+static int pause_flush (struct totemsrp_instance *instance)
+{
+	struct timeval now;
+	uint64_t now_msec;
+	uint64_t timestamp_msec;
+	int res = 0;
+
+	gettimeofday (&now, NULL);
+        now_msec = ((now.tv_sec * 1000ULL) + (now.tv_usec / 1000ULL));
+        timestamp_msec = ((instance->pause_timestamp.tv_sec * 1000ULL) + (instance->pause_timestamp.tv_usec/1000ULL));
+
+	if ((now_msec - timestamp_msec) > (instance->totem_config->token_timeout / 2)) {
+		log_printf (instance->totemsrp_log_level_notice,
+			"Process pause detected for %lld ms, flushing membership messages.\n", (now_msec - timestamp_msec));
+		res = totemrrp_mcast_recv_empty (instance->totemrrp_handle);
+	}
+	return (res);
+}
+
 /*
  * Exported interfaces
  */
@@ -816,6 +840,8 @@
 	instance->totemsrp_confchg_fn = confchg_fn;
 	instance->use_heartbeat = 1;
 
+	gettimeofday (&instance->pause_timestamp, NULL);
+
 	if ( totem_config->heartbeat_failures_allowed == 0 ) {
 		log_printf (instance->totemsrp_log_level_notice,
 			"HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0\n");
@@ -1399,6 +1425,16 @@
 	instance->old_ring_state_saved = 0;
 }
 
+static void reset_pause_timeout (struct totemsrp_instance *instance)
+{
+	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_pause_timeout);
+	poll_timer_add (instance->totemsrp_poll_handle,
+		instance->totem_config->token_timeout / 5,
+		(void *)instance,
+		timer_function_pause_timeout,
+		&instance->timer_pause_timeout);
+}
+
 static void reset_token_timeout (struct totemsrp_instance *instance) {
 	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
 	poll_timer_add (instance->totemsrp_poll_handle,
@@ -1479,6 +1515,14 @@
 /*
  * Timers used for various states of the membership algorithm
  */
+static void timer_function_pause_timeout (void *data)
+{
+	struct totemsrp_instance *instance = data;
+
+	gettimeofday (&instance->pause_timestamp, NULL);
+	reset_pause_timeout (instance);
+}
+
 static void timer_function_orf_token_timeout (void *data)
 {
 	struct totemsrp_instance *instance = data;
@@ -1740,6 +1784,8 @@
 
 	instance->my_received_flg = 1;
 
+	reset_pause_timeout (instance);
+
 	return;
 }
 
@@ -4035,6 +4081,14 @@
 	} else {
 		memb_join = msg;
 	}
+	/*
+	 * If the process paused because it wasn't scheduled in a timely
+	 * fashion, flush the join messages because they may be queued
+	 * entries
+	 */
+	if (pause_flush (instance)) {
+		return (0);
+	}
 
 	if (instance->token_ring_id_seq < memb_join->ring_seq) {
 		instance->token_ring_id_seq = memb_join->ring_seq;
Index: exec/totemrrp.h
===================================================================
--- exec/totemrrp.h	(revision 2289)
+++ exec/totemrrp.h	(working copy)
@@ -119,4 +119,7 @@
 extern int totemrrp_ring_reenable (
 	hdb_handle_t handle);
 
+extern int totemrrp_mcast_recv_empty (
+	hdb_handle_t handle);
+
 #endif /* TOTEMRRP_H_DEFINED */
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to