Steven Dake wrote:
> On Thu, 2009-04-16 at 08:17 +0100, Chrissie Caulfield wrote:
>> Fabio M. Di Nitto wrote:
>>> On Wed, 2009-04-15 at 14:35 -0500, David Teigland wrote:
>>>> If I run 'cman_tool leave' on four nodes in parallel, node1 will
>>>> leave right away, but the other three nodes don't leave until the
>>>> token timeout expires for node1, causing a confchg for it, after
>>>> which the other three all leave right away. This has only been
>>>> annoying me recently, so I think it must have been some recent
>>>> change.
>>>
>>> I have seen this in the form of:
>>>
>>> all nodes - 1 leave right away
>>> the lonely node takes a while
>>>
>>> but this is triggered by "cman_tool leave remove" as invoked in the
>>> init script.
>>>
>>> Agreed it's annoying.
>>>
>> It's a consequence of how the messaging works. Shutdown sends a
>> message to all the other nodes telling them that it is going down;
>> only when the node receives its own message back does it actually
>> shut down.
>>
>> If you shut down a lot of nodes at the same time, one will go down
>> right away, as you observed, because the ring is fully active. Once
>> that's happened, the others will all wait for the token timeout for
>> that node before the rest of their messages can be delivered. And
>> that happens for each node that is shut down.
>>
>> I suppose the solution is to hook the shutdown mechanism into totem
>> somehow, so that it can remove the node as soon as it receives the
>> shutdown message and not have to wait for the token to time out as
>> well (try saying that with your teeth out!). I'm not sure how
>> hard/politically incorrect this is, though...
>>
> I think it's a great idea if you are willing to work out a patch.
>
> totempg_finalize already exists and calls down through the stack to
> totemsrp_finalize. totemsrp_finalize is a no-op at the moment.
> totemnet needs a finalize call which removes its file descriptors
> from the poll mechanism and shuts them down.
>
> A new ABI is needed for orderly shutdown in the coroapi (which your
> shutdown mechanism would call). It should be linked to
> corosync_exit(), which is already implemented.
>
> The trick is what to do in totemsrp_finalize() to trigger a new
> membership phase. One possibility is to create a function similar to
> memb_join_message_send(), except that the proc and failed lists in
> memb_join are the current lists minus instance->my_id; you can use
> memb_set_subtract to generate them. memb_join.system_from would have
> to be some dummy initialized node id (0, since it isn't allowed as a
> node id), and memb_consensus_set() would have to check for that node
> id and do nothing if it is 0. After sending the message, call
> totemnet_finalize().
>
> I can't guarantee this will work, but it's worth investigating and
> would be a nice improvement. If you want to add this, please hurry,
> as we are approaching freeze.
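Before the patch itself, a rough sketch of the coroapi hook Steve
describes. This is only my reading of it: request_shutdown is the
member the cfg service calls in the patch below, corosync_exit() is
the existing exit path Steve mentions, and the wrapper name here is
invented, not from the tree:

	/*
	 * Sketch: a coroapi member that a service calls to request an
	 * orderly shutdown instead of exiting fatally.
	 */
	void (*request_shutdown) (void);

	/*
	 * Sketch: the executive side hands off to the existing exit
	 * path, which finalizes totempg -> totemsrp -> totemnet on the
	 * way down. (corosync_request_shutdown is a hypothetical name.)
	 */
	static void corosync_request_shutdown (void)
	{
		corosync_exit ();
	}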
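Similarly, memb_set_subtract, which Steve suggests for building the
reduced proc and failed lists, computes "full set minus remove set".
A simplified stand-alone sketch of the idea, with ints standing in
for struct srp_addr (the real function's name and signature differ):

	static void set_subtract (int *out, int *out_entries,
		const int *full, int full_entries,
		const int *remove, int remove_entries)
	{
		int i, j, found;

		*out_entries = 0;
		for (i = 0; i < full_entries; i++) {
			/* keep full[i] only if it is not in the remove set */
			found = 0;
			for (j = 0; j < remove_entries; j++) {
				if (full[i] == remove[j]) {
					found = 1;
					break;
				}
			}
			if (found == 0) {
				out[(*out_entries)++] = full[i];
			}
		}
	}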
OK, here's my first attempt at a patch to speed up shutdown using a
variant of Steve's suggested method:

-- 
Chrissie
Index: services/cfg.c
===================================================================
--- services/cfg.c	(revision 2183)
+++ services/cfg.c	(working copy)
@@ -621,7 +621,7 @@
 	log_printf(LOGSYS_LEVEL_NOTICE, "Node %d was shut down by sysadmin\n", nodeid);
 
 	if (nodeid == api->totem_nodeid_get()) {
-		corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT);
+		api->request_shutdown();
 	}
 	LEAVE();
 }
Index: exec/totemnet.c
===================================================================
--- exec/totemnet.c	(revision 2183)
+++ exec/totemnet.c	(working copy)
@@ -1113,6 +1113,20 @@
 
 	worker_thread_group_exit (&instance->worker_thread_group);
 
+	if (instance->totemnet_sockets.mcast_recv > 0) {
+		close (instance->totemnet_sockets.mcast_recv);
+		poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.mcast_recv);
+	}
+	if (instance->totemnet_sockets.mcast_send > 0) {
+		close (instance->totemnet_sockets.mcast_send);
+	}
+	if (instance->totemnet_sockets.token > 0) {
+		close (instance->totemnet_sockets.token);
+		poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.token);
+	}
+
 	hdb_handle_put (&totemnet_instance_database, handle);
 
 error_exit:
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c	(revision 2183)
+++ exec/totemsrp.c	(working copy)
@@ -91,6 +91,7 @@
 #define MAXIOVS					5
 #define RETRANSMIT_ENTRIES_MAX			30
 #define TOKEN_SIZE_MAX				64000 /* bytes */
+#define LEAVE_DUMMY_NODEID			0
 
 /*
  * Rollover handling:
@@ -132,12 +133,13 @@
 #define ENDIAN_LOCAL				0xff22
 
 enum message_type {
-	MESSAGE_TYPE_ORF_TOKEN = 0,		/* Ordering, Reliability, Flow (ORF) control Token */
+	MESSAGE_TYPE_ORF_TOKEN = 0,	/* Ordering, Reliability, Flow (ORF) control Token */
 	MESSAGE_TYPE_MCAST = 1,			/* ring ordered multicast message */
 	MESSAGE_TYPE_MEMB_MERGE_DETECT = 2,	/* merge rings if there are available rings */
 	MESSAGE_TYPE_MEMB_JOIN = 3,		/* membership join message */
 	MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4,	/* membership commit token */
 	MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5,	/* cancel the holding of the token */
+	MESSAGE_TYPE_MEMB_LEAVE = 6,		/* membership leave message */
 };
 
 enum encapsulation_type {
@@ -496,7 +498,7 @@
 
 struct message_handlers {
 	int count;
-	int (*handler_functions[6]) (
+	int (*handler_functions[7]) (
 		struct totemsrp_instance *instance,
 		const void *msg,
 		size_t msg_len,
@@ -530,6 +532,12 @@
 	size_t msg_len,
 	int endian_conversion_needed);
 
+static int message_handler_memb_leave (
+	struct totemsrp_instance *instance,
+	const void *msg,
+	size_t msg_len,
+	int endian_conversion_needed);
+
 static int message_handler_memb_commit_token (
 	struct totemsrp_instance *instance,
 	const void *msg,
@@ -560,6 +568,8 @@
 
 static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b);
 
+static void memb_leave_message_send (struct totemsrp_instance *instance);
+
 static void memb_ring_id_create_or_load (struct totemsrp_instance *, struct memb_ring_id *);
 
 static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type);
@@ -606,14 +616,15 @@
 DECLARE_HDB_DATABASE (totemsrp_instance_database,NULL);
 
 struct message_handlers totemsrp_message_handlers = {
-	6,
+	7,
 	{
 		message_handler_orf_token,
 		message_handler_mcast,
 		message_handler_memb_merge_detect,
 		message_handler_memb_join,
 		message_handler_memb_commit_token,
-		message_handler_token_hold_cancel
+		message_handler_token_hold_cancel,
+		message_handler_memb_leave
 	}
 };
 
@@ -872,6 +883,7 @@
 	if (res != 0) {
 		return;
 	}
+	memb_leave_message_send (instance);
 
 	hdb_handle_put (&totemsrp_instance_database, handle);
 }
@@ -1096,6 +1108,9 @@
 	int found = 0;
 	int i;
 
+	if (addr->addr[0].nodeid == LEAVE_DUMMY_NODEID)
+		return;
+
 	for (i = 0; i < instance->consensus_list_entries; i++) {
 		if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) {
 			found = 1;
@@ -2873,6 +2888,74 @@
 		addr_idx);
 }
 
+static void memb_leave_message_send (struct totemsrp_instance *instance)
+{
+	char memb_join_data[10000];
+	struct memb_join *memb_join = (struct memb_join *)memb_join_data;
+	char *addr;
+	unsigned int addr_idx;
+	int active_memb_entries;
+	struct srp_addr active_memb[PROCESSOR_COUNT_MAX];
+
+	log_printf (instance->totemsrp_log_level_debug,
+		"sending leave message\n");
+
+	/*
+	 * add us to the failed list, and remove us from
+	 * the members list
+	 */
+	memb_set_merge(
+		&instance->my_id, 1,
+		instance->my_failed_list, &instance->my_failed_list_entries);
+
+	memb_set_subtract (active_memb, &active_memb_entries,
+		instance->my_proc_list, instance->my_proc_list_entries,
+		&instance->my_id, 1);
+
+
+	memb_join->header.type = MESSAGE_TYPE_MEMB_LEAVE;
+	memb_join->header.endian_detector = ENDIAN_LOCAL;
+	memb_join->header.encapsulated = 0;
+	memb_join->header.nodeid = LEAVE_DUMMY_NODEID;
+
+	memb_join->ring_seq = instance->my_ring_id.seq;
+	memb_join->proc_list_entries = active_memb_entries;
+	memb_join->failed_list_entries = instance->my_failed_list_entries;
+	srp_addr_copy (&memb_join->system_from, &instance->my_id);
+
+
+	/*
+	 * This mess adds the joined and failed processor lists into the join
+	 * message
+	 */
+	addr = (char *)memb_join;
+	addr_idx = sizeof (struct memb_join);
+	memcpy (&addr[addr_idx],
+		active_memb,
+		active_memb_entries *
+		sizeof (struct srp_addr));
+	addr_idx +=
+		active_memb_entries *
+		sizeof (struct srp_addr);
+	memcpy (&addr[addr_idx],
+		instance->my_failed_list,
+		instance->my_failed_list_entries *
+		sizeof (struct srp_addr));
+	addr_idx +=
+		instance->my_failed_list_entries *
+		sizeof (struct srp_addr);
+
+
+	if (instance->totem_config->send_join_timeout) {
+		usleep (random() % (instance->totem_config->send_join_timeout * 1000));
+	}
+
+	totemrrp_mcast_flush_send (
+		instance->totemrrp_handle,
+		memb_join,
+		addr_idx);
+}
+
 static void memb_merge_detect_transmit (struct totemsrp_instance *instance)
 {
 	struct memb_merge_detect memb_merge_detect;
@@ -3996,6 +4079,32 @@
 	return (0);
 }
 
+static int message_handler_memb_leave (
+	struct totemsrp_instance *instance,
+	const void *msg,
+	size_t msg_len,
+	int endian_conversion_needed)
+{
+	const struct memb_join *memb_join;
+	struct memb_join *memb_join_convert = alloca (msg_len);
+
+	if (endian_conversion_needed) {
+		memb_join = memb_join_convert;
+		memb_join_endian_convert (msg, memb_join_convert);
+
+	} else {
+		memb_join = msg;
+	}
+
+	log_printf (instance->totemsrp_log_level_debug,
+		"got LEAVE message\n");
+
+	memb_set_merge(&memb_join->system_from, 1,
+		instance->my_failed_list, &instance->my_failed_list_entries);
+
+	return message_handler_memb_join (instance, msg, msg_len, endian_conversion_needed);
+}
+
 static int message_handler_memb_commit_token (
 	struct totemsrp_instance *instance,
 	const void *msg,
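A note on the message layout used in memb_leave_message_send() above:
the proc and failed lists are copied raw into the buffer straight
after the fixed header, so each addr_idx advance has to match the
number of entries just copied, or the receiver will mis-parse the
second list. A self-contained toy version of the scheme (names
invented, ints standing in for struct srp_addr):

	#include <stdio.h>
	#include <string.h>

	struct header {
		int proc_entries;
		int failed_entries;
	};

	int main (void)
	{
		char buf[256];
		int proc[] = { 2, 3, 4 };	/* stand-ins for srp_addr entries */
		int failed[] = { 1 };
		struct header hdr = { 3, 1 };
		size_t idx = 0;

		/* fixed header first */
		memcpy (&buf[idx], &hdr, sizeof (hdr));
		idx += sizeof (hdr);

		/* proc list; advance by exactly what was copied */
		memcpy (&buf[idx], proc, sizeof (proc));
		idx += sizeof (proc);

		/* failed list follows the proc list contiguously */
		memcpy (&buf[idx], failed, sizeof (failed));
		idx += sizeof (failed);

		printf ("packed message is %zu bytes\n", idx);
		return 0;
	}

With the patch applied, running 'cman_tool leave' on several nodes in
parallel should produce a configuration change as soon as each leave
message is delivered, rather than after a token timeout per departed
node.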