Patch looks good, but please make sure it works with old versions of corosync and openais before committing.
Regards -steve On Mon, 2010-05-03 at 20:31 +1000, Angus Salkeld wrote: > This patch fixes the case where you do the following: > 1: start all nodes > 2: isolate node n1 > 3: Kill corosync on n1 > 4: unisolate node n1 > 5: start corosync on n1 > 6: start cpg on all nodes > 7: isolate node n1 > 8: Kill corosync on n1 > 9: unisolate node n1 > 10: start corosync on n1 > 11: Waiting for config change on n2 > > > To achieve this we need to mcast some info around the > cluster when the cpg service gets a sync event. > We need to do this to determine the correct downlist > to use. > > The problem is that all nodes do not have a consistent view > of the cluster (a joining node thinks lots of other nodes are joining > whilst the others think only one is joining). So I have > used the number of "old_members" and then the nodeid as > a way of deciding the sync master. So each node sends its > nodeid, number of old_members and downlist. The sync master > is the node with the largest num_old_members and smallest nodeid. > > Would it be useful to make this information public? > i.e. in objdb "runtime/cpg/sync_master=<nodeid>" > Then if you are interested you could read this value, > get notifications etc... 
> > Regards > Angus > > Signed-off-by: Angus Salkeld <[email protected]> > --- > services/cpg.c | 215 > ++++++++++++++++++++++++++++++++++++++++---------------- > 1 files changed, 153 insertions(+), 62 deletions(-) > > diff --git a/services/cpg.c b/services/cpg.c > index 8ce74ae..e076942 100644 > --- a/services/cpg.c > +++ b/services/cpg.c > @@ -75,7 +75,8 @@ enum cpg_message_req_types { > MESSAGE_REQ_EXEC_CPG_PROCLEAVE = 1, > MESSAGE_REQ_EXEC_CPG_JOINLIST = 2, > MESSAGE_REQ_EXEC_CPG_MCAST = 3, > - MESSAGE_REQ_EXEC_CPG_DOWNLIST = 4 > + MESSAGE_REQ_EXEC_CPG_DOWNLIST_OLD = 4, > + MESSAGE_REQ_EXEC_CPG_DOWNLIST = 5 > }; > > /* > @@ -127,6 +128,13 @@ enum cpg_sync_state { > CPGSYNC_JOINLIST > }; > > +enum cpg_downlist_state_e { > + CPG_DOWNLIST_NONE, > + CPG_DOWNLIST_WAITING_FOR_MESSAGES, > + CPG_DOWNLIST_APPLYING, > +}; > +static enum cpg_downlist_state_e downlist_state; > +static struct list_head downlist_messages_head; > > struct cpg_pd { > void *conn; > @@ -202,6 +210,10 @@ static void message_handler_req_exec_cpg_mcast ( > const void *message, > unsigned int nodeid); > > +static void message_handler_req_exec_cpg_downlist_old ( > + const void *message, > + unsigned int nodeid); > + > static void message_handler_req_exec_cpg_downlist ( > const void *message, > unsigned int nodeid); > @@ -212,6 +224,8 @@ static void exec_cpg_joinlist_endian_convert (void *msg); > > static void exec_cpg_mcast_endian_convert (void *msg); > > +static void exec_cpg_downlist_endian_convert_old (void *msg); > + > static void exec_cpg_downlist_endian_convert (void *msg); > > static void message_handler_req_lib_cpg_join (void *conn, const void > *message); > @@ -326,6 +340,10 @@ static struct corosync_exec_handler cpg_exec_engine[] = > .exec_endian_convert_fn = exec_cpg_mcast_endian_convert > }, > { /* 4 */ > + .exec_handler_fn = > message_handler_req_exec_cpg_downlist_old, > + .exec_endian_convert_fn = exec_cpg_downlist_endian_convert_old > + }, > + { /* 5 */ > .exec_handler_fn = 
message_handler_req_exec_cpg_downlist, > .exec_endian_convert_fn = exec_cpg_downlist_endian_convert > }, > @@ -415,34 +433,30 @@ struct req_exec_cpg_mcast { > mar_uint8_t message[] __attribute__((aligned(8))); > }; > > -struct req_exec_cpg_downlist { > +struct req_exec_cpg_downlist_old { > coroipc_request_header_t header __attribute__((aligned(8))); > mar_uint32_t left_nodes __attribute__((aligned(8))); > mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); > }; > > -static struct req_exec_cpg_downlist g_req_exec_cpg_downlist; > +struct req_exec_cpg_downlist { > + coroipc_request_header_t header __attribute__((aligned(8))); > + /* merge decisions */ > + mar_uint32_t old_members __attribute__((aligned(8))); > + /* downlist below */ > + mar_uint32_t left_nodes __attribute__((aligned(8))); > + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); > +}; > > -static int memb_list_remove_value (unsigned int *list, > - size_t list_entries, int value) > -{ > - int j; > - int found = 0; > +struct downlist_msg { > + mar_uint32_t sender_nodeid; > + mar_uint32_t old_members __attribute__((aligned(8))); > + mar_uint32_t left_nodes __attribute__((aligned(8))); > + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); > + struct list_head list; > +}; > > - for (j = 0; j < list_entries; j++) { > - if (list[j] == value) { > - /* mark next values to be copied down */ > - found = 1; > - } > - else if (found) { > - list[j-1] = list[j]; > - } > - } > - if (found) > - return (list_entries - 1); > - else > - return list_entries; > -} > +static struct req_exec_cpg_downlist g_req_exec_cpg_downlist; > > static void cpg_sync_init_v2 ( > const unsigned int *trans_list, > @@ -451,7 +465,6 @@ static void cpg_sync_init_v2 ( > size_t member_list_entries, > const struct memb_ring_id *ring_id) > { > - unsigned int lowest_nodeid = 0xffffffff; > int entries; > int i, j; > int found; > @@ -465,6 +478,11 @@ static void cpg_sync_init_v2 ( > 
last_sync_ring_id.nodeid = ring_id->rep.nodeid; > last_sync_ring_id.seq = ring_id->seq; > > + downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES; > + entries = 0; > + /* > + * Determine list of nodeids for downlist message > + */ > for (i = 0; i < my_old_member_list_entries; i++) { > found = 0; > for (j = 0; j < trans_list_entries; j++) { > @@ -474,35 +492,8 @@ static void cpg_sync_init_v2 ( > } > } > if (found == 0) { > - my_member_list_entries = memb_list_remove_value ( > - my_member_list, my_member_list_entries, > - my_old_member_list[i]); > - } > - } > - > - for (i = 0; i < my_member_list_entries; i++) { > - if (my_member_list[i] < lowest_nodeid) { > - lowest_nodeid = my_member_list[i]; > - } > - } > - > - entries = 0; > - if (lowest_nodeid == api->totem_nodeid_get()) { > - /* > - * Determine list of nodeids for downlist message > - */ > - for (i = 0; i < my_old_member_list_entries; i++) { > - found = 0; > - for (j = 0; j < trans_list_entries; j++) { > - if (my_old_member_list[i] == trans_list[j]) { > - found = 1; > - break; > - } > - } > - if (found == 0) { > - g_req_exec_cpg_downlist.nodeids[entries++] = > - my_old_member_list[i]; > - } > + g_req_exec_cpg_downlist.nodeids[entries++] = > + my_old_member_list[i]; > } > } > g_req_exec_cpg_downlist.left_nodes = entries; > @@ -525,17 +516,69 @@ static int cpg_sync_process (void) > return (res); > } > > +static struct downlist_msg* downlist_master_choose (void) > +{ > + struct downlist_msg *cmp; > + struct downlist_msg *best = NULL; > + struct list_head *iter; > + > + for (iter = downlist_messages_head.next; > + iter != &downlist_messages_head; > + iter = iter->next) { > + > + cmp = list_entry(iter, struct downlist_msg, list); > + if (best == NULL) { > + best = cmp; > + continue; > + } > + > + if (cmp->old_members < best->old_members) { > + continue; > + } > + else if (cmp->old_members > best->old_members) { > + best = cmp; > + } > + else if (cmp->sender_nodeid < best->sender_nodeid) { > + best = cmp; > + } > + > + 
} > + return best; > +} > + > +static void downlist_messages_delete (void) > +{ > + struct downlist_msg *stored_msg; > + struct list_head *iter, *iter_next; > + > + for (iter = downlist_messages_head.next; > + iter != &downlist_messages_head; > + iter = iter_next) { > + > + iter_next = iter->next; > + > + stored_msg = list_entry(iter, struct downlist_msg, list); > + list_del (&stored_msg->list); > + free (stored_msg); > + } > +} > + > static void cpg_sync_activate (void) > { > memcpy (my_old_member_list, my_member_list, > my_member_list_entries * sizeof (unsigned int)); > my_old_member_list_entries = my_member_list_entries; > > + downlist_messages_delete (); > + downlist_state = CPG_DOWNLIST_NONE; > + > notify_lib_totem_membership (NULL, my_member_list_entries, > my_member_list); > } > > static void cpg_sync_abort (void) > { > + downlist_messages_delete (); > + downlist_state = CPG_DOWNLIST_NONE; > } > > static int notify_lib_totem_membership ( > @@ -711,6 +754,7 @@ static int cpg_exec_init_fn (struct corosync_api_v1 > *corosync_api) > #ifdef COROSYNC_SOLARIS > logsys_subsys_init(); > #endif > + list_init (&downlist_messages_head); > api = corosync_api; > return (0); > } > @@ -817,12 +861,17 @@ static void exec_cpg_joinlist_endian_convert (void > *msg_v) > } > } > > +static void exec_cpg_downlist_endian_convert_old (void *msg) > +{ > +} > + > static void exec_cpg_downlist_endian_convert (void *msg) > { > struct req_exec_cpg_downlist *req_exec_cpg_downlist = msg; > unsigned int i; > > req_exec_cpg_downlist->left_nodes = > swab32(req_exec_cpg_downlist->left_nodes); > + req_exec_cpg_downlist->old_members = > swab32(req_exec_cpg_downlist->old_members); > > for (i = 0; i < req_exec_cpg_downlist->left_nodes; i++) { > req_exec_cpg_downlist->nodeids[i] = > swab32(req_exec_cpg_downlist->nodeids[i]); > @@ -908,7 +957,13 @@ static void do_proc_join( > MESSAGE_RES_CPG_CONFCHG_CALLBACK); > } > > -static void message_handler_req_exec_cpg_downlist ( > +static void 
message_handler_req_exec_cpg_downlist_old ( > + const void *message, > + unsigned int nodeid) > +{ > +} > + > +static void message_handler_req_exec_cpg_downlist( > const void *message, > unsigned int nodeid) > { > @@ -916,27 +971,61 @@ static void message_handler_req_exec_cpg_downlist ( > int i; > mar_cpg_address_t left_list[1]; > struct list_head *iter; > + struct downlist_msg *stored_msg; > + int found; > > - /* > - FOR OPTIMALIZATION - Make list of lists > - */ > + if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) { > + log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d > received in state %d", > + req_exec_cpg_downlist->left_nodes, downlist_state); > + return; > + } > + > + stored_msg = malloc (sizeof (struct downlist_msg)); > + stored_msg->sender_nodeid = nodeid; > + stored_msg->old_members = req_exec_cpg_downlist->old_members; > + stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes; > + memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids, > + req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t)); > + list_init (&stored_msg->list); > + list_add (&stored_msg->list, &downlist_messages_head); > > - log_printf (LOGSYS_LEVEL_DEBUG, "downlist left_list: %d\n", > req_exec_cpg_downlist->left_nodes); > + for (i = 0; i < my_member_list_entries; i++) { > + found = 0; > + for (iter = downlist_messages_head.next; > + iter != &downlist_messages_head; > + iter = iter->next) { > > + stored_msg = list_entry(iter, struct downlist_msg, > list); > + if (my_member_list[i] == stored_msg->sender_nodeid) { > + found = 1; > + } > + } > + if (!found) { > + return; > + } > + } > + > + downlist_state = CPG_DOWNLIST_APPLYING; > + stored_msg = downlist_master_choose (); > + > + log_printf (LOGSYS_LEVEL_DEBUG, "chosen downlist from node %s", > + api->totem_ifaces_print(stored_msg->sender_nodeid)); > + > + /* send events */ > for (iter = process_info_list_head.next; iter != > &process_info_list_head; ) { > struct process_info *pi = list_entry(iter, struct 
process_info, > list); > iter = iter->next; > > - for (i = 0; i < req_exec_cpg_downlist->left_nodes; i++) { > - if (pi->nodeid == req_exec_cpg_downlist->nodeids[i]) { > + for (i = 0; i < stored_msg->left_nodes; i++) { > + if (pi->nodeid == stored_msg->nodeids[i]) { > left_list[0].nodeid = pi->nodeid; > left_list[0].pid = pi->pid; > left_list[0].reason = > CONFCHG_CPG_REASON_NODEDOWN; > > notify_lib_joinlist(&pi->group, NULL, > - 0, NULL, > - 1, left_list, > - > MESSAGE_RES_CPG_CONFCHG_CALLBACK); > + 0, NULL, > + 1, left_list, > + MESSAGE_RES_CPG_CONFCHG_CALLBACK); > list_del (&pi->list); > free (pi); > break; > @@ -1081,6 +1170,8 @@ static int cpg_exec_send_downlist(void) > g_req_exec_cpg_downlist.header.id = SERVICE_ID_MAKE(CPG_SERVICE, > MESSAGE_REQ_EXEC_CPG_DOWNLIST); > g_req_exec_cpg_downlist.header.size = sizeof(struct > req_exec_cpg_downlist); > > + g_req_exec_cpg_downlist.old_members = my_old_member_list_entries; > + > iov.iov_base = (void *)&g_req_exec_cpg_downlist; > iov.iov_len = g_req_exec_cpg_downlist.header.size; > _______________________________________________ Openais mailing list [email protected] https://lists.linux-foundation.org/mailman/listinfo/openais
