The failed-to-receive logic in totem is not correct. This condition
occurs when a node can't receive multicast packets for a long period of
time. Generally it affects a small number of users whose hardware
exhibits out-of-norm behaviour.
The solution is to more closely match the spec when forming a new gather
list after a FAILED TO RECV condition is detected. Once this occurs, the
affected node forms a singleton ring. The FAILED TO RECV node is then free
to try to form a ring with the existing nodes again, if it can.
Regards
-steve
Index: services/cpg.c
===================================================================
--- services/cpg.c (revision 2918)
+++ services/cpg.c (working copy)
@@ -774,6 +774,7 @@
0, NULL,
1, &left_list,
MESSAGE_RES_CPG_CONFCHG_CALLBACK);
+printf ("deleting pi\n");
list_del (&pi->list);
free (pi);
break;
@@ -1147,6 +1148,7 @@
struct iovec iovec[2];
int known_node = 0;
+printf ("mcast handler\n");
res_lib_cpg_mcast.header.id = MESSAGE_RES_CPG_DELIVER_CALLBACK;
res_lib_cpg_mcast.header.size = sizeof(res_lib_cpg_mcast) + msglen;
res_lib_cpg_mcast.msglen = msglen;
@@ -1185,10 +1187,13 @@
if (!known_node) {
/* Unknown node -> we will not deliver message
*/
+printf ("unknown node\n");
return ;
}
api->ipc_dispatch_iov_send (cpd->conn, iovec, 2);
+ } else {
+printf ("unknown cpg\n");
}
}
}
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c (revision 2918)
+++ exec/totemsrp.c (working copy)
@@ -298,6 +298,8 @@
struct totemsrp_instance {
int iface_changes;
+ int failed_to_recv;
+
/*
* Flow control mcasts and remcasts on last and current orf_token
*/
@@ -3500,20 +3502,17 @@
}
if (instance->my_aru_count >
instance->totem_config->fail_to_recv_const &&
- token->aru_addr != instance->my_id.addr[0].nodeid) {
+ token->aru_addr == instance->my_id.addr[0].nodeid) {
log_printf (instance->totemsrp_log_level_error,
"FAILED TO RECEIVE\n");
-// TODO if we fail to receive, it may be possible to end with a gather
-// state of proc == failed = 0 entries
-/* THIS IS A BIG TODO
- memb_set_merge (&token->aru_addr, 1,
+
+ instance->failed_to_recv = 1;
+
+ memb_set_merge (&instance->my_id, 1,
instance->my_failed_list,
&instance->my_failed_list_entries);
-*/
- ring_state_restore (instance);
-
memb_state_gather_enter (instance, 6);
} else {
instance->my_token_seq = token->token_seq;
@@ -3754,10 +3753,7 @@
#ifdef TEST_DROP_MCAST_PERCENTAGE
if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) {
- printf ("dropping message %d\n", mcast_header.seq);
return (0);
- } else {
- printf ("accepting message %d\n", mcast_header.seq);
}
#endif
@@ -3936,6 +3932,18 @@
memb_consensus_set (instance, &memb_join->system_from);
+ if (memb_consensus_agreed && instance->failed_to_recv == 1) {
+ instance->failed_to_recv = 0;
+ srp_addr_copy (&instance->my_proc_list[0],
+ &instance->my_id);
+ instance->my_proc_list_entries = 1;
+ instance->my_failed_list_entries = 0;
+
+ memb_state_commit_token_create (instance);
+
+ memb_state_commit_enter (instance);
+ return (0);
+ }
if (memb_consensus_agreed (instance) &&
memb_lowest_in_config (instance)) {
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais