The failed-to-receive logic in totem is not correct. This condition occurs when a node can't receive multicast packets for a long period of time. Generally it affects a small number of users whose hardware exhibits out-of-norm behaviour.

The solution is to match the spec more closely when forming a new gather list after a FAILED TO RECV condition is detected. Once this occurs, a singleton ring is formed. The FAILED TO RECV node is then free to try to form a ring again with the existing nodes, if it can.

Regards
-steve
Index: services/cpg.c
===================================================================
--- services/cpg.c      (revision 2918)
+++ services/cpg.c      (working copy)
@@ -774,6 +774,7 @@
                                        0, NULL,
                                        1, &left_list,
                                        MESSAGE_RES_CPG_CONFCHG_CALLBACK);
+printf ("deleting pi\n");
                                list_del (&pi->list);
                                free (pi);
                                break;
@@ -1147,6 +1148,7 @@
        struct iovec iovec[2];
        int known_node = 0;
 
+printf ("mcast handler\n");
        res_lib_cpg_mcast.header.id = MESSAGE_RES_CPG_DELIVER_CALLBACK;
        res_lib_cpg_mcast.header.size = sizeof(res_lib_cpg_mcast) + msglen;
        res_lib_cpg_mcast.msglen = msglen;
@@ -1185,10 +1187,13 @@
 
                        if (!known_node) {
                                /* Unknown node -> we will not deliver message 
*/
+printf ("unknown node\n");
                                return ;
                        }
 
                        api->ipc_dispatch_iov_send (cpd->conn, iovec, 2);
+               } else {
+printf ("unknown cpg\n");
                }
        }
 }
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c     (revision 2918)
+++ exec/totemsrp.c     (working copy)
@@ -298,6 +298,8 @@
 struct totemsrp_instance {
        int iface_changes;
 
+       int failed_to_recv;
+
        /*
         * Flow control mcasts and remcasts on last and current orf_token
         */
@@ -3500,20 +3502,17 @@
                }
 
                if (instance->my_aru_count > 
instance->totem_config->fail_to_recv_const &&
-                       token->aru_addr != instance->my_id.addr[0].nodeid) {
+                       token->aru_addr == instance->my_id.addr[0].nodeid) {
 
                        log_printf (instance->totemsrp_log_level_error,
                                "FAILED TO RECEIVE\n");
-// TODO if we fail to receive, it may be possible to end with a gather
-// state of proc == failed = 0 entries
-/* THIS IS A BIG TODO
-                       memb_set_merge (&token->aru_addr, 1,
+
+                       instance->failed_to_recv = 1;
+
+                       memb_set_merge (&instance->my_id, 1,
                                instance->my_failed_list,
                                &instance->my_failed_list_entries);
-*/
 
-                       ring_state_restore (instance);
-
                        memb_state_gather_enter (instance, 6);
                } else {
                        instance->my_token_seq = token->token_seq;
@@ -3754,10 +3753,7 @@
 
 #ifdef TEST_DROP_MCAST_PERCENTAGE
        if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) {
-               printf ("dropping message %d\n", mcast_header.seq);
                return (0);
-       } else {
-               printf ("accepting message %d\n", mcast_header.seq);
        }
 #endif
 
@@ -3936,6 +3932,18 @@
 
                memb_consensus_set (instance, &memb_join->system_from);
 
+               if (memb_consensus_agreed && instance->failed_to_recv == 1) {
+                               instance->failed_to_recv = 0;
+                               srp_addr_copy (&instance->my_proc_list[0],
+                                       &instance->my_id);
+                               instance->my_proc_list_entries = 1;
+                               instance->my_failed_list_entries = 0;
+
+                               memb_state_commit_token_create (instance);
+
+                               memb_state_commit_enter (instance);
+                               return (0);
+               }
                if (memb_consensus_agreed (instance) &&
                        memb_lowest_in_config (instance)) {
 
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to