This may typically happen if a local firewall is enabled. The patch adds a
new statistics item called continuous_gather, which holds the number of
times the gather state has been entered consecutively. If this number is
bigger than MAX_NO_CONT_GATHER, a warning message is displayed. The counter
is also checked on exit, so stopping corosync is now possible even with a
firewall enabled.

Signed-off-by: Jan Friesse <[email protected]>
---
 exec/main.c                    |   15 +++++++++++++++
 exec/totemsrp.c                |   15 +++++++++++++++
 include/corosync/totem/totem.h |    6 ++++++
 3 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/exec/main.c b/exec/main.c
index cd6cb83..b04f503 100644
--- a/exec/main.c
+++ b/exec/main.c
@@ -198,8 +198,17 @@ void corosync_shutdown_request (void)
 
 static void *corosync_exit_thread_handler (void *arg)
 {
+       totempg_stats_t * stats;
+
        sem_wait (&corosync_exit_sem);
 
+       stats = api->totem_get_stats();
+       if (stats->mrp->srp->continuous_gather > MAX_NO_CONT_GATHER ||
+           stats->mrp->srp->operational_entered == 0) {
+               unlink_all_completed ();
+               /* NOTREACHED */
+       }
+
        corosync_service_unlink_all (api, unlink_all_completed);
 
        return arg;
@@ -626,6 +635,9 @@ static void corosync_totem_stats_updater (void *data)
        objdb->object_key_replace (stats->mrp->srp->hdr.handle,
                "rx_msg_dropped", strlen("rx_msg_dropped"),
                &stats->mrp->srp->rx_msg_dropped, sizeof 
(stats->mrp->srp->rx_msg_dropped));
+       objdb->object_key_replace (stats->mrp->srp->hdr.handle,
+               "continuous_gather", strlen("continuous_gather"),
+               &stats->mrp->srp->continuous_gather, sizeof 
(stats->mrp->srp->continuous_gather));
 
        total_mtt_rx_token = 0;
        total_token_holdtime = 0;
@@ -784,6 +796,9 @@ static void corosync_totem_stats_init (void)
                objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
                        "rx_msg_dropped", &zero_64,
                        sizeof (zero_64), OBJDB_VALUETYPE_UINT64);
+               objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
+                       "continuous_gather", &zero_32,
+                       sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
 
        }
        /* start stats timer */
diff --git a/exec/totemsrp.c b/exec/totemsrp.c
index f7a6638..c9ad391 100644
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -502,6 +502,7 @@ struct totemsrp_instance {
        struct memb_commit_token *commit_token;
 
        totemsrp_stats_t stats;
+
        void * token_recv_event_handle;
        void * token_sent_event_handle;
        char commit_token_storage[9000];
@@ -1789,6 +1790,8 @@ static void memb_state_operational_enter (struct 
totemsrp_instance *instance)
        instance->memb_state = MEMB_STATE_OPERATIONAL;
 
        instance->stats.operational_entered++;
+       instance->stats.continuous_gather = 0;
+
        instance->my_received_flg = 1;
 
        reset_pause_timeout (instance);
@@ -1853,6 +1856,15 @@ static void memb_state_gather_enter (
 
        instance->memb_state = MEMB_STATE_GATHER;
        instance->stats.gather_entered++;
+       instance->stats.continuous_gather++;
+
+       if (instance->stats.continuous_gather > MAX_NO_CONT_GATHER) {
+               log_printf (instance->totemsrp_log_level_warning,
+                       "Totem is unable to form a cluster because of an "
+                       "operating system or network fault. The most common "
+                       "cause of this message is that the local firewall is "
+                       "configured improperly.\n");
+       }
 
        return;
 }
@@ -1897,6 +1909,7 @@ static void memb_state_commit_enter (
        reset_token_timeout (instance); // REVIEWED
 
        instance->stats.commit_entered++;
+       instance->stats.continuous_gather = 0;
 
        /*
         * reset all flow control variables since we are starting a new ring
@@ -2093,6 +2106,8 @@ originated:
 
        instance->memb_state = MEMB_STATE_RECOVERY;
        instance->stats.recovery_entered++;
+       instance->stats.continuous_gather = 0;
+
        return;
 }
 
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h
index 4e2e475..cf78e4c 100644
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -52,6 +52,11 @@
 #define SEND_THREADS_MAX       16
 #define INTERFACE_MAX          2
 
+/*
+ * Maximum number of continuous gather states
+ */
+#define MAX_NO_CONT_GATHER     3
+
 struct totem_interface {
        struct totem_ip_address bindnet;
        struct totem_ip_address boundto;
@@ -250,6 +255,7 @@ typedef struct {
        uint64_t recovery_token_lost;
        uint64_t consensus_timeouts;
        uint64_t rx_msg_dropped;
+       uint32_t continuous_gather;
 
        int earliest_token;
        int latest_token;
-- 
1.6.2.5

_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to