On Fri, Mar 04, 2011 at 05:31:41PM -0700, Steven Dake wrote:
> Lars,
> 
> As we discussed on IRC, I agreed to do a first rough cut of automatic
> redundant ring recovery, and you agreed to find some suitable engineers
> to finish the work on this topic.  If there are others in the community
> interested in seeing this work merged into Corosync, feel free to amend
> the patch, address any particular points that are remaining, and send an
> updated patch.  Once the 7 points are addressed, we should be good to go
> with a merge.
> 
> Thanks for your assistance.
> -steve
> 
> On 03/04/2011 05:25 PM, Steven Dake wrote:
> > Here is a very rough implementation of automatic ring recovery.  This patch
> > only works for the rrp mode active.  It requires the following changes to be
> > mergeable:
> > 
> > 1. Endian detection & conversion of the message headers in totemrrp must
> >    be done.
> > 2. The implementation is entirely in the active algo.  Instead, the
> >    implementation should avoid being algorithm-specific: it should be
> >    implemented in only one place and work for both passive and active.
> > 3. the timer variable timer_active_test_ring_timeout should be stored in
> >    totemrrp_instance instead of active_instance
> > 4. An array of timeouts should be created for each iface_no so as to not
> >    overwrite timer variables with more than 2 rings.
> > 5. If the ACTIVATE message is lost, the ring will not recover.  This needs
> >    more consideration and correction.
> > 6. The active test message is sent once per second (search for 1000,
> >    which is in msec).  This should be tunable and added to the man pages.
> > 7. testing!
> >

Hi, 

Just a status update here, I'm working on this now;)

For now, items 4, 5, 6 and 7 haven't been finished yet. The following
patch is incomplete and may not work, since I haven't tested it yet;
I'm posting it here just to see if I'm on the right track ;)

Thanks,
Jiaju

Signed-off-by: Jiaju Zhang <[email protected]>
--- 
 exec/totemrrp.c |  197 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 189 insertions(+), 8 deletions(-)

diff --git a/exec/totemrrp.c b/exec/totemrrp.c
index 8107a1c..2de5428 100644
--- a/exec/totemrrp.c
+++ b/exec/totemrrp.c
@@ -237,7 +237,11 @@ struct totemrrp_instance {
 
        int processor_count;
 
+       int my_nodeid;
+
        struct totem_config *totem_config;
+
+       poll_timer_handle timer_active_test_ring_timeout;
 };
 
 /*
@@ -450,6 +454,22 @@ static void active_timer_problem_decrementer_start (
 static void active_timer_problem_decrementer_cancel (
        struct active_instance *active_instance);
 
+/*
+ * 0-5 reserved for totemsrp.c
+ */
+#define MESSAGE_TYPE_RING_TEST_ACTIVE          6
+#define MESSAGE_TYPE_RING_TEST_ACTIVATE        7
+
+#define ENDIAN_LOCAL                           0xff22
+
+struct message_header {
+       char type;
+       char encapsulated;
+       unsigned short endian_detector;
+       int ring_number;
+       int nodeid_activator;
+} __attribute__((packed));
+
 struct rrp_algo none_algo = {
        .name                   = "none",
        .initialize             = NULL,
@@ -522,6 +542,51 @@ do {                                                       
                \
                format, ##args);                                        \
 } while (0);
 
+static void test_active_msg_endian_convert(const struct message_header *in, 
struct message_header *out)
+{
+       out->type = in->type;
+       out->encapsulated = in->encapsulated;
+       out->endian_detector = ENDIAN_LOCAL;
+       out->ring_number = swab32 (in->ring_number);
+       out->nodeid_activator = swab32(in->nodeid_activator);
+}
+
+static void timer_function_test_ring_timeout (void *context)
+{
+       struct totemrrp_instance *rrp_instance = (struct totemrrp_instance 
*)context;
+       int faulty = 0;
+       unsigned int *instance_faulty = NULL;
+       int i;
+       struct message_header msg;
+
+       if (strcmp(rrp_instance->totem_config->rrp_mode, "active") == 0)
+               instance_faulty = ((struct active_instance 
*)(rrp_instance->rrp_algo_instance))->faulty;
+       if (strcmp(rrp_instance->totem_config->rrp_mode, "passive") == 0)
+               instance_faulty = ((struct passive_instance 
*)(rrp_instance->rrp_algo_instance))->faulty;
+
+       assert (instance_faulty != NULL);
+               
+       for (i = 0; i < rrp_instance->interface_count; i++) {
+               msg.type = MESSAGE_TYPE_RING_TEST_ACTIVE;
+               if (instance_faulty[i] == 1) {
+                       faulty = 1;
+                       msg.endian_detector = ENDIAN_LOCAL;
+                       msg.ring_number = i;
+                       msg.nodeid_activator = rrp_instance->my_nodeid;
+                       totemnet_token_send (
+                               rrp_instance->net_handles[i],
+                               &msg, sizeof (struct message_header));
+               }
+       }
+       if (faulty) {
+               poll_timer_add (rrp_instance->poll_handle,
+                       1000,
+                       (void *)rrp_instance,
+                       timer_function_test_ring_timeout,
+                       &rrp_instance->timer_active_test_ring_timeout);
+       }
+}
+
 /*
  * None Replication Implementation
  */
@@ -797,6 +862,12 @@ static void passive_mcast_recv (
                        (max - passive_instance->mcast_recv_count[i] >
                        
rrp_instance->totem_config->rrp_problem_count_threshold)) {
                        passive_instance->faulty[i] = 1;
+                       poll_timer_add (rrp_instance->poll_handle,
+                               1000,
+                               (void *)rrp_instance,
+                               timer_function_test_ring_timeout,
+                               &rrp_instance->timer_active_test_ring_timeout);
+
                        sprintf (rrp_instance->status[i],
                                "Marking ringid %u interface %s FAULTY - 
administrative intervention required.",
                                i,
@@ -849,8 +920,55 @@ static void passive_token_recv (
        struct passive_instance *passive_instance = (struct passive_instance 
*)rrp_instance->rrp_algo_instance;
        unsigned int max;
        unsigned int i;
+       const struct message_header *hdr = msg;
+       struct message_header tmp;
+       struct message_header activate_msg;
+
+       passive_instance->totemrrp_context = context;
 
-       passive_instance->totemrrp_context = context; // this should be in 
totemrrp_instance ? TODO
+       if (hdr->endian_detector != ENDIAN_LOCAL) {
+               test_active_msg_endian_convert(hdr, &tmp);
+               hdr = &tmp;
+       }
+
+       if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVE) {
+               log_printf (
+                       rrp_instance->totemrrp_log_level_debug,
+                       "received message requesting test of ring now 
active\n");
+
+               if (hdr->nodeid_activator == rrp_instance->my_nodeid) {
+                       /*
+                        * Send an activate message
+                        */
+                       activate_msg.type = MESSAGE_TYPE_RING_TEST_ACTIVATE;
+                       activate_msg.ring_number = hdr->ring_number;;
+                       activate_msg.nodeid_activator = rrp_instance->my_nodeid;
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               &activate_msg, sizeof (struct message_header));
+               } else {
+                       /*
+                        * Send a ring test message
+                        */
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               msg, msg_len);
+               }
+               return;
+       } else
+       if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVATE) {
+               log_printf (
+                       rrp_instance->totemrrp_log_level_notice,
+                       "Automatically recovered ring %d\n", hdr->ring_number);
+
+               totemrrp_ring_reenable (rrp_instance);
+               if (hdr->nodeid_activator != rrp_instance->my_nodeid) {
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               msg, msg_len);
+               }
+               return;
+       }
 
        if (rrp_instance->totemrrp_msgs_missing() == 0) {
                rrp_instance->totemrrp_deliver_fn (
@@ -880,6 +998,12 @@ static void passive_token_recv (
                        (max - passive_instance->token_recv_count[i] >
                        
rrp_instance->totem_config->rrp_problem_count_threshold)) {
                        passive_instance->faulty[i] = 1;
+                       poll_timer_add (rrp_instance->poll_handle,
+                               1000,
+                               (void *)rrp_instance,
+                               timer_function_test_ring_timeout,
+                               &rrp_instance->timer_active_test_ring_timeout);
+
                        sprintf (rrp_instance->status[i],
                                "Marking seqid %d ringid %u interface %s FAULTY 
- administrative intervention required.",
                                token_seq,
@@ -1128,6 +1252,12 @@ static void timer_function_active_token_expired (void 
*context)
                if (active_instance->counter_problems[i] >= 
rrp_instance->totem_config->rrp_problem_count_threshold)
                {
                        active_instance->faulty[i] = 1;
+                       poll_timer_add (rrp_instance->poll_handle,
+                               1000,
+                               (void *)rrp_instance,
+                               timer_function_test_ring_timeout,
+                               &rrp_instance->timer_active_test_ring_timeout);
+
                        sprintf (rrp_instance->status[i],
                                "Marking seqid %d ringid %u interface %s FAULTY 
- administrative intervention required.",
                                active_instance->last_token_seq,
@@ -1233,7 +1363,7 @@ static void active_mcast_noflush_send (
 }
 
 static void active_token_recv (
-       struct totemrrp_instance *instance,
+       struct totemrrp_instance *rrp_instance,
        unsigned int iface_no,
        void *context,
        const void *msg,
@@ -1241,13 +1371,62 @@ static void active_token_recv (
        unsigned int token_seq)
 {
        int i;
-       struct active_instance *active_instance = (struct active_instance 
*)instance->rrp_algo_instance;
+       struct active_instance *active_instance = (struct active_instance 
*)rrp_instance->rrp_algo_instance;
+       const struct message_header *hdr = msg;
+       struct message_header tmp;
+       struct message_header activate_msg;
+
+       active_instance->totemrrp_context = context;
+
+       if (hdr->endian_detector != ENDIAN_LOCAL) {
+               test_active_msg_endian_convert(hdr, &tmp);
+               hdr = &tmp;
+       }
+
+       if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVE) {
+
+               log_printf (
+                       rrp_instance->totemrrp_log_level_debug,
+                       "received message requesting test of ring now 
active\n");
+
+               if (hdr->nodeid_activator == rrp_instance->my_nodeid) {
+                       /*
+                        * Send an activate message
+                        */
+                       activate_msg.type = MESSAGE_TYPE_RING_TEST_ACTIVATE;
+                       activate_msg.ring_number = hdr->ring_number;;
+                       activate_msg.nodeid_activator = rrp_instance->my_nodeid;
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               &activate_msg, sizeof (struct message_header));
+               } else {
+                       /*
+                        * Send a ring test message
+                        */
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               msg, msg_len);
+               }
+               return;
+       } else
+       if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVATE) {
+               log_printf (
+                       rrp_instance->totemrrp_log_level_notice,
+                       "Automatically recovered ring %d\n", hdr->ring_number);
+
+               totemrrp_ring_reenable (rrp_instance);
+               if (hdr->nodeid_activator != rrp_instance->my_nodeid) {
+                       totemnet_token_send (
+                               rrp_instance->net_handles[iface_no],
+                               msg, msg_len);
+               }
+               return;
+       }
 
-       active_instance->totemrrp_context = context; // this should be in 
totemrrp_instance ?
        if (token_seq > active_instance->last_token_seq) {
                memcpy (active_instance->token, msg, msg_len);
                active_instance->token_len = msg_len;
-               for (i = 0; i < instance->interface_count; i++) {
+               for (i = 0; i < rrp_instance->interface_count; i++) {
                        active_instance->last_token_recv[i] = 0;
                }
 
@@ -1259,7 +1438,7 @@ static void active_token_recv (
 
        if (token_seq == active_instance->last_token_seq) {
                active_instance->last_token_recv[iface_no] = 1;
-               for (i = 0; i < instance->interface_count; i++) {
+               for (i = 0; i < rrp_instance->interface_count; i++) {
                        if ((active_instance->last_token_recv[i] == 0) &&
                                active_instance->faulty[i] == 0) {
                                return; /* don't deliver token */
@@ -1267,7 +1446,7 @@ static void active_token_recv (
                }
                active_timer_expired_token_cancel (active_instance);
 
-               instance->totemrrp_deliver_fn (
+               rrp_instance->totemrrp_deliver_fn (
                        context,
                        msg,
                        msg_len);
@@ -1441,13 +1620,14 @@ void rrp_deliver_fn (
        unsigned int token_is;
 
        struct deliver_fn_context *deliver_fn_context = (struct 
deliver_fn_context *)context;
+       const struct message_header *hdr = msg;
 
        deliver_fn_context->instance->totemrrp_token_seqid_get (
                msg,
                &token_seqid,
                &token_is);
 
-       if (token_is) {
+       if (hdr->type == MESSAGE_TYPE_RING_TEST_ACTIVE || hdr->type == 
MESSAGE_TYPE_RING_TEST_ACTIVATE || token_is) {
                /*
                 * Deliver to the token receiver for this rrp algorithm
                 */
@@ -1477,6 +1657,7 @@ void rrp_iface_change_fn (
 {
        struct deliver_fn_context *deliver_fn_context = (struct 
deliver_fn_context *)context;
 
+       deliver_fn_context->instance->my_nodeid = iface_addr->nodeid;
        deliver_fn_context->instance->totemrrp_iface_change_fn (
                deliver_fn_context->context,
                iface_addr,
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to