From: Liu Yuan <tailai...@taobao.com>

Currently, we have to start up the first failed node or the last failed one to
recover a crashed cluster (nodes with different epoch histories). This patch
simply removes this disgusting constraint.

To this point, we can precisely define 'leave node'.

Leave Node:
        Crash cluster: For the master node (the first started node), leave nodes
        are nodes that are contained in the master epoch and are supposed to
        leave during the recovery stage. That is, leave nodes are the nodes that
        let the master know when to recover.

        Shutdown cluster: Leave nodes account for unhealthy nodes that are
        supposed to leave during the recovery stage. This lets the nodes that
        are alive in the cluster know when to recover.

With this patch, no start-up order is imposed on a crashed cluster for it to
recover. We can do this because the epoch on each node contains the node that
has the highest epoch number.

The following steps test this idea:

$ for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ collie/collie cluster format
$ for i in 0 1 2; do pkill -f "sheep /store/$i"; sleep 1; done
$ for i in 1 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ for i in 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done
$ for i in 0 1 2; do ./collie/collie cluster info -p 700$i; done
Cluster status: running

Creation time        Epoch Nodes
2011-09-24 11:45:52      6 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]
2011-09-24 11:45:52      5 [192.168.0.4:7000, 192.168.0.4:7001]
2011-09-24 11:45:52      4 [192.168.0.4:7001]
2011-09-24 11:45:52      3 [192.168.0.4:7002]
2011-09-24 11:45:52      2 [192.168.0.4:7001, 192.168.0.4:7002]
2011-09-24 11:45:52      1 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]
Cluster status: running

Creation time        Epoch Nodes
2011-09-24 11:45:52      6 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]
2011-09-24 11:45:52      5 [192.168.0.4:7000, 192.168.0.4:7001]
2011-09-24 11:45:52      4 [192.168.0.4:7001]
2011-09-24 11:45:52      3 [192.168.0.4:7002]
2011-09-24 11:45:52      2 [192.168.0.4:7001, 192.168.0.4:7002]
2011-09-24 11:45:52      1 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]
Cluster status: running

Creation time        Epoch Nodes
2011-09-24 11:45:52      6 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]
2011-09-24 11:45:52      5 [192.168.0.4:7000, 192.168.0.4:7001]
2011-09-24 11:45:52      4 [192.168.0.4:7001]
2011-09-24 11:45:52      3 [192.168.0.4:7002]
2011-09-24 11:45:52      2 [192.168.0.4:7001, 192.168.0.4:7002]
2011-09-24 11:45:52      1 [192.168.0.4:7000, 192.168.0.4:7001, 
192.168.0.4:7002]

Signed-off-by: Liu Yuan <tailai...@taobao.com>
---
 sheep/group.c |   25 ++++++++++++++++++++++---
 1 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 812f6a0..53846cb 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -435,10 +435,27 @@ static struct sheepdog_node_list_entry 
*find_entry_list(struct sheepdog_node_lis
        return NULL;
 
 }
+
+static struct sheepdog_node_list_entry *find_entry_epoch(struct 
sheepdog_node_list_entry *entry,
+                                                        int epoch)
+{
+       struct sheepdog_node_list_entry nodes[SD_MAX_NODES];
+       int nr, i;
+
+       nr = epoch_log_read(epoch, (char *)nodes, sizeof(nodes));
+       nr /= sizeof(nodes[0]);
+
+       for (i = 0; i < nr; i++)
+               if (node_cmp(&nodes[i], entry) == 0)
+                       return entry;
+
+       return NULL;
+}
+
 static int add_node_to_leave_list(struct message_header *msg)
 {
        int ret = SD_RES_SUCCESS;
-       int nr, i;
+       int nr, i, le = get_latest_epoch();
        LIST_HEAD(tmp_list);
        struct node *n, *t;
        struct join_message *jm;
@@ -450,7 +467,8 @@ static int add_node_to_leave_list(struct message_header 
*msg)
                        goto err;
                }
 
-               if (find_entry_list(&msg->from, &sys->leave_list)) {
+               if (find_entry_list(&msg->from, &sys->leave_list)
+                   || !find_entry_epoch(&msg->from, le)) {
                        free(n);
                        goto ret;
                }
@@ -471,7 +489,8 @@ static int add_node_to_leave_list(struct message_header 
*msg)
                                goto free;
                        }
 
-                       if (find_entry_list(&jm->leave_nodes[i].ent, 
&sys->leave_list)) {
+                       if (find_entry_list(&jm->leave_nodes[i].ent, 
&sys->leave_list)
+                           || !find_entry_epoch(&jm->leave_nodes[i].ent, le)) {
                                free(n);
                                continue;
                        }
-- 
1.7.6.1

-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to