When a sheep with a higher epoch joins, we must kill all other sheep - not
just the current master, but all currently active nodes - to avoid the node
list and epoch history getting out of sync between nodes.

While it would be nice to automatically restart them, that would be a bit
ugly with the current cluster driver architecture and thus is, for now,
left to external management tools.

This patch also removes most of the master transfer special casing in
the cluster drivers - sd_join_handler now uses the right nodes array
and nr_nodes for the new master, and kills all nodes but the new master
directly.

Signed-off-by: Christoph Hellwig <[email protected]>

---
 sheep/group.c |   30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

Index: sheepdog/sheep/group.c
===================================================================
--- sheepdog.orig/sheep/group.c 2012-08-07 16:02:48.042008047 +0200
+++ sheepdog/sheep/group.c      2012-08-07 16:05:15.025343623 +0200
@@ -1092,31 +1092,19 @@ void sd_join_handler(struct sd_node *joi
                }
                break;
        case CJ_RES_MASTER_TRANSFER:
-               update_exceptional_node_list(le, jm);
-
-               /* Sheep needs this to identify itself as master.
-                * Now mastership transfer is done.
-                */
-               if (node_eq(joined, &sys->this_node)) {
-                       sys->epoch = get_latest_epoch();
-                       put_vnode_info(current_vnode_info);
-                       current_vnode_info = alloc_vnode_info(&sys->this_node, 
1);
+               if (!node_eq(joined, &sys->this_node)) {
+                       eprintf("Transferring mastership to %s, exiting.\n",
+                               node_to_str(joined));
+                       sys->cdrv->leave();
+                       exit(1);
                }
 
-               nr_local = get_nodes_nr_epoch(sys->epoch);
-               nr = nr_members;
-               nr_failed = get_nodes_nr_from(&sys->failed_nodes);
-               nr_delayed_nodes = get_nodes_nr_from(&sys->delayed_nodes);
+               eprintf("Took mastership\n");
 
-               dprintf("%d == %d + %d\n", nr_local, nr, nr_failed);
-               if (nr_local == nr + nr_failed - nr_delayed_nodes) {
-                       sys->status = SD_STATUS_OK;
-                       log_current_epoch();
-               }
+               assert(current_vnode_info == NULL);
 
-               if (node_eq(joined, &sys->this_node))
-                       /* this output is used for testing */
-                       vprintf(SDOG_DEBUG, "join Sheepdog cluster\n");
+               sys->epoch = get_latest_epoch();
+               current_vnode_info = alloc_vnode_info(&sys->this_node, 1);
                break;
        }
 }
Index: sheepdog/sheep/cluster/accord.c
===================================================================
--- sheepdog.orig/sheep/cluster/accord.c        2012-08-07 15:57:43.252003392 +0200
+++ sheepdog/sheep/cluster/accord.c     2012-08-07 16:03:53.025342370 +0200
@@ -485,24 +485,8 @@ static void acrd_handler(int listen_fd,
                ev.join_result = res;
                ev.type = EVENT_JOIN_RESPONSE;
                acrd_queue_push_back(ahandle, &ev);
-
-               if (res == CJ_RES_MASTER_TRANSFER) {
-                       eprintf("failed to join sheepdog cluster: "
-                               "please retry when master is up\n");
-                       exit(1);
-               }
                break;
        case EVENT_JOIN_RESPONSE:
-               if (ev.join_result == CJ_RES_MASTER_TRANSFER) {
-                       /* FIXME: This code is tricky, but Sheepdog assumes 
that */
-                       /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
-                       ev.nr_nodes = 1;
-                       ev.nodes[0] = this_node;
-                       ev.ids[0] = this_id;
-                       acrd_queue_push_back(ahandle, &ev);
-                       acrd_queue_pop(ahandle, &ev);
-               }
-
                sd_join_handler(&ev.sender, ev.nodes, ev.nr_nodes,
                                    ev.join_result, ev.buf);
                break;
Index: sheepdog/sheep/cluster/corosync.c
===================================================================
--- sheepdog.orig/sheep/cluster/corosync.c      2012-08-07 15:57:43.252003392 +0200
+++ sheepdog/sheep/cluster/corosync.c   2012-08-07 16:04:01.215342498 +0200
@@ -308,11 +308,6 @@ static int __corosync_dispatch_one(struc
                             &cevent->sender, cpg_nodes, nr_cpg_nodes,
                             cevent->msg, cevent->msg_len);
 
-               if (res == CJ_RES_MASTER_TRANSFER) {
-                       eprintf("failed to join sheepdog cluster: please retry 
when master is up\n");
-                       exit(1);
-               }
-
                cevent->callbacked = 1;
                return 0;
        case COROSYNC_EVENT_TYPE_JOIN_RESPONSE:
Index: sheepdog/sheep/cluster/local.c
===================================================================
--- sheepdog.orig/sheep/cluster/local.c 2012-08-07 15:57:43.252003392 +0200
+++ sheepdog/sheep/cluster/local.c      2012-08-07 16:04:11.522009323 +0200
@@ -363,25 +363,8 @@ static bool local_process_event(void)
                msync(ev, sizeof(*ev), MS_SYNC);
 
                shm_queue_notify();
-
-               if (res == CJ_RES_MASTER_TRANSFER) {
-                       eprintf("failed to join sheepdog cluster: "
-                               "please retry when master is up\n");
-                       shm_queue_unlock();
-                       exit(1);
-               }
                return false;
        case EVENT_JOIN_RESPONSE:
-               if (ev->join_result == CJ_RES_MASTER_TRANSFER) {
-                       /* FIXME: This code is tricky, but Sheepdog assumes 
that */
-                       /* nr_nodes = 1 when join_result = MASTER_TRANSFER... */
-                       ev->nr_nodes = 1;
-                       ev->nodes[0] = this_node;
-                       ev->pids[0] = getpid();
-
-                       shm_queue_set_chksum();
-               }
-
                sd_join_handler(&ev->sender, ev->nodes, ev->nr_nodes,
                                    ev->join_result, ev->buf);
                shm_queue_pop();
Index: sheepdog/sheep/cluster/zookeeper.c
===================================================================
--- sheepdog.orig/sheep/cluster/zookeeper.c     2012-08-07 15:57:43.252003392 +0200
+++ sheepdog/sheep/cluster/zookeeper.c  2012-08-07 16:05:07.708676847 +0200
@@ -688,13 +688,6 @@ static void zk_handler(int listen_fd, in
 
                dprintf("I'm master, push back join event\n");
                zk_queue_push_back(zhandle, &ev);
-
-               if (res == CJ_RES_MASTER_TRANSFER) {
-                       eprintf("failed to join sheepdog cluster: "
-                               "please retry when master is up\n");
-                       zk_leave();
-                       exit(1);
-               }
                break;
        case EVENT_JOIN_RESPONSE:
                dprintf("JOIN RESPONSE\n");
@@ -720,17 +713,6 @@ static void zk_handler(int listen_fd, in
                        }
                }
 
-               if (node_eq(&ev.sender.node, &this_node.node))
-                       zk_member_init(zhandle);
-
-               if (ev.join_result == CJ_RES_MASTER_TRANSFER)
-                       /*
-                        * Sheepdog assumes that only one sheep(master will kill
-                        * itself) is alive in MASTER_TRANSFER scenario. So only
-                        * the joining sheep will run into here.
-                        */
-                       node_btree_clear(&zk_node_btroot);
-
                node_btree_add(&zk_node_btroot, &ev.sender);
                dprintf("one sheep joined[down], nr_nodes:%zu, sender:%s,"
                        " joined:%d\n", nr_zk_nodes,
@@ -743,6 +725,7 @@ static void zk_handler(int listen_fd, in
                        sprintf(path, MEMBER_ZNODE "/%s", 
node_to_str(&ev.sender.node));
                        if (node_eq(&ev.sender.node, &this_node.node)) {
                                dprintf("create path:%s\n", path);
+                               zk_member_init(zhandle);
                                rc = zk_create(zhandle, path, (char 
*)&ev.sender, sizeof(ev.sender),
                                        &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, 
NULL, 0);
                                if (rc != ZOK)

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to