When a sheep with a higher epoch joins, we must kill all other sheep - not just the current master, but all currently active nodes - to avoid the node list and epoch history getting out of sync between nodes.
While it would be nice to automatically restart them, that would be a bit ugly with the current cluster driver architecture and thus is for now left to external management tools. This patch also removes most of the master transfer special casing in the cluster drivers - sd_join_handler now uses the right nodes array and nr_nodes for the new master, and kills all nodes but the new master directly. Signed-off-by: Christoph Hellwig <[email protected]> --- sheep/group.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2012-08-07 16:02:48.042008047 +0200 +++ sheepdog/sheep/group.c 2012-08-07 16:05:15.025343623 +0200 @@ -1092,31 +1092,19 @@ void sd_join_handler(struct sd_node *joi } break; case CJ_RES_MASTER_TRANSFER: - update_exceptional_node_list(le, jm); - - /* Sheep needs this to identify itself as master. - * Now mastership transfer is done. 
- */ - if (node_eq(joined, &sys->this_node)) { - sys->epoch = get_latest_epoch(); - put_vnode_info(current_vnode_info); - current_vnode_info = alloc_vnode_info(&sys->this_node, 1); + if (!node_eq(joined, &sys->this_node)) { + eprintf("Transferring mastership to %s, exiting.\n", + node_to_str(joined)); + sys->cdrv->leave(); + exit(1); } - nr_local = get_nodes_nr_epoch(sys->epoch); - nr = nr_members; - nr_failed = get_nodes_nr_from(&sys->failed_nodes); - nr_delayed_nodes = get_nodes_nr_from(&sys->delayed_nodes); + eprintf("Took mastership\n"); - dprintf("%d == %d + %d\n", nr_local, nr, nr_failed); - if (nr_local == nr + nr_failed - nr_delayed_nodes) { - sys->status = SD_STATUS_OK; - log_current_epoch(); - } + assert(current_vnode_info == NULL); - if (node_eq(joined, &sys->this_node)) - /* this output is used for testing */ - vprintf(SDOG_DEBUG, "join Sheepdog cluster\n"); + sys->epoch = get_latest_epoch(); + current_vnode_info = alloc_vnode_info(&sys->this_node, 1); break; } } Index: sheepdog/sheep/cluster/accord.c =================================================================== --- sheepdog.orig/sheep/cluster/accord.c 2012-08-07 15:57:43.252003392 +0200 +++ sheepdog/sheep/cluster/accord.c 2012-08-07 16:03:53.025342370 +0200 @@ -485,24 +485,8 @@ static void acrd_handler(int listen_fd, ev.join_result = res; ev.type = EVENT_JOIN_RESPONSE; acrd_queue_push_back(ahandle, &ev); - - if (res == CJ_RES_MASTER_TRANSFER) { - eprintf("failed to join sheepdog cluster: " - "please retry when master is up\n"); - exit(1); - } break; case EVENT_JOIN_RESPONSE: - if (ev.join_result == CJ_RES_MASTER_TRANSFER) { - /* FIXME: This code is tricky, but Sheepdog assumes that */ - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... 
*/ - ev.nr_nodes = 1; - ev.nodes[0] = this_node; - ev.ids[0] = this_id; - acrd_queue_push_back(ahandle, &ev); - acrd_queue_pop(ahandle, &ev); - } - sd_join_handler(&ev.sender, ev.nodes, ev.nr_nodes, ev.join_result, ev.buf); break; Index: sheepdog/sheep/cluster/corosync.c =================================================================== --- sheepdog.orig/sheep/cluster/corosync.c 2012-08-07 15:57:43.252003392 +0200 +++ sheepdog/sheep/cluster/corosync.c 2012-08-07 16:04:01.215342498 +0200 @@ -308,11 +308,6 @@ static int __corosync_dispatch_one(struc &cevent->sender, cpg_nodes, nr_cpg_nodes, cevent->msg, cevent->msg_len); - if (res == CJ_RES_MASTER_TRANSFER) { - eprintf("failed to join sheepdog cluster: please retry when master is up\n"); - exit(1); - } - cevent->callbacked = 1; return 0; case COROSYNC_EVENT_TYPE_JOIN_RESPONSE: Index: sheepdog/sheep/cluster/local.c =================================================================== --- sheepdog.orig/sheep/cluster/local.c 2012-08-07 15:57:43.252003392 +0200 +++ sheepdog/sheep/cluster/local.c 2012-08-07 16:04:11.522009323 +0200 @@ -363,25 +363,8 @@ static bool local_process_event(void) msync(ev, sizeof(*ev), MS_SYNC); shm_queue_notify(); - - if (res == CJ_RES_MASTER_TRANSFER) { - eprintf("failed to join sheepdog cluster: " - "please retry when master is up\n"); - shm_queue_unlock(); - exit(1); - } return false; case EVENT_JOIN_RESPONSE: - if (ev->join_result == CJ_RES_MASTER_TRANSFER) { - /* FIXME: This code is tricky, but Sheepdog assumes that */ - /* nr_nodes = 1 when join_result = MASTER_TRANSFER... 
*/ - ev->nr_nodes = 1; - ev->nodes[0] = this_node; - ev->pids[0] = getpid(); - - shm_queue_set_chksum(); - } - sd_join_handler(&ev->sender, ev->nodes, ev->nr_nodes, ev->join_result, ev->buf); shm_queue_pop(); Index: sheepdog/sheep/cluster/zookeeper.c =================================================================== --- sheepdog.orig/sheep/cluster/zookeeper.c 2012-08-07 15:57:43.252003392 +0200 +++ sheepdog/sheep/cluster/zookeeper.c 2012-08-07 16:05:07.708676847 +0200 @@ -688,13 +688,6 @@ static void zk_handler(int listen_fd, in dprintf("I'm master, push back join event\n"); zk_queue_push_back(zhandle, &ev); - - if (res == CJ_RES_MASTER_TRANSFER) { - eprintf("failed to join sheepdog cluster: " - "please retry when master is up\n"); - zk_leave(); - exit(1); - } break; case EVENT_JOIN_RESPONSE: dprintf("JOIN RESPONSE\n"); @@ -720,17 +713,6 @@ static void zk_handler(int listen_fd, in } } - if (node_eq(&ev.sender.node, &this_node.node)) - zk_member_init(zhandle); - - if (ev.join_result == CJ_RES_MASTER_TRANSFER) - /* - * Sheepdog assumes that only one sheep(master will kill - * itself) is alive in MASTER_TRANSFER scenario. So only - * the joining sheep will run into here. - */ - node_btree_clear(&zk_node_btroot); - node_btree_add(&zk_node_btroot, &ev.sender); dprintf("one sheep joined[down], nr_nodes:%zu, sender:%s," " joined:%d\n", nr_zk_nodes, @@ -743,6 +725,7 @@ static void zk_handler(int listen_fd, in sprintf(path, MEMBER_ZNODE "/%s", node_to_str(&ev.sender.node)); if (node_eq(&ev.sender.node, &this_node.node)) { dprintf("create path:%s\n", path); + zk_member_init(zhandle); rc = zk_create(zhandle, path, (char *)&ev.sender, sizeof(ev.sender), &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, NULL, 0); if (rc != ZOK) -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
