[Sheepdog] [PATCH] fix master selection race

FUJITA Tomonori Thu, 08 Apr 2010 04:03:41 -0700

We wrongly assume that the first node to join corosync sends Sheepdog's
JOIN message before the other nodes do. As a result, we hit a bug where
two nodes temporarily act as the master node at the same time.


This patch makes sure that the first node to join corosync will always
be the master.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
 collie/collie.h |    2 +
 collie/group.c  |   94 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index fac6809..048cc7b 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -74,6 +74,8 @@ struct cluster_info {
        DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
 
        int nr_sobjs;
+
+       struct list_head work_deliver_siblings;
 };
 
 struct cluster_info *sys;
diff --git a/collie/group.c b/collie/group.c
index d659485..56e0fe9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -71,6 +71,7 @@ struct work_deliver {
        struct message_header *msg;
 
        struct work work;
+       struct list_head work_deliver_list;
 };
 
 struct work_confch {
@@ -402,9 +403,6 @@ static void join(struct join_message *msg)
        if (!sys->synchronized)
                return;
 
-       if (!is_master())
-               return;
-
        if (msg->nr_sobjs)
                sys->nr_sobjs = msg->nr_sobjs;
 
@@ -414,12 +412,8 @@ static void join(struct join_message *msg)
                msg->epoch = sys->epoch;
        else
                msg->epoch = 0;
-       list_for_each_entry(node, &sys->cpg_node_list, list) {
-               if (node->nodeid == msg->nodeid && node->pid == msg->pid)
-                       continue;
-               if (node->ent.id == 0)
-                       continue;
 
+       list_for_each_entry(node, &sys->sd_node_list, list) {
                msg->nodes[msg->nr_nodes].nodeid = node->nodeid;
                msg->nodes[msg->nr_nodes].pid = node->pid;
                msg->nodes[msg->nr_nodes].ent = node->ent;
@@ -747,6 +741,8 @@ static void __sd_deliver(struct work *work, int idx)
                        break;
                }
 
+               vprintf(SDOG_DEBUG "will send\n");
+
                m->done = 1;
                send_message(sys->handle, m);
        } else {
@@ -766,8 +762,23 @@ static void __sd_deliver(struct work *work, int idx)
 
 static void __sd_deliver_done(struct work *work, int idx)
 {
-       struct work_deliver *w = container_of(work, struct work_deliver, work);
-       struct message_header *m = w->msg;
+       struct work_deliver *w, *n = NULL;
+       struct message_header *m;
+
+       w = container_of(work, struct work_deliver, work);
+       m = w->msg;
+
+       list_del(&w->work_deliver_list);
+
+       /*
+        * When I finished one message, if I have pending messages, I
+        * need to perform the first of them now.
+        */
+       if (m->done && !list_empty(&sys->work_deliver_siblings)) {
+
+               n = list_first_entry(&sys->work_deliver_siblings,
+                                    struct work_deliver, work_deliver_list);
+       }
 
        /*
         * FIXME: we want to recover only after all nodes are fully
@@ -779,6 +790,9 @@ static void __sd_deliver_done(struct work *work, int idx)
 
        free(w->msg);
        free(w);
+
+       if (n)
+               queue_work(dobj_queue, &n->work);
 }
 
 static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
@@ -800,12 +814,34 @@ static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
        if (!w->msg)
                return;
        memcpy(w->msg, msg, msg_len);
+       INIT_LIST_HEAD(&w->work_deliver_list);
 
        w->work.fn = __sd_deliver;
        w->work.done = __sd_deliver_done;
 
-       if (m->op == SD_MSG_JOIN)
-               w->work.attr = WORK_ORDERED;
+       if (is_master()) {
+               if (!m->done) {
+                       int run = 0;
+
+                       /*
+                        * I can broadcast this message if there is no
+                        * outstanding messages.
+                        */
+                       if (list_empty(&sys->work_deliver_siblings))
+                               run = 1;
+
+                       list_add_tail(&w->work_deliver_list,
+                                     &sys->work_deliver_siblings);
+                       if (run) {
+                               vprintf(SDOG_DEBUG "%u\n", pid);
+                               queue_work(dobj_queue, &w->work);
+                       } else
+                               vprintf(SDOG_DEBUG "%u\n", pid);
+
+                       return;
+               }
+       } else if (m->op == SD_MSG_JOIN)
+                 w->work.attr = WORK_ORDERED;
 
        queue_work(dobj_queue, &w->work);
 }
@@ -815,6 +851,7 @@ static void __sd_confch(struct work *work, int idx)
        struct work_confch *w = container_of(work, struct work_confch, work);
        struct node *node;
        int i;
+       int init = 0;
 
        const struct cpg_address *member_list = w->member_list;
        size_t member_list_entries = w->member_list_entries;
@@ -825,8 +862,10 @@ static void __sd_confch(struct work *work, int idx)
 
        if (member_list_entries == joined_list_entries - left_list_entries &&
            sys->this_nodeid == member_list[0].nodeid &&
-           sys->this_pid == member_list[0].pid)
+           sys->this_pid == member_list[0].pid){
                sys->synchronized = 1;
+               init = 1;
+       }
 
        if (list_empty(&sys->cpg_node_list)) {
                for (i = 0; i < member_list_entries; i++)
@@ -865,6 +904,28 @@ static void __sd_confch(struct work *work, int idx)
                }
        }
 
+       if (init) {
+               struct join_message msg;
+
+               /*
+                * If I'm the first collie to join corosync, I
+                * become the master without sending JOIN.
+                */
+
+               vprintf(SDOG_DEBUG "%d %x\n", sys->this_pid, sys->this_nodeid);
+
+               memset(&msg, 0, sizeof(msg));
+
+               msg.header.from = sys->this_node;
+               msg.nodeid = sys->this_nodeid;
+               msg.pid = sys->this_pid;
+               msg.cluster_status = get_cluster_status(&msg.header.from);
+
+               update_cluster_info(&msg);
+
+               return;
+       }
+
        for (i = 0; i < joined_list_entries; i++) {
                if (sys->this_nodeid == joined_list[i].nodeid &&
                    sys->this_pid == joined_list[i].pid) {
@@ -993,7 +1054,7 @@ int build_node_list(struct list_head *node_list,
        return nr;
 }
 
-static void set_addr(unsigned int nodeid)
+static void set_addr(unsigned int nodeid, int port)
 {
        int ret, nr;
        corosync_cfg_handle_t handle;
@@ -1036,7 +1097,7 @@ static void set_addr(unsigned int nodeid)
 
        inet_ntop(ss->ss_family, saddr, tmp, sizeof(tmp));
 
-       vprintf(SDOG_INFO "addr = %s\n", tmp);
+       vprintf(SDOG_INFO "addr = %s, port = %d\n", tmp, port);
 }
 
 int create_cluster(int port)
@@ -1082,7 +1143,7 @@ join_retry:
        sys->this_nodeid = nodeid;
        sys->this_pid = getpid();
 
-       set_addr(nodeid);
+       set_addr(nodeid, port);
        sys->this_node.port = port;
 
        ret = get_nodeid(&sys->this_node.id);
@@ -1104,6 +1165,7 @@ join_retry:
        INIT_LIST_HEAD(&sys->cpg_node_list);
        INIT_LIST_HEAD(&sys->vm_list);
        INIT_LIST_HEAD(&sys->pending_list);
+       INIT_LIST_HEAD(&sys->work_deliver_siblings);
        cpg_context_set(cpg_handle, sys);
 
        cpg_fd_get(cpg_handle, &fd);
-- 
1.7.0

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

[Sheepdog] [PATCH] fix master selection race

Reply via email to