Due to the way the cluster status and inc_epoch work, this can get us
different historic epochs for different sheep in a cluster. Instead, only
create a previous-generation in-core node list if we don't already have
a valid one.  Now that all recovery code operates only on the in-memory
node lists, that is easily possible.

Signed-off-by: Christoph Hellwig <[email protected]>

diff --git a/sheep/group.c b/sheep/group.c
index 4e83b1e..0edb97e 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -611,23 +611,18 @@ int log_current_epoch(void)
                                current_vnode_info->nr_nodes);
 }
 
-static void log_last_epoch(struct join_message *msg, struct sd_node *joined,
+static struct vnode_info *alloc_old_vnode_info(struct sd_node *joined,
                struct sd_node *nodes, size_t nr_nodes)
 {
-       if ((msg->cluster_status == SD_STATUS_OK ||
-            msg->cluster_status == SD_STATUS_HALT) && msg->inc_epoch) {
-               struct sd_node old_nodes[SD_MAX_NODES];
-               size_t count = 0, i;
-
-               /* exclude the newly added one */
-               for (i = 0; i < nr_nodes; i++) {
-                       if (!node_eq(nodes + i, joined))
-                               old_nodes[count++] = nodes[i];
-               }
-               put_vnode_info(current_vnode_info);
-               current_vnode_info = alloc_vnode_info(old_nodes, count);
-               log_current_epoch();
+       struct sd_node old_nodes[SD_MAX_NODES];
+       size_t count = 0, i;
+
+       /* exclude the newly added one */
+       for (i = 0; i < nr_nodes; i++) {
+               if (!node_eq(nodes + i, joined))
+                       old_nodes[count++] = nodes[i];
        }
+       return alloc_vnode_info(old_nodes, count);
 }
 
 static void finish_join(struct join_message *msg, struct sd_node *joined,
@@ -639,12 +634,6 @@ static void finish_join(struct join_message *msg, struct sd_node *joined,
        sys->nr_copies = msg->nr_copies;
        sys->epoch = msg->epoch;
 
-       /*
-        * Make sure we have an epoch log record for the epoch before
-        * this node joins, as recovery expects this record to exist.
-        */
-       log_last_epoch(msg, joined, nodes, nr_nodes);
-
        if (msg->cluster_status != SD_STATUS_OK) {
                int nr_leave_nodes;
                uint32_t le;
@@ -740,6 +729,12 @@ static void update_cluster_info(struct join_message *msg,
        if (msg->inc_epoch) {
                list_for_each_entry_safe(n, t, &sys->leave_list, list)
                        list_del(&n->list);
+
+               if (!old_vnode_info) {
+                       old_vnode_info =
+                               alloc_old_vnode_info(joined, nodes, nr_nodes);
+               }
+
                start_recovery(current_vnode_info, old_vnode_info);
        }
 

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to