Current recovery process can cause revival of orphan objects. This
patch solves this problem.

How to: exclude a newly joining sheep from recovery process if a
cluster is already in SD_STATUS_OK. It emans the cluster has every
replica already.

Cc: Valerio Pachera <[email protected]>
Signed-off-by: Hitoshi Mitake <[email protected]>
---
 sheep/group.c      | 20 ++++++++++++++------
 sheep/md.c         |  2 +-
 sheep/ops.c        |  4 ++--
 sheep/recovery.c   | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 sheep/sheep_priv.h |  3 ++-
 5 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 2b98a9b..34e4963 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -838,7 +838,7 @@ static bool membership_changed(const struct cluster_info 
*cinfo,
 static void update_cluster_info(const struct cluster_info *cinfo,
                                const struct sd_node *joined,
                                const struct rb_root *nroot,
-                               size_t nr_nodes)
+                               size_t nr_nodes, enum sd_status prev_status)
 {
        struct vnode_info *old_vnode_info;
 
@@ -890,6 +890,8 @@ static void update_cluster_info(const struct cluster_info 
*cinfo,
 
                if (membership_changed(cinfo, nroot, nr_nodes)) {
                        int ret;
+                       struct sd_node *excluded = NULL;
+
                        if (old_vnode_info)
                                put_vnode_info(old_vnode_info);
 
@@ -899,12 +901,17 @@ static void update_cluster_info(const struct cluster_info 
*cinfo,
                                panic("cannot log current epoch %d",
                                      sys->cinfo.epoch);
 
+                       if (prev_status == SD_STATUS_OK ||
+                           (prev_status == SD_STATUS_WAIT &&
+                            !node_cmp(joined, &sys->this_node)))
+                               excluded = (struct sd_node *)joined;
+
                        start_recovery(main_thread_get(current_vnode_info),
-                                      old_vnode_info, true);
+                                      old_vnode_info, true, excluded);
                } else if (!was_cluster_shutdowned()) {
                        start_recovery(main_thread_get(current_vnode_info),
                                       main_thread_get(current_vnode_info),
-                                      false);
+                                      false, NULL);
                }
                set_cluster_shutdown(false);
        }
@@ -1118,6 +1125,7 @@ main_fn void sd_accept_handler(const struct sd_node 
*joined,
 {
        const struct cluster_info *cinfo = opaque;
        struct sd_node *n;
+       enum sd_status prev_status = sys->cinfo.status;
 
        if (node_is_local(joined) && !cluster_join_check(cinfo)) {
                sd_err("failed to join Sheepdog");
@@ -1134,7 +1142,7 @@ main_fn void sd_accept_handler(const struct sd_node 
*joined,
        if (sys->cinfo.status == SD_STATUS_SHUTDOWN)
                return;
 
-       update_cluster_info(cinfo, joined, nroot, nr_nodes);
+       update_cluster_info(cinfo, joined, nroot, nr_nodes, prev_status);
 
        if (node_is_local(joined)) {
                /* this output is used for testing */
@@ -1180,7 +1188,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
                if (ret != 0)
                        panic("cannot log current epoch %d", sys->cinfo.epoch);
                start_recovery(main_thread_get(current_vnode_info),
-                              old_vnode_info, true);
+                              old_vnode_info, true, NULL);
        }
 
        put_vnode_info(old_vnode_info);
@@ -1220,7 +1228,7 @@ static void kick_node_recover(void)
        ret = inc_and_log_epoch();
        if (ret != 0)
                panic("cannot log current epoch %d", sys->cinfo.epoch);
-       start_recovery(main_thread_get(current_vnode_info), old, true);
+       start_recovery(main_thread_get(current_vnode_info), old, true, NULL);
        put_vnode_info(old);
 }
 
diff --git a/sheep/md.c b/sheep/md.c
index 0313b95..a90fdb9 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -520,7 +520,7 @@ static inline void kick_recover(void)
        if (is_cluster_diskmode(&sys->cinfo))
                sys->cdrv->update_node(&sys->this_node);
        else {
-               start_recovery(vinfo, vinfo, false);
+               start_recovery(vinfo, vinfo, false, NULL);
                put_vnode_info(vinfo);
        }
 }
diff --git a/sheep/ops.c b/sheep/ops.c
index a617a83..2abaf13 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -618,7 +618,7 @@ static int cluster_force_recover_main(const struct sd_req 
*req,
 
        vnode_info = get_vnode_info();
        old_vnode_info = alloc_vnode_info(&nroot);
-       start_recovery(vnode_info, old_vnode_info, true);
+       start_recovery(vnode_info, old_vnode_info, true, NULL);
        put_vnode_info(vnode_info);
        put_vnode_info(old_vnode_info);
        return ret;
@@ -761,7 +761,7 @@ static int cluster_alter_vdi_copy(const struct sd_req *req, 
struct sd_rsp *rsp,
        add_vdi_state(vid, nr_copies, false, 0);
 
        vinfo = get_vnode_info();
-       start_recovery(vinfo, vinfo, false);
+       start_recovery(vinfo, vinfo, false, NULL);
        put_vnode_info(vinfo);
 
        return SD_RES_SUCCESS;
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 7874fc9..a8fabad 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -76,6 +76,8 @@ struct recovery_info {
        int max_epoch;
        struct vnode_info **vinfo_array;
        struct sd_mutex vinfo_lock;
+
+       struct sd_node *excluded;
 };
 
 static struct recovery_info *next_rinfo;
@@ -188,6 +190,8 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx,
        uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
        int edp = ec_policy_to_dp(policy, NULL, NULL);
        int ret;
+       struct sd_node *excluded = row->base.rinfo->excluded;
+
 again:
        if (unlikely(old->nr_zones < edp)) {
                if (search_erasure_object(oid, idx, &old->nroot, rw,
@@ -202,6 +206,11 @@ again:
                 oid, epoch, tgt_epoch, idx, node_to_str(node));
        if (invalid_node(node, rw->cur_vinfo))
                goto rollback;
+       if (excluded && !node_cmp(excluded, node)) {
+               sd_debug("skipping reading object from %s", node_to_str(node));
+               goto rollback;
+       }
+
        sd_init_req(&hdr, SD_OP_READ_PEER);
        hdr.epoch = epoch;
        hdr.flags = SD_FLAG_CMD_RECOVERY;
@@ -316,6 +325,7 @@ static int recover_object_from_replica(struct 
recovery_obj_work *row,
        uint32_t epoch = row->base.epoch;
        int nr_copies, ret = SD_RES_SUCCESS, start = 0;
        bool fully_replicated = true;
+       struct sd_node *excluded = row->base.rinfo->excluded;
 
        nr_copies = get_obj_copy_number(oid, old->nr_zones);
 
@@ -341,6 +351,12 @@ static int recover_object_from_replica(struct 
recovery_obj_work *row,
                if (invalid_node(node, row->base.cur_vinfo))
                        continue;
 
+               if (excluded && !node_cmp(excluded, node)) {
+                       sd_debug("skipping reading from %s because it is"
+                                " excluded", node_to_str(node));
+                       continue;
+               }
+
                ret = recover_object_from(row, node, tgt_epoch);
                switch (ret) {
                case SD_RES_SUCCESS:
@@ -551,12 +567,20 @@ static void recover_object_work(struct work *work)
        uint64_t oid = row->oid;
        struct vnode_info *cur = rw->cur_vinfo;
        int ret, epoch;
+       struct sd_node *excluded = rw->rinfo->excluded;
 
        if (sd_store->exist(oid, local_ec_index(cur, oid))) {
                sd_debug("the object is already recovered");
                return;
        }
 
+       if (excluded && !node_cmp(excluded, &sys->this_node)) {
+               sd_debug("skipping recovery from stale dir because"
+                        " I'm newly joining node (%s)",
+                        node_to_str(&sys->this_node));
+               goto remote_recover;
+       }
+
        /* find object in the stale directory */
        if (!is_erasure_oid(oid))
                for (epoch = sys_epoch() - 1; epoch >= last_gathered_epoch;
@@ -570,6 +594,7 @@ static void recover_object_work(struct work *work)
                        }
                }
 
+remote_recover:
        ret = do_recover_object(row);
        if (ret != 0)
                sd_err("failed to recover object %"PRIx64, oid);
@@ -1048,6 +1073,7 @@ static void prepare_object_list(struct work *work)
        int start = random() % nr_nodes, i, end = nr_nodes;
        uint64_t *oids;
        struct sd_node *nodes;
+       struct sd_node *excluded = rw->rinfo->excluded;
 
        if (node_is_gateway_only())
                return;
@@ -1068,6 +1094,13 @@ again:
                        goto out;
                }
 
+               if (excluded && !node_cmp(excluded, node)) {
+                       sd_info("skipping object list reading from %s because"
+                               "it is marked as excluded node",
+                               node_to_str(excluded));
+                       continue;
+               }
+
                oids = fetch_object_list(node, rw->epoch, &nr_oids);
                if (!oids)
                        continue;
@@ -1086,8 +1119,17 @@ out:
        free(nodes);
 }
 
+/*
+ * notes on the parameter excluded of start_recovery():
+ * When a status of a cluster is SD_STATUS_OK and a new node is joining to the
+ * cluster, the recovery process should avoid reading objects from the newly
+ * joining node. Because the newly joining node can have orphan objects.
+ * If the cluster status is already SD_STATUS_OK, the cluster must have every
+ * replica, so the newly joining node shouldn't provide any objects for the
+ * recovery process.
+ */
 int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
-                  bool epoch_lifted)
+                  bool epoch_lifted, struct sd_node *excluded)
 {
        struct recovery_info *rinfo;
 
@@ -1109,6 +1151,8 @@ int start_recovery(struct vnode_info *cur_vinfo, struct 
vnode_info *old_vinfo,
        rinfo->cur_vinfo = grab_vnode_info(cur_vinfo);
        rinfo->old_vinfo = grab_vnode_info(old_vinfo);
 
+       rinfo->excluded = excluded;
+
        if (!node_is_gateway_only())
                sd_store->update_epoch(rinfo->tgt_epoch);
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 5fc6b90..8310728 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -419,7 +419,8 @@ int init_config_file(void);
 int get_obj_list(const struct sd_req *, struct sd_rsp *, void *);
 int objlist_cache_cleanup(uint32_t vid);
 
-int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool);
+int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool,
+                  struct sd_node *);
 bool oid_in_recovery(uint64_t oid);
 bool node_in_recovery(void);
 void get_recovery_state(struct recovery_state *state);
-- 
1.8.3.2

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to