Current recovery process can cause revival of orphan objects. This patch solves this problem.
How to: exclude a newly joining sheep from recovery process if a cluster is already in SD_STATUS_OK. It emans the cluster has every replica already. Cc: Valerio Pachera <[email protected]> Signed-off-by: Hitoshi Mitake <[email protected]> --- sheep/group.c | 20 ++++++++++++++------ sheep/md.c | 2 +- sheep/ops.c | 4 ++-- sheep/recovery.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- sheep/sheep_priv.h | 3 ++- 5 files changed, 64 insertions(+), 11 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 2b98a9b..34e4963 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -838,7 +838,7 @@ static bool membership_changed(const struct cluster_info *cinfo, static void update_cluster_info(const struct cluster_info *cinfo, const struct sd_node *joined, const struct rb_root *nroot, - size_t nr_nodes) + size_t nr_nodes, enum sd_status prev_status) { struct vnode_info *old_vnode_info; @@ -890,6 +890,8 @@ static void update_cluster_info(const struct cluster_info *cinfo, if (membership_changed(cinfo, nroot, nr_nodes)) { int ret; + struct sd_node *excluded = NULL; + if (old_vnode_info) put_vnode_info(old_vnode_info); @@ -899,12 +901,17 @@ static void update_cluster_info(const struct cluster_info *cinfo, panic("cannot log current epoch %d", sys->cinfo.epoch); + if (prev_status == SD_STATUS_OK || + (prev_status == SD_STATUS_WAIT && + !node_cmp(joined, &sys->this_node))) + excluded = (struct sd_node *)joined; + start_recovery(main_thread_get(current_vnode_info), - old_vnode_info, true); + old_vnode_info, true, excluded); } else if (!was_cluster_shutdowned()) { start_recovery(main_thread_get(current_vnode_info), main_thread_get(current_vnode_info), - false); + false, NULL); } set_cluster_shutdown(false); } @@ -1118,6 +1125,7 @@ main_fn void sd_accept_handler(const struct sd_node *joined, { const struct cluster_info *cinfo = opaque; struct sd_node *n; + enum sd_status prev_status = sys->cinfo.status; if (node_is_local(joined) && !cluster_join_check(cinfo)) { sd_err("failed to join Sheepdog"); @@ -1134,7 +1142,7 @@ main_fn void sd_accept_handler(const struct sd_node *joined, if (sys->cinfo.status == SD_STATUS_SHUTDOWN) return; - update_cluster_info(cinfo, joined, nroot, nr_nodes); + update_cluster_info(cinfo, joined, nroot, nr_nodes, prev_status); if (node_is_local(joined)) { /* this output is used for testing */ @@ -1180,7 +1188,7 @@ main_fn void sd_leave_handler(const struct sd_node *left, if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), - old_vnode_info, true); + old_vnode_info, true, NULL); } put_vnode_info(old_vnode_info); @@ -1220,7 +1228,7 @@ static void kick_node_recover(void) ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); - start_recovery(main_thread_get(current_vnode_info), old, true); + start_recovery(main_thread_get(current_vnode_info), old, true, NULL); put_vnode_info(old); } diff --git a/sheep/md.c b/sheep/md.c index 0313b95..a90fdb9 100644 --- a/sheep/md.c +++ b/sheep/md.c @@ -520,7 +520,7 @@ static inline void kick_recover(void) if (is_cluster_diskmode(&sys->cinfo)) sys->cdrv->update_node(&sys->this_node); else { - start_recovery(vinfo, vinfo, false); + start_recovery(vinfo, vinfo, false, NULL); put_vnode_info(vinfo); } } diff --git a/sheep/ops.c b/sheep/ops.c index a617a83..2abaf13 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -618,7 +618,7 @@ static int cluster_force_recover_main(const struct sd_req *req, vnode_info = get_vnode_info(); old_vnode_info = alloc_vnode_info(&nroot); - start_recovery(vnode_info, old_vnode_info, true); + start_recovery(vnode_info, old_vnode_info, true, NULL); put_vnode_info(vnode_info); put_vnode_info(old_vnode_info); return ret; @@ -761,7 +761,7 @@ static int cluster_alter_vdi_copy(const struct sd_req *req, struct sd_rsp *rsp, add_vdi_state(vid, nr_copies, false, 0); vinfo = get_vnode_info(); - start_recovery(vinfo, vinfo, false); + start_recovery(vinfo, vinfo, false, NULL); put_vnode_info(vinfo); return SD_RES_SUCCESS; diff --git a/sheep/recovery.c b/sheep/recovery.c index 7874fc9..a8fabad 100644 --- a/sheep/recovery.c +++ b/sheep/recovery.c @@ -76,6 +76,8 @@ struct recovery_info { int max_epoch; struct vnode_info **vinfo_array; struct sd_mutex vinfo_lock; + + struct sd_node *excluded; }; static struct recovery_info *next_rinfo; @@ -188,6 +190,8 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx, uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); int edp = ec_policy_to_dp(policy, NULL, NULL); int ret; + struct sd_node *excluded = row->base.rinfo->excluded; + again: if (unlikely(old->nr_zones < edp)) { if (search_erasure_object(oid, idx, &old->nroot, rw, @@ -202,6 +206,11 @@ again: oid, epoch, tgt_epoch, idx, node_to_str(node)); if (invalid_node(node, rw->cur_vinfo)) goto rollback; + if (excluded && !node_cmp(excluded, node)) { + sd_debug("skipping reading object from %s", node_to_str(node)); + goto rollback; + } + sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; @@ -316,6 +325,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row, uint32_t epoch = row->base.epoch; int nr_copies, ret = SD_RES_SUCCESS, start = 0; bool fully_replicated = true; + struct sd_node *excluded = row->base.rinfo->excluded; nr_copies = get_obj_copy_number(oid, old->nr_zones); @@ -341,6 +351,12 @@ static int recover_object_from_replica(struct recovery_obj_work *row, if (invalid_node(node, row->base.cur_vinfo)) continue; + if (excluded && !node_cmp(excluded, node)) { + sd_debug("skipping reading from %s because it is" + " excluded", node_to_str(node)); + continue; + } + ret = recover_object_from(row, node, tgt_epoch); switch (ret) { case SD_RES_SUCCESS: @@ -551,12 +567,20 @@ static void recover_object_work(struct work *work) uint64_t oid = row->oid; struct vnode_info *cur = rw->cur_vinfo; int ret, epoch; + struct sd_node *excluded = rw->rinfo->excluded; if (sd_store->exist(oid, local_ec_index(cur, oid))) { sd_debug("the object is already recovered"); return; } + if (excluded && !node_cmp(excluded, &sys->this_node)) { + sd_debug("skipping recovery from stale dir because" + " I'm newly joining node (%s)", + node_to_str(&sys->this_node)); + goto remote_recover; + } + /* find object in the stale directory */ if (!is_erasure_oid(oid)) for (epoch = sys_epoch() - 1; epoch >= last_gathered_epoch; @@ -570,6 +594,7 @@ static void recover_object_work(struct work *work) } } +remote_recover: ret = do_recover_object(row); if (ret != 0) sd_err("failed to recover object %"PRIx64, oid); @@ -1048,6 +1073,7 @@ static void prepare_object_list(struct work *work) int start = random() % nr_nodes, i, end = nr_nodes; uint64_t *oids; struct sd_node *nodes; + struct sd_node *excluded = rw->rinfo->excluded; if (node_is_gateway_only()) return; @@ -1068,6 +1094,13 @@ again: goto out; } + if (excluded && !node_cmp(excluded, node)) { + sd_info("skipping object list reading from %s because" + "it is marked as excluded node", + node_to_str(excluded)); + continue; + } + oids = fetch_object_list(node, rw->epoch, &nr_oids); if (!oids) continue; @@ -1086,8 +1119,17 @@ out: free(nodes); } +/* + * notes on the parameter excluded of start_recovery(): + * When a status of a cluster is SD_STATUS_OK and a new node is joining to the + * cluster, the recovery process should avoid reading objects from the newly + * joining node. Because the newly joining node can have orphan objects. + * If the cluster status is already SD_STATUS_OK, the cluster must have every + * replica, so the newly joining node shouldn't provide any objects for the + * recovery process. + */ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, - bool epoch_lifted) + bool epoch_lifted, struct sd_node *excluded) { struct recovery_info *rinfo; @@ -1109,6 +1151,8 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, rinfo->cur_vinfo = grab_vnode_info(cur_vinfo); rinfo->old_vinfo = grab_vnode_info(old_vinfo); + rinfo->excluded = excluded; + if (!node_is_gateway_only()) sd_store->update_epoch(rinfo->tgt_epoch); diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 5fc6b90..8310728 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -419,7 +419,8 @@ int init_config_file(void); int get_obj_list(const struct sd_req *, struct sd_rsp *, void *); int objlist_cache_cleanup(uint32_t vid); -int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool); +int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool, + struct sd_node *); bool oid_in_recovery(uint64_t oid); bool node_in_recovery(void); void get_recovery_state(struct recovery_state *state); -- 1.8.3.2 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
