If qemu dies during a write-object operation, the replicated copies of the object can be left inconsistent with each other. This patch restores that consistency in the object recovery routines.
Signed-off-by: MORITA Kazutaka <[email protected]> --- collie/collie.h | 2 +- collie/group.c | 30 ++++++++++++++++++++++++++++-- collie/store.c | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 71 insertions(+), 16 deletions(-) diff --git a/collie/collie.h b/collie/collie.h index 826ac3a..734b1a4 100644 --- a/collie/collie.h +++ b/collie/collie.h @@ -130,7 +130,7 @@ int remove_epoch(int epoch); int set_cluster_ctime(uint64_t ctime); uint64_t get_cluster_ctime(void); -int start_recovery(uint32_t epoch); +int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis); int start_deletion(uint64_t oid); static inline int is_myself(struct sheepdog_node_list_entry *e) diff --git a/collie/group.c b/collie/group.c index 1f5544a..d8ebad9 100644 --- a/collie/group.c +++ b/collie/group.c @@ -90,6 +90,9 @@ struct work_confch { struct cpg_address *joined_list; size_t joined_list_entries; + unsigned long *failed_vdis; + int nr_failed_vdis; + struct work work; }; @@ -881,7 +884,7 @@ static void __sd_deliver_done(struct work *work, int idx) */ if (m->state == DM_FIN && m->op == SD_MSG_JOIN && sys->epoch >= 2) - start_recovery(sys->epoch); + start_recovery(sys->epoch, NULL, 0); free(w->msg); free(w); @@ -1051,7 +1054,12 @@ static void __sd_confch(struct work *work, int idx) int nr; struct sheepdog_node_list_entry e[SD_MAX_NODES]; struct vm *vm, *n; + int ret, size; + uint64_t oid; + void *buf; + size = sizeof(*w->failed_vdis) * 64; + w->failed_vdis = malloc(size); list_for_each_entry_safe(vm, n, &sys->vm_list, list) { if (memcmp(vm->ent.host_addr, node->ent.addr, sizeof(node->ent.addr)) != 0) @@ -1059,6 +1067,23 @@ static void __sd_confch(struct work *work, int idx) if (vm->ent.host_port != node->ent.port) continue; + if (w->nr_failed_vdis * sizeof(*w->failed_vdis) >= size) { + size *= 2; + buf = realloc(w->failed_vdis, size); + if (!buf) { + eprintf("out of memory, %d\n", size); + break; + } + w->failed_vdis = buf; + } + + ret = 
lookup_vdi((char *)vm->ent.name, + sizeof(vm->ent.name), &oid, 0); + if (ret == SD_RES_SUCCESS) + w->failed_vdis[w->nr_failed_vdis++] = oid_to_bit(oid); + else + eprintf("cannot find vdi %s\n", vm->ent.name); + list_del(&vm->list); free(vm); } @@ -1144,12 +1169,13 @@ static void __sd_confch_done(struct work *work, int idx) if (w->left_list_entries) { if (w->left_list_entries > 1) eprintf("we can't handle %Zd\n", w->left_list_entries); - start_recovery(sys->epoch); + start_recovery(sys->epoch, w->failed_vdis, w->nr_failed_vdis); } free(w->member_list); free(w->left_list); free(w->joined_list); + free(w->failed_vdis); free(w); } diff --git a/collie/store.c b/collie/store.c index 0fa711e..d89433e 100644 --- a/collie/store.c +++ b/collie/store.c @@ -959,6 +959,9 @@ struct recovery_work { struct work work; struct list_head rw_siblings; + unsigned long *failed_vdis; + int nr_failed_vdis; + int count; char *buf; }; @@ -1153,6 +1156,7 @@ static void recover_one(struct work *work, int idx) int old_nr, cur_nr; uint32_t epoch = rw->epoch; int i, my_idx = -1, copy_idx, cur_idx = -1; + int is_failed_oid = 0; eprintf("%d %d, %16lx\n", rw->done, rw->count, oid); @@ -1180,22 +1184,30 @@ static void recover_one(struct work *work, int idx) cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0); - for (i = 0; i < cur_nr; i++) { - if (cur_entry[i].id == sys->this_node.id) { - my_idx = i; - break; - } + for (i = 0; i < rw->nr_failed_vdis; i++) { + if (rw->failed_vdis[i] == oid_to_bit(oid)) + is_failed_oid = 1; } - copy_idx = node_distance(my_idx, cur_idx, cur_nr); - dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx); - ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, cur_idx, - copy_idx, epoch, epoch - 1, oid, buf, SD_DATA_OBJ_SIZE); - if (ret == 0) - goto out; + if (!is_failed_oid) { + for (i = 0; i < cur_nr; i++) { + if (cur_entry[i].id == sys->this_node.id) { + my_idx = i; + break; + } + } + copy_idx = node_distance(my_idx, cur_idx, cur_nr); + dprintf("%d, %d, 
%d, %d\n", my_idx, cur_idx, cur_nr, copy_idx); + + ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, + cur_idx, copy_idx, epoch, epoch - 1, oid, + buf, SD_DATA_OBJ_SIZE); + if (ret == 0) + goto out; + } for (i = 0; i < sys->nr_sobjs; i++) { - if (i == copy_idx) + if (!is_failed_oid && i == copy_idx) continue; ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, cur_idx, i, @@ -1243,6 +1255,7 @@ static void recover_one_done(struct work *work, int idx) recovering = 0; free(rw->buf); + free(rw->failed_vdis); free(rw); if (!list_empty(&recovery_work_list)) { @@ -1460,6 +1473,7 @@ static void __start_recovery_done(struct work *work, int idx) recovering = 0; free(rw->buf); + free(rw->failed_vdis); free(rw); if (!list_empty(&recovery_work_list)) { @@ -1473,7 +1487,7 @@ static void __start_recovery_done(struct work *work, int idx) } } -int start_recovery(uint32_t epoch) +int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int nr_failed_vdis) { struct recovery_work *rw; @@ -1485,6 +1499,16 @@ int start_recovery(uint32_t epoch) rw->epoch = epoch; rw->count = 0; + if (failed_vdis) { + rw->failed_vdis = malloc(nr_failed_vdis * sizeof(*failed_vdis)); + if (!rw->failed_vdis) { + eprintf("out of memory\n"); + goto fail; + } + memcpy(rw->failed_vdis, failed_vdis, + nr_failed_vdis * sizeof(*failed_vdis)); + } + rw->work.fn = __start_recovery; rw->work.done = __start_recovery_done; @@ -1496,6 +1520,11 @@ int start_recovery(uint32_t epoch) } return 0; +fail: + free(rw->buf); + free(rw->failed_vdis); + free(rw); + return -1; } static int init_path(char *d, int *new) -- 1.5.6.5 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
