If qemu dies in the middle of an object write operation, the replicas of
that object can be left inconsistent with each other. This patch repairs
that inconsistency in the object recovery routines.

Signed-off-by: MORITA Kazutaka <[email protected]>
---
 collie/collie.h |    2 +-
 collie/group.c  |   30 ++++++++++++++++++++++++++++--
 collie/store.c  |   55 ++++++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index 826ac3a..734b1a4 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -130,7 +130,7 @@ int remove_epoch(int epoch);
 int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 
-int start_recovery(uint32_t epoch);
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int 
nr_failed_vdis);
 int start_deletion(uint64_t oid);
 
 static inline int is_myself(struct sheepdog_node_list_entry *e)
diff --git a/collie/group.c b/collie/group.c
index 1f5544a..d8ebad9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -90,6 +90,9 @@ struct work_confch {
        struct cpg_address *joined_list;
        size_t joined_list_entries;
 
+       unsigned long *failed_vdis;
+       int nr_failed_vdis;
+
        struct work work;
 };
 
@@ -881,7 +884,7 @@ static void __sd_deliver_done(struct work *work, int idx)
         */
 
        if (m->state == DM_FIN && m->op == SD_MSG_JOIN && sys->epoch >= 2)
-               start_recovery(sys->epoch);
+               start_recovery(sys->epoch, NULL, 0);
 
        free(w->msg);
        free(w);
@@ -1051,7 +1054,12 @@ static void __sd_confch(struct work *work, int idx)
                        int nr;
                        struct sheepdog_node_list_entry e[SD_MAX_NODES];
                        struct vm *vm, *n;
+                       int ret, size;
+                       uint64_t oid;
+                       void *buf;
 
+                       size = sizeof(*w->failed_vdis) * 64;
+                       w->failed_vdis = malloc(size);
                        list_for_each_entry_safe(vm, n, &sys->vm_list, list) {
                                if (memcmp(vm->ent.host_addr, node->ent.addr,
                                           sizeof(node->ent.addr)) != 0)
@@ -1059,6 +1067,23 @@ static void __sd_confch(struct work *work, int idx)
                                if (vm->ent.host_port != node->ent.port)
                                        continue;
 
+                               if (w->nr_failed_vdis * sizeof(*w->failed_vdis) 
>= size) {
+                                       size *= 2;
+                                       buf = realloc(w->failed_vdis, size);
+                                       if (!buf) {
+                                               eprintf("out of memory, %d\n", 
size);
+                                               break;
+                                       }
+                                       w->failed_vdis = buf;
+                               }
+
+                               ret = lookup_vdi((char *)vm->ent.name,
+                                                sizeof(vm->ent.name), &oid, 0);
+                               if (ret == SD_RES_SUCCESS)
+                                       w->failed_vdis[w->nr_failed_vdis++] = 
oid_to_bit(oid);
+                               else
+                                       eprintf("cannot find vdi %s\n", 
vm->ent.name);
+
                                list_del(&vm->list);
                                free(vm);
                        }
@@ -1144,12 +1169,13 @@ static void __sd_confch_done(struct work *work, int idx)
        if (w->left_list_entries) {
                if (w->left_list_entries > 1)
                        eprintf("we can't handle %Zd\n", w->left_list_entries);
-               start_recovery(sys->epoch);
+               start_recovery(sys->epoch, w->failed_vdis, w->nr_failed_vdis);
        }
 
        free(w->member_list);
        free(w->left_list);
        free(w->joined_list);
+       free(w->failed_vdis);
        free(w);
 }
 
diff --git a/collie/store.c b/collie/store.c
index 0fa711e..d89433e 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -959,6 +959,9 @@ struct recovery_work {
        struct work work;
        struct list_head rw_siblings;
 
+       unsigned long *failed_vdis;
+       int nr_failed_vdis;
+
        int count;
        char *buf;
 };
@@ -1153,6 +1156,7 @@ static void recover_one(struct work *work, int idx)
        int old_nr, cur_nr;
        uint32_t epoch = rw->epoch;
        int i, my_idx = -1, copy_idx, cur_idx = -1;
+       int is_failed_oid = 0;
 
        eprintf("%d %d, %16lx\n", rw->done, rw->count, oid);
 
@@ -1180,22 +1184,30 @@ static void recover_one(struct work *work, int idx)
 
        cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0);
 
-       for (i = 0; i < cur_nr; i++) {
-               if (cur_entry[i].id == sys->this_node.id) {
-                       my_idx = i;
-                       break;
-               }
+       for (i = 0; i < rw->nr_failed_vdis; i++) {
+               if (rw->failed_vdis[i] == oid_to_bit(oid))
+                       is_failed_oid = 1;
        }
-       copy_idx = node_distance(my_idx, cur_idx, cur_nr);
-       dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
 
-       ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr, cur_idx,
-                           copy_idx, epoch, epoch - 1, oid, buf, 
SD_DATA_OBJ_SIZE);
-       if (ret == 0)
-               goto out;
+       if (!is_failed_oid) {
+               for (i = 0; i < cur_nr; i++) {
+                       if (cur_entry[i].id == sys->this_node.id) {
+                               my_idx = i;
+                               break;
+                       }
+               }
+               copy_idx = node_distance(my_idx, cur_idx, cur_nr);
+               dprintf("%d, %d, %d, %d\n", my_idx, cur_idx, cur_nr, copy_idx);
+
+               ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
+                                   cur_idx, copy_idx, epoch, epoch - 1, oid,
+                                   buf, SD_DATA_OBJ_SIZE);
+               if (ret == 0)
+                       goto out;
+       }
 
        for (i = 0; i < sys->nr_sobjs; i++) {
-               if (i == copy_idx)
+               if (!is_failed_oid && i == copy_idx)
                        continue;
                ret = __recover_one(rw, old_entry, old_nr,
                                    cur_entry, cur_nr, cur_idx, i,
@@ -1243,6 +1255,7 @@ static void recover_one_done(struct work *work, int idx)
        recovering = 0;
 
        free(rw->buf);
+       free(rw->failed_vdis);
        free(rw);
 
        if (!list_empty(&recovery_work_list)) {
@@ -1460,6 +1473,7 @@ static void __start_recovery_done(struct work *work, int 
idx)
        recovering = 0;
 
        free(rw->buf);
+       free(rw->failed_vdis);
        free(rw);
 
        if (!list_empty(&recovery_work_list)) {
@@ -1473,7 +1487,7 @@ static void __start_recovery_done(struct work *work, int 
idx)
        }
 }
 
-int start_recovery(uint32_t epoch)
+int start_recovery(uint32_t epoch, unsigned long *failed_vdis, int 
nr_failed_vdis)
 {
        struct recovery_work *rw;
 
@@ -1485,6 +1499,16 @@ int start_recovery(uint32_t epoch)
        rw->epoch = epoch;
        rw->count = 0;
 
+       if (failed_vdis) {
+               rw->failed_vdis = malloc(nr_failed_vdis * sizeof(*failed_vdis));
+               if (!rw->failed_vdis) {
+                       eprintf("out of memory\n");
+                       goto fail;
+               }
+               memcpy(rw->failed_vdis, failed_vdis,
+                      nr_failed_vdis * sizeof(*failed_vdis));
+       }
+
        rw->work.fn = __start_recovery;
        rw->work.done = __start_recovery_done;
 
@@ -1496,6 +1520,11 @@ int start_recovery(uint32_t epoch)
        }
 
        return 0;
+fail:
+       free(rw->buf);
+       free(rw->failed_vdis);
+       free(rw);
+       return -1;
 }
 
 static int init_path(char *d, int *new)
-- 
1.5.6.5

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to