At Sat, 19 Oct 2013 00:26:37 +0800, Liu Yuan wrote: > > When data copy is missing, this command will rebuild it. > > For the case that data get inconsistent but still accessable, we just print > inconsistent warning because it is kind of hard to handle it and leave it > as a future work > > Signed-off-by: Liu Yuan <[email protected]> > --- > dog/common.c | 9 +++ > dog/dog.h | 1 + > dog/vdi.c | 148 > ++++++++++++++++++++++++++++++++++++++-------- > tests/functional/050 | 2 +- > tests/functional/052 | 22 +++---- > tests/functional/052.out | 128 ++++++++++++++++++++++++++++----------- > tests/functional/055 | 8 +-- > tests/functional/055.out | 12 ++-- > tests/functional/056 | 12 ++-- > tests/functional/056.out | 10 ++-- > tests/functional/058 | 8 +-- > tests/functional/058.out | 3 + > 12 files changed, 265 insertions(+), 98 deletions(-) > > diff --git a/dog/common.c b/dog/common.c > index 028d367..5e7ce2e 100644 > --- a/dog/common.c > +++ b/dog/common.c > @@ -342,3 +342,12 @@ size_t get_store_objsize(uint8_t copy_policy, uint64_t > oid) > } > return get_objsize(oid); > } > + > +bool is_erasure_oid(uint64_t oid, uint8_t policy) > +{ > + if (is_vdi_obj(oid)) > + return false; > + if (policy == 0) > + return false; > + return true; > +} > diff --git a/dog/dog.h b/dog/dog.h > index cfc9d54..769fc6c 100644 > --- a/dog/dog.h > +++ b/dog/dog.h > @@ -83,6 +83,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size, > int do_vdi_check(const struct sd_inode *inode); > void show_progress(uint64_t done, uint64_t total, bool raw); > size_t get_store_objsize(uint8_t copy_policy, uint64_t oid); > +bool is_erasure_oid(uint64_t oid, uint8_t policy); > > extern struct command vdi_command; > extern struct command node_command; > diff --git a/dog/vdi.c b/dog/vdi.c > index 2639007..4362f13 100644 > --- a/dog/vdi.c > +++ b/dog/vdi.c > @@ -1397,7 +1397,7 @@ static void *read_object_from(const struct sd_vnode > *vnode, uint64_t oid) > } > > static void write_object_to(const struct 
sd_vnode *vnode, uint64_t oid, > - void *buf, bool create) > + void *buf, bool create, uint8_t ec_index) > { > struct sd_req hdr; > struct sd_rsp *rsp = (struct sd_rsp *)&hdr; > @@ -1411,6 +1411,7 @@ static void write_object_to(const struct sd_vnode > *vnode, uint64_t oid, > hdr.flags = SD_FLAG_CMD_WRITE; > hdr.data_length = get_objsize(oid); > hdr.obj.oid = oid; > + hdr.obj.ec_index = ec_index; > > ret = dog_exec_req(&vnode->node->nid, &hdr, buf); > if (ret < 0) > @@ -1427,13 +1428,16 @@ struct vdi_check_work { > struct vdi_check_info *info; > const struct sd_vnode *vnode; > uint8_t hash[SHA1_DIGEST_SIZE]; > + uint8_t ec_index; > + uint8_t *buf; > bool object_found; > struct work work; > }; > > struct vdi_check_info { > uint64_t oid; > - int nr_copies; > + uint8_t nr_copies; > + uint8_t copy_policy; > uint64_t total; > uint64_t *done; > int refcnt; > @@ -1459,7 +1463,7 @@ static void vdi_repair_work(struct work *work) > void *buf; > > buf = read_object_from(info->base->vnode, info->oid); > - write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found); > + write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0); > free(buf); > } > > @@ -1479,7 +1483,7 @@ static void vdi_repair_main(struct work *work) > free_vdi_check_info(info); > } > > -static void vdi_hash_check_work(struct work *work) > +static void vdi_check_object_work(struct work *work) > { > struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, > work); > @@ -1488,19 +1492,29 @@ static void vdi_hash_check_work(struct work *work) > struct sd_req hdr; > struct sd_rsp *rsp = (struct sd_rsp *)&hdr; > > - sd_init_req(&hdr, SD_OP_GET_HASH); > + if (is_erasure_oid(info->oid, info->copy_policy)) { > + sd_init_req(&hdr, SD_OP_READ_PEER); > + hdr.data_length = get_store_objsize(info->copy_policy, > + info->oid); > + hdr.obj.ec_index = vcw->ec_index; > + hdr.epoch = sd_epoch; > + vcw->buf = xmalloc(hdr.data_length); > + } else > + sd_init_req(&hdr, SD_OP_GET_HASH); > hdr.obj.oid = 
info->oid; > hdr.obj.tgt_epoch = sd_epoch; > > - ret = dog_exec_req(&vcw->vnode->node->nid, &hdr, NULL); > + ret = dog_exec_req(&vcw->vnode->node->nid, &hdr, vcw->buf); > if (ret < 0) > exit(EXIT_SYSFAIL); > > switch (rsp->result) { > case SD_RES_SUCCESS: > vcw->object_found = true; > - memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash)); > - uatomic_set(&info->base, vcw); > + if (!is_erasure_oid(info->oid, info->copy_policy)) { > + memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash)); > + uatomic_set(&info->base, vcw); > + } > break; > case SD_RES_NO_OBJ: > vcw->object_found = false; > @@ -1514,17 +1528,9 @@ static void vdi_hash_check_work(struct work *work) > } > } > > -static void vdi_hash_check_main(struct work *work) > +static void check_replicatoin_object(struct vdi_check_info *info) > { > - struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, > - work); > - struct vdi_check_info *info = vcw->info; > - > - info->refcnt--; > - if (info->refcnt > 0) > - return; > - > - if (info->base == NULL) { > + if (info->base == NULL) { > sd_err("no node has %" PRIx64, info->oid); > exit(EXIT_FAILURE); > } > @@ -1542,6 +1548,99 @@ static void vdi_hash_check_main(struct work *work) > queue_work(info->wq, &info->vcw[i].work); > } > } > +} > + > +static void check_erasure_object(struct vdi_check_info *info) > +{ > + int d = 0, p = 0, i, j, k; > + int dp = ec_policy_to_dp(info->copy_policy, &d, &p); > + struct fec *ctx = ec_init(d, dp); > + int miss_idx[dp], input_idx[dp]; > + size_t strip_size = SD_EC_DATA_STRIPE_SIZE / d; > + uint64_t oid = info->oid; > + size_t len = get_store_objsize(info->copy_policy, oid); > + char *obj = xmalloc(len); > + uint8_t *input[dp]; > + > + for (i = 0; i < dp; i++) > + miss_idx[i] = -1; > + > + for (i = 0, j = 0, k = 0; i < info->nr_copies; i++) > + if (!info->vcw[i].object_found) { > + miss_idx[j++] = i; > + } else { > + input_idx[k] = i; > + input[k] = info->vcw[i].buf; > + k++; > + } > + > + if (!j) { /* No object 
missing */ > + int idx[d]; > + > + for (i = 0; i < d; i++) > + idx[i] = i; > + > + for (k = 0; k < p; k++) { > + for (i = 0; i < SD_EC_NR_STRIPE_PER_OBJECT; i++) { > + const uint8_t *ds[d]; > + uint8_t out[strip_size]; > + > + for (j = 0; j < d; j++) > + ds[j] = info->vcw[j].buf + strip_size > + * i; > + ec_decode(ctx, ds, idx, out, d + k); > + memcpy(obj + strip_size * i, out, strip_size); > + } > + if (memcmp(obj, info->vcw[d + k].buf, len) != 0) { > + /* TODO repair the inconsistency */ > + sd_err("object %"PRIx64" is inconsistent", oid); > + goto out; > + } > + } > + } else if (j > p) { > + sd_err("failed to rebuild object %"PRIx64". %d copies get " > + "lost, more than %d", oid, j, p); > + goto out; > + } else { > + for (k = 0; k < j; k++) { > + int m = miss_idx[k], n; > + > + for (i = 0; i < SD_EC_NR_STRIPE_PER_OBJECT; i++) { > + const uint8_t *ds[d]; > + uint8_t out[strip_size]; > + > + for (n = 0; n < d; n++) > + ds[n] = input[n] + strip_size * i; > + ec_decode(ctx, ds, input_idx, out, m); > + memcpy(obj + strip_size * i, out, strip_size); > + } > + write_object_to(info->vcw[m].vnode, oid, obj, true, > + info->vcw[m].ec_index); > + fprintf(stdout, "fixed missing %"PRIx64", " > + "copy index %d\n", info->oid, m); > + } > + } > +out: > + for (i = 0; i < dp; i++) > + free(info->vcw[i].buf); > + free(obj); > + ec_destroy(ctx); > +} > + > +static void vdi_check_object_main(struct work *work) > +{ > + struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, > + work); > + struct vdi_check_info *info = vcw->info; > + > + info->refcnt--; > + if (info->refcnt > 0) > + return; > + > + if (is_erasure_oid(info->oid, info->copy_policy)) > + check_erasure_object(info); > + else > + check_replicatoin_object(info); > > if (info->refcnt == 0) > free_vdi_check_info(info); > @@ -1560,13 +1659,15 @@ static void queue_vdi_check_work(const struct > sd_inode *inode, uint64_t oid, > info->total = inode->vdi_size; > info->done = done; > info->wq = wq; > + 
info->copy_policy = inode->copy_policy; > > oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes); > for (int i = 0; i < nr_copies; i++) { > info->vcw[i].info = info; > + info->vcw[i].ec_index = i; > info->vcw[i].vnode = tgt_vnodes[i]; > - info->vcw[i].work.fn = vdi_hash_check_work; > - info->vcw[i].work.done = vdi_hash_check_main; > + info->vcw[i].work.fn = vdi_check_object_work; > + info->vcw[i].work.done = vdi_check_object_main; > info->refcnt++; > queue_work(info->wq, &info->vcw[i].work); > } > @@ -1586,6 +1687,8 @@ int do_vdi_check(const struct sd_inode *inode) > > wq = create_work_queue("vdi check", WQ_DYNAMIC); > > + init_fec(); > + > queue_vdi_check_work(inode, vid_to_vdi_oid(inode->vdi_id), NULL, wq); > > max_idx = count_data_objs(inode); > @@ -1622,11 +1725,6 @@ static int vdi_check(int argc, char **argv) > return ret; > } > > - if (inode->copy_policy > 0) { > - sd_err("not implemented for erasure coded vdi"); > - return EXIT_FAILURE; > - } > - > return do_vdi_check(inode); > } > > diff --git a/tests/functional/050 b/tests/functional/050 > index ab28f1f..c7cfbab 100755 > --- a/tests/functional/050 > +++ b/tests/functional/050 > @@ -22,7 +22,7 @@ _wait_for_sheep 6 > > _cluster_format -c 6 > > -$DOG vdi create test 100M > +_vdi_create test 100M > dd if=/dev/zero | $DOG vdi write test & > > # simulate IO NIC down of sheep 1 > diff --git a/tests/functional/052 b/tests/functional/052 > index bff9d52..2d556eb 100755 > --- a/tests/functional/052 > +++ b/tests/functional/052 > @@ -4,22 +4,22 @@ > > . ./common > > -for i in 0 1 2 3; do > +for i in `seq 0 5`; do > _start_sheep $i > done > -_wait_for_sheep 4 > -_cluster_format > -$DOG vdi create test 20M -P > +_wait_for_sheep 6 > +_cluster_format -c 6 > +_vdi_create test 20M -P
Rather than adding '-c 6' to _cluster_format, it would be cleaner to update _vdi_create() so that it creates 6 copies when -ec is not specified.

Thanks,

Kazutaka
-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog
