This introduces ledger objects, which keep track of the number of outstanding references for each generation. Sheep decrements a generational reference count with the gateway request SD_OP_DECREF_OBJ, and reclaims an object once no generational references to it remain.
Cc: MORITA Kazutaka <morita.kazut...@lab.ntt.co.jp> Tested-by: Valerio Pachera <siri...@gmail.com> Cc: Alessandro Bolgia <alessan...@extensys.it> Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> --- v5: - use sd_mutex include/internal_proto.h | 2 + include/sheepdog_proto.h | 28 ++++++++++++- sheep/gateway.c | 6 +++ sheep/ops.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++ sheep/sheep_priv.h | 5 +++ sheep/store.c | 26 ++++++++++++ sheep/vdi.c | 19 +-------- 7 files changed, 172 insertions(+), 19 deletions(-) diff --git a/include/internal_proto.h b/include/internal_proto.h index 56801d2..4afc87e 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -102,6 +102,8 @@ #define SD_OP_CLUSTER_INFO 0xBE #define SD_OP_ALTER_CLUSTER_COPY 0xBF #define SD_OP_ALTER_VDI_COPY 0xC0 +#define SD_OP_DECREF_OBJ 0xC1 +#define SD_OP_DECREF_PEER 0xC2 /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 80277ad..94b3c39 100644 --- a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -98,6 +98,7 @@ #define VMSTATE_BIT (UINT64_C(1) << 62) #define VDI_ATTR_BIT (UINT64_C(1) << 61) #define VDI_BTREE_BIT (UINT64_C(1) << 60) +#define LEDGER_BIT (UINT64_C(1) << 59) #define OLD_MAX_DATA_OBJS (1ULL << 20) #define MAX_DATA_OBJS (1ULL << 32) #define MAX_CHILDREN 1024U @@ -117,6 +118,7 @@ #define SD_INODE_DATA_INDEX_SIZE (sizeof(uint32_t) * SD_INODE_DATA_INDEX) #define SD_INODE_HEADER_SIZE offsetof(struct sd_inode, data_vdi_id) #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr)) +#define SD_LEDGER_OBJ_SIZE (UINT64_C(1) << 22) #define CURRENT_VDI_ID 0 #define STORE_LEN 16 @@ -170,6 +172,11 @@ struct sd_req { /* others mean true */ uint8_t copy_policy; } vdi_state; + struct { + uint64_t oid; + uint32_t generation; + uint32_t count; + } ref; uint32_t __pad[8]; }; @@ -410,10 +417,16 @@ static inline bool is_vdi_btree_obj(uint64_t oid) return 
!!(oid & VDI_BTREE_BIT); } +static inline bool is_ledger_object(uint64_t oid) +{ + return !!(oid & LEDGER_BIT); +} + static inline bool is_data_obj(uint64_t oid) { return !is_vdi_obj(oid) && !is_vmstate_obj(oid) && - !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid); + !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid) && + !is_ledger_object(oid); } static inline size_t count_data_objs(const struct sd_inode *inode) @@ -432,6 +445,9 @@ static inline size_t get_objsize(uint64_t oid) if (is_vdi_btree_obj(oid)) return SD_INODE_DATA_INDEX_SIZE; + if (is_ledger_object(oid)) + return SD_LEDGER_OBJ_SIZE; + return SD_DATA_OBJ_SIZE; } @@ -482,4 +498,14 @@ static inline __attribute__((used)) void __sd_proto_build_bug_ons(void) BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE); } +static inline uint64_t ledger_oid_to_data_oid(uint64_t oid) +{ + return ~LEDGER_BIT & oid; +} + +static inline uint64_t data_oid_to_ledger_oid(uint64_t oid) +{ + return LEDGER_BIT | oid; +} + #endif diff --git a/sheep/gateway.c b/sheep/gateway.c index aea4495..073e650 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -182,6 +182,7 @@ out: bool is_erasure_oid(uint64_t oid) { return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) && + !is_ledger_object(oid) && get_vdi_copy_policy(oid_to_vid(oid)) > 0; } @@ -624,3 +625,8 @@ int gateway_remove_obj(struct request *req) { return gateway_forward_request(req); } + +int gateway_decref_object(struct request *req) +{ + return gateway_forward_request(req); +} diff --git a/sheep/ops.c b/sheep/ops.c index 61eb37f..f47aa46 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -1092,6 +1092,98 @@ static inline int local_nfs_delete(struct request *req) #endif +static bool is_zero_ledger(uint32_t *ledger) +{ + for (int i = 0; i < SD_LEDGER_OBJ_SIZE / sizeof(uint32_t); i++) + if (ledger[i]) + return false; + + return true; +} + +int peer_decref_object(struct request *req) +{ + struct sd_req *hdr = &req->rq; + int ret; + uint32_t epoch = hdr->epoch; + uint64_t ledger_oid = 
hdr->ref.oid; + uint64_t data_oid = ledger_oid_to_data_oid(ledger_oid); + uint32_t generation = hdr->ref.generation; + uint32_t count = hdr->ref.count; + uint32_t *ledger = NULL; + bool exist = false, locked = false; + static struct sd_mutex lock = SD_MUTEX_INITIALIZER; + + sd_debug("%" PRIx64 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32, + ledger_oid, epoch, generation, count); + + ledger = xvalloc(SD_LEDGER_OBJ_SIZE); + memset(ledger, 0, SD_LEDGER_OBJ_SIZE); + + struct siocb iocb = { + .epoch = epoch, + .buf = ledger, + .length = SD_LEDGER_OBJ_SIZE, + }; + + /* we don't allow concurrent updates to the ledger objects */ + sd_mutex_lock(&lock); + locked = true; + + ret = sd_store->read(ledger_oid, &iocb); + switch (ret) { + case SD_RES_SUCCESS: + exist = true; + break; + case SD_RES_NO_OBJ: + /* initialize ledger */ + ledger[0] = 1; + break; + default: + sd_err("failed to read ledger object %"PRIx64": %s", + ledger_oid, sd_strerror(ret)); + goto out; + } + + ledger[generation]--; + ledger[generation + 1] += count; + + if (is_zero_ledger(ledger)) { + /* reclaim object */ + if (exist) { + ret = sd_store->remove_object(ledger_oid, -1); + if (ret != SD_RES_SUCCESS) { + sd_err("error %s", sd_strerror(ret)); + goto out; + } + } + sd_mutex_unlock(&lock); + locked = false; + + ret = sd_remove_object(data_oid); + if (ret != SD_RES_SUCCESS) { + sd_err("error %s", sd_strerror(ret)); + goto out; + } + } else { + /* update ledger */ + if (exist) + ret = sd_store->write(ledger_oid, &iocb); + else + ret = sd_store->create_and_write(ledger_oid, &iocb); + + if (ret != SD_RES_SUCCESS) + sd_err("failed to update ledger object %"PRIx64": %s", + ledger_oid, sd_strerror(ret)); + } +out: + if (locked) + sd_mutex_unlock(&lock); + free(ledger); + + return ret; +} + static struct sd_op_template sd_ops[] = { /* cluster operations */ @@ -1456,6 +1548,12 @@ static struct sd_op_template sd_ops[] = { .process_work = gateway_remove_obj, }, + [SD_OP_DECREF_OBJ] = { + .name = "DECREF_OBJ", + .type = 
SD_OP_TYPE_GATEWAY, + .process_work = gateway_decref_object, + }, + /* peer I/O operations */ [SD_OP_CREATE_AND_WRITE_PEER] = { .name = "CREATE_AND_WRITE_PEER", @@ -1480,6 +1578,12 @@ static struct sd_op_template sd_ops[] = { .type = SD_OP_TYPE_PEER, .process_work = peer_remove_obj, }, + + [SD_OP_DECREF_PEER] = { + .name = "DECREF_PEER", + .type = SD_OP_TYPE_PEER, + .process_work = peer_decref_object, + }, }; const struct sd_op_template *get_sd_op(uint8_t opcode) @@ -1568,6 +1672,7 @@ static int map_table[] = { [SD_OP_READ_OBJ] = SD_OP_READ_PEER, [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER, [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER, + [SD_OP_DECREF_OBJ] = SD_OP_DECREF_PEER, }; int gateway_to_peer_opcode(int opcode) diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 83537b2..eb3e7aa 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -250,6 +250,7 @@ struct store_driver { /* backend store */ int peer_read_obj(struct request *req); +int peer_decref_object(struct request *req); int default_init(void); bool default_exist(uint64_t oid, uint8_t ec_index); @@ -408,6 +409,8 @@ int sd_read_object(uint64_t oid, char *data, unsigned int datalen, uint64_t offset); int sd_remove_object(uint64_t oid); int sd_discard_object(uint64_t oid); +int sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation, + uint32_t refcnt); struct request_iocb *local_req_init(void); int exec_local_req(struct sd_req *rq, void *data); @@ -463,6 +466,8 @@ int gateway_read_obj(struct request *req); int gateway_write_obj(struct request *req); int gateway_create_and_write_obj(struct request *req); int gateway_remove_obj(struct request *req); +int gateway_decref_object(struct request *req); + bool is_erasure_oid(uint64_t oid); uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid); diff --git a/sheep/store.c b/sheep/store.c index 2d5aa32..eee88c7 100644 --- a/sheep/store.c +++ b/sheep/store.c @@ -468,3 +468,29 @@ int sd_discard_object(uint64_t oid) return ret; } + +int 
sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation, + uint32_t refcnt) +{ + struct sd_req hdr; + int ret; + uint64_t ledger_oid = data_oid_to_ledger_oid(data_oid); + + sd_debug("%"PRIx64", %" PRId32 ", %" PRId32, + data_oid, generation, refcnt); + + if (generation == 0 && refcnt == 0) + return sd_remove_object(data_oid); + + sd_init_req(&hdr, SD_OP_DECREF_OBJ); + hdr.ref.oid = ledger_oid; + hdr.ref.generation = generation; + hdr.ref.count = refcnt; + + ret = exec_local_req(&hdr, NULL); + if (ret != SD_RES_SUCCESS) + sd_err("failed to decrement reference %" PRIx64 ", %s", + ledger_oid, sd_strerror(ret)); + + return ret; +} diff --git a/sheep/vdi.c b/sheep/vdi.c index 4a8c4f7..6b51672 100644 --- a/sheep/vdi.c +++ b/sheep/vdi.c @@ -390,7 +390,7 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid, base_vid, iocb->nr_copies, new_snapid); ret = sd_read_object(vid_to_vdi_oid(base_vid), (char *)base, - SD_INODE_HEADER_SIZE, 0); + sizeof(*base), 0); if (ret != SD_RES_SUCCESS) { ret = SD_RES_BASE_VDI_READ; goto out; @@ -408,23 +408,6 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid, base->snap_ctime = iocb->time; base->child_vdi_id[idx] = new_vid; - /* TODO: multiple sd_write_object should be performed atomically */ - - ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base, - SD_INODE_HEADER_SIZE, 0, false); - if (ret != SD_RES_SUCCESS) { - sd_err("updating header of VDI %" PRIx32 "failed", base_vid); - ret = SD_RES_BASE_VDI_WRITE; - goto out; - } - - ret = sd_read_object(vid_to_vdi_oid(base_vid), (char *)base, - sizeof(*base), 0); - if (ret != SD_RES_SUCCESS) { - ret = SD_RES_BASE_VDI_READ; - goto out; - } - for (int i = 0; i < ARRAY_SIZE(base->gref); i++) { if (!base->data_vdi_id[i]) continue; -- 1.9.1 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog