This patch introduces ledger objects, which keep track of the number of
outstanding references of each generation.  Sheep decrements a
generational reference count with the gateway request SD_OP_DECREF_OBJ,
and reclaims objects when no generational references remain.

Cc: MORITA Kazutaka <morita.kazut...@lab.ntt.co.jp>
Tested-by: Valerio Pachera <siri...@gmail.com>
Cc: Alessandro Bolgia <alessan...@extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp>
---
v5:
 - use sd_mutex

 include/internal_proto.h |   2 +
 include/sheepdog_proto.h |  28 ++++++++++++-
 sheep/gateway.c          |   6 +++
 sheep/ops.c              | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 sheep/sheep_priv.h       |   5 +++
 sheep/store.c            |  26 ++++++++++++
 sheep/vdi.c              |  19 +--------
 7 files changed, 172 insertions(+), 19 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 56801d2..4afc87e 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -102,6 +102,8 @@
 #define SD_OP_CLUSTER_INFO     0xBE
 #define SD_OP_ALTER_CLUSTER_COPY       0xBF
 #define SD_OP_ALTER_VDI_COPY   0xC0
+#define SD_OP_DECREF_OBJ     0xC1
+#define SD_OP_DECREF_PEER    0xC2
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 80277ad..94b3c39 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -98,6 +98,7 @@
 #define VMSTATE_BIT (UINT64_C(1) << 62)
 #define VDI_ATTR_BIT (UINT64_C(1) << 61)
 #define VDI_BTREE_BIT (UINT64_C(1) << 60)
+#define LEDGER_BIT (UINT64_C(1) << 59)
 #define OLD_MAX_DATA_OBJS (1ULL << 20)
 #define MAX_DATA_OBJS (1ULL << 32)
 #define MAX_CHILDREN 1024U
@@ -117,6 +118,7 @@
 #define SD_INODE_DATA_INDEX_SIZE (sizeof(uint32_t) * SD_INODE_DATA_INDEX)
 #define SD_INODE_HEADER_SIZE offsetof(struct sd_inode, data_vdi_id)
 #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
+#define SD_LEDGER_OBJ_SIZE (UINT64_C(1) << 22)
 #define CURRENT_VDI_ID 0
 
 #define STORE_LEN 16
@@ -170,6 +172,11 @@ struct sd_req {
                                                    /* others mean true */
                        uint8_t         copy_policy;
                } vdi_state;
+               struct {
+                       uint64_t        oid;
+                       uint32_t        generation;
+                       uint32_t        count;
+               } ref;
 
                uint32_t                __pad[8];
        };
@@ -410,10 +417,16 @@ static inline bool is_vdi_btree_obj(uint64_t oid)
        return !!(oid & VDI_BTREE_BIT);
 }
 
+static inline bool is_ledger_object(uint64_t oid)
+{
+       return !!(oid & LEDGER_BIT);
+}
+
 static inline bool is_data_obj(uint64_t oid)
 {
        return !is_vdi_obj(oid) && !is_vmstate_obj(oid) &&
-               !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid);
+               !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid) &&
+               !is_ledger_object(oid);
 }
 
 static inline size_t count_data_objs(const struct sd_inode *inode)
@@ -432,6 +445,9 @@ static inline size_t get_objsize(uint64_t oid)
        if (is_vdi_btree_obj(oid))
                return SD_INODE_DATA_INDEX_SIZE;
 
+       if (is_ledger_object(oid))
+               return SD_LEDGER_OBJ_SIZE;
+
        return SD_DATA_OBJ_SIZE;
 }
 
@@ -482,4 +498,14 @@ static inline __attribute__((used)) void __sd_proto_build_bug_ons(void)
        BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE);
 }
 
+static inline uint64_t ledger_oid_to_data_oid(uint64_t oid)
+{
+       return ~LEDGER_BIT & oid;
+}
+
+static inline uint64_t data_oid_to_ledger_oid(uint64_t oid)
+{
+       return LEDGER_BIT | oid;
+}
+
 #endif
diff --git a/sheep/gateway.c b/sheep/gateway.c
index aea4495..073e650 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -182,6 +182,7 @@ out:
 bool is_erasure_oid(uint64_t oid)
 {
        return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) &&
+               !is_ledger_object(oid) &&
                get_vdi_copy_policy(oid_to_vid(oid)) > 0;
 }
 
@@ -624,3 +625,8 @@ int gateway_remove_obj(struct request *req)
 {
        return gateway_forward_request(req);
 }
+
+int gateway_decref_object(struct request *req)
+{
+       return gateway_forward_request(req);
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index 61eb37f..f47aa46 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1092,6 +1092,98 @@ static inline int local_nfs_delete(struct request *req)
 
 #endif
 
+static bool is_zero_ledger(uint32_t *ledger)
+{
+       for (int i = 0; i < SD_LEDGER_OBJ_SIZE / sizeof(uint32_t); i++)
+               if (ledger[i])
+                       return false;
+
+       return true;
+}
+
+int peer_decref_object(struct request *req)
+{
+       struct sd_req *hdr = &req->rq;
+       int ret;
+       uint32_t epoch = hdr->epoch;
+       uint64_t ledger_oid = hdr->ref.oid;
+       uint64_t data_oid = ledger_oid_to_data_oid(ledger_oid);
+       uint32_t generation = hdr->ref.generation;
+       uint32_t count = hdr->ref.count;
+       uint32_t *ledger = NULL;
+       bool exist = false, locked = false;
+       static struct sd_mutex lock = SD_MUTEX_INITIALIZER;
+
+       sd_debug("%" PRIx64 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32,
+                ledger_oid, epoch, generation, count);
+
+       ledger = xvalloc(SD_LEDGER_OBJ_SIZE);
+       memset(ledger, 0, SD_LEDGER_OBJ_SIZE);
+
+       struct siocb iocb = {
+               .epoch = epoch,
+               .buf = ledger,
+               .length = SD_LEDGER_OBJ_SIZE,
+       };
+
+       /* we don't allow concurrent updates to the ledger objects */
+       sd_mutex_lock(&lock);
+       locked = true;
+
+       ret = sd_store->read(ledger_oid, &iocb);
+       switch (ret) {
+       case SD_RES_SUCCESS:
+               exist = true;
+               break;
+       case SD_RES_NO_OBJ:
+               /* initialize ledger */
+               ledger[0] = 1;
+               break;
+       default:
+               sd_err("failed to read ledger object %"PRIx64": %s",
+                      ledger_oid, sd_strerror(ret));
+               goto out;
+       }
+
+       ledger[generation]--;
+       ledger[generation + 1] += count;
+
+       if (is_zero_ledger(ledger)) {
+               /* reclaim object */
+               if (exist) {
+                       ret = sd_store->remove_object(ledger_oid, -1);
+                       if (ret != SD_RES_SUCCESS) {
+                               sd_err("error %s", sd_strerror(ret));
+                               goto out;
+                       }
+               }
+               sd_mutex_unlock(&lock);
+               locked = false;
+
+               ret = sd_remove_object(data_oid);
+               if (ret != SD_RES_SUCCESS) {
+                       sd_err("error %s", sd_strerror(ret));
+                       goto out;
+               }
+       } else {
+               /* update ledger */
+               if (exist)
+                       ret = sd_store->write(ledger_oid, &iocb);
+               else
+                       ret = sd_store->create_and_write(ledger_oid, &iocb);
+
+               if (ret != SD_RES_SUCCESS)
+                       sd_err("failed to update ledger object %"PRIx64": %s",
+                              ledger_oid, sd_strerror(ret));
+       }
+out:
+       if (locked)
+               sd_mutex_unlock(&lock);
+       free(ledger);
+
+       return ret;
+}
+
 static struct sd_op_template sd_ops[] = {
 
        /* cluster operations */
@@ -1456,6 +1548,12 @@ static struct sd_op_template sd_ops[] = {
                .process_work = gateway_remove_obj,
        },
 
+       [SD_OP_DECREF_OBJ] = {
+               .name = "DECREF_OBJ",
+               .type = SD_OP_TYPE_GATEWAY,
+               .process_work = gateway_decref_object,
+       },
+
        /* peer I/O operations */
        [SD_OP_CREATE_AND_WRITE_PEER] = {
                .name = "CREATE_AND_WRITE_PEER",
@@ -1480,6 +1578,12 @@ static struct sd_op_template sd_ops[] = {
                .type = SD_OP_TYPE_PEER,
                .process_work = peer_remove_obj,
        },
+
+       [SD_OP_DECREF_PEER] = {
+               .name = "DECREF_PEER",
+               .type = SD_OP_TYPE_PEER,
+               .process_work = peer_decref_object,
+       },
 };
 
 const struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1568,6 +1672,7 @@ static int map_table[] = {
        [SD_OP_READ_OBJ] = SD_OP_READ_PEER,
        [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
        [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+       [SD_OP_DECREF_OBJ] = SD_OP_DECREF_PEER,
 };
 
 int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 83537b2..eb3e7aa 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -250,6 +250,7 @@ struct store_driver {
 
 /* backend store */
 int peer_read_obj(struct request *req);
+int peer_decref_object(struct request *req);
 
 int default_init(void);
 bool default_exist(uint64_t oid, uint8_t ec_index);
@@ -408,6 +409,8 @@ int sd_read_object(uint64_t oid, char *data, unsigned int datalen,
                   uint64_t offset);
 int sd_remove_object(uint64_t oid);
 int sd_discard_object(uint64_t oid);
+int sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation,
+                        uint32_t refcnt);
 
 struct request_iocb *local_req_init(void);
 int exec_local_req(struct sd_req *rq, void *data);
@@ -463,6 +466,8 @@ int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
 int gateway_create_and_write_obj(struct request *req);
 int gateway_remove_obj(struct request *req);
+int gateway_decref_object(struct request *req);
+
 bool is_erasure_oid(uint64_t oid);
 uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid);
 
diff --git a/sheep/store.c b/sheep/store.c
index 2d5aa32..eee88c7 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -468,3 +468,29 @@ int sd_discard_object(uint64_t oid)
 
        return ret;
 }
+
+int sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation,
+                        uint32_t refcnt)
+{
+       struct sd_req hdr;
+       int ret;
+       uint64_t ledger_oid = data_oid_to_ledger_oid(data_oid);
+
+       sd_debug("%"PRIx64", %" PRId32 ", %" PRId32,
+                data_oid, generation, refcnt);
+
+       if (generation == 0 && refcnt == 0)
+               return sd_remove_object(data_oid);
+
+       sd_init_req(&hdr, SD_OP_DECREF_OBJ);
+       hdr.ref.oid = ledger_oid;
+       hdr.ref.generation = generation;
+       hdr.ref.count = refcnt;
+
+       ret = exec_local_req(&hdr, NULL);
+       if (ret != SD_RES_SUCCESS)
+               sd_err("failed to decrement reference %" PRIx64 ", %s",
+                      ledger_oid, sd_strerror(ret));
+
+       return ret;
+}
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 4a8c4f7..6b51672 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -390,7 +390,7 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
                 base_vid, iocb->nr_copies, new_snapid);
 
        ret = sd_read_object(vid_to_vdi_oid(base_vid), (char *)base,
-                            SD_INODE_HEADER_SIZE, 0);
+                            sizeof(*base), 0);
        if (ret != SD_RES_SUCCESS) {
                ret = SD_RES_BASE_VDI_READ;
                goto out;
@@ -408,23 +408,6 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
        base->snap_ctime = iocb->time;
        base->child_vdi_id[idx] = new_vid;
 
-       /* TODO: multiple sd_write_object should be performed atomically */
-
-       ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base,
-                             SD_INODE_HEADER_SIZE, 0, false);
-       if (ret != SD_RES_SUCCESS) {
-               sd_err("updating header of VDI %" PRIx32 "failed", base_vid);
-               ret = SD_RES_BASE_VDI_WRITE;
-               goto out;
-       }
-
-       ret = sd_read_object(vid_to_vdi_oid(base_vid), (char *)base,
-                            sizeof(*base), 0);
-       if (ret != SD_RES_SUCCESS) {
-               ret = SD_RES_BASE_VDI_READ;
-               goto out;
-       }
-
        for (int i = 0; i < ARRAY_SIZE(base->gref); i++) {
                if (!base->data_vdi_id[i])
                        continue;
-- 
1.9.1

-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to