This patch adds new data structres which represent VDI family tree. It
is different from the existing vdi_state_entry tree. The purpose of
these new data structres is efficient tracking of family relation
between VDIs. It is required for garbage collecting needless VIDs.

Cc: Saeki Masaki <[email protected]>
Cc: Yuka Kawasaki <[email protected]>
Signed-off-by: Hitoshi Mitake <[email protected]>
---
 sheep/group.c       |   7 +--
 sheep/plain_store.c |   2 +-
 sheep/sheep_priv.h  |   2 +
 sheep/vdi.c         | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 079d96d..9462aa4 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -509,9 +509,10 @@ retry:
                atomic_set_bit(vs[i].vid, sys->vdi_inuse);
                if (vs[i].deleted)
                        atomic_set_bit(vs[i].vid, sys->vdi_deleted);
-               add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot,
-                             vs[i].copy_policy, vs[i].block_size_shift,
-                             vs[i].parent_vid);
+               add_vdi_state_unordered(vs[i].vid, vs[i].nr_copies,
+                                       vs[i].snapshot, vs[i].copy_policy,
+                                       vs[i].block_size_shift,
+                                       vs[i].parent_vid);
        }
 out:
        free(vs);
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index a7b0864..92f9a14 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -281,7 +281,7 @@ static int init_vdi_state(uint64_t oid, const char *wd, 
uint32_t epoch)
                       "wat %s", oid, epoch, wd);
                goto out;
        }
-       add_vdi_state(oid_to_vid(oid), inode->nr_copies,
+       add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
                      vdi_is_snapshot(inode), inode->copy_policy,
                      inode->block_size_shift, inode->parent_vdi_id);
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a798c50..2b32b4e 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -335,6 +335,8 @@ int get_obj_copy_number(uint64_t oid, int nr_zones);
 int get_req_copy_number(struct request *req);
 int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
                  uint8_t, uint8_t block_size_shift, uint32_t parent_vid);
+int add_vdi_state_unordered(uint32_t vid, int nr_copies, bool snapshot,
+                 uint8_t, uint8_t block_size_shift, uint32_t parent_vid);
 int vdi_exist(uint32_t vid);
 int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid);
 int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 982f587..9bf6b23 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -30,11 +30,125 @@ struct vdi_state_entry {
        int nr_participants;
        enum shared_lock_state participants_state[SD_MAX_COPIES];
        struct node_id participants[SD_MAX_COPIES];
+
+       struct vdi_family_member *family_member;
 };
 
 static struct rb_root vdi_state_root = RB_ROOT;
 static struct sd_rw_lock vdi_state_lock = SD_RW_LOCK_INITIALIZER;
 
+struct vdi_family_member {
+       uint32_t vid, parent_vid;
+       struct vdi_family_member *parent;
+       struct vdi_state_entry *entry;
+
+       struct list_node roots_list; /* only used for root VDIs */
+
+       struct list_head child_list_head;
+       struct list_node child_list_node;
+};
+
+static LIST_HEAD(vdi_family_roots);
+static LIST_HEAD(vdi_family_temporal_orphans);
+static struct sd_mutex vdi_family_mutex = SD_MUTEX_INITIALIZER;
+
+static struct vdi_family_member *lookup_vdi_family_member(uint32_t vid,
+                                 struct vdi_family_member *family_member)
+{
+       /* FIXME: more effective data structure */
+       struct vdi_family_member *vdi;
+
+       if (family_member->vid == vid)
+               return family_member;
+
+       list_for_each_entry(vdi, &family_member->child_list_head,
+                           child_list_node) {
+               struct vdi_family_member *ret;
+
+               ret = lookup_vdi_family_member(vid, vdi);
+               if (ret)
+                       return ret;
+       }
+
+       return NULL;
+}
+
+static void update_vdi_family(uint32_t parent_vid,
+                             struct vdi_state_entry *entry, bool unordered)
+{
+       uint32_t vid = entry->vid;
+       struct vdi_family_member *new, *vdi, *parent;
+
+       sd_mutex_lock(&vdi_family_mutex);
+
+       if (!parent_vid) {
+               new = xzalloc(sizeof(*new));
+               new->vid = vid;
+               new->entry = entry;
+               entry->family_member = new;
+
+               INIT_LIST_NODE(&new->roots_list);
+               INIT_LIST_HEAD(&new->child_list_head);
+
+               list_add_tail(&new->roots_list, &vdi_family_roots);
+
+               sd_debug("new vid %"PRIx32 " is added as a root VDI", vid);
+               goto out;
+       }
+
+       new = xzalloc(sizeof(*new));
+       new->vid = vid;
+       new->parent_vid = parent_vid;
+       new->entry = entry;
+       entry->family_member = new;
+
+       INIT_LIST_HEAD(&new->child_list_head);
+       INIT_LIST_NODE(&new->child_list_node);
+
+       list_for_each_entry(vdi, &vdi_family_roots, roots_list) {
+               parent = lookup_vdi_family_member(parent_vid, vdi);
+               if (parent) {
+                       new->parent = parent;
+                       goto found;
+               }
+       }
+
+       if (unordered) {
+               sd_debug("parent of VID: %"PRIx32" (%"PRIx32") is not"
+                        " initialized yet", vid, parent_vid);
+
+               list_add_tail(&new->child_list_node,
+                             &vdi_family_temporal_orphans);
+               goto ret;
+       }
+
+       panic("parent VID: %"PRIx32" not found", parent_vid);
+
+found:
+       list_add_tail(&new->child_list_node, &parent->child_list_head);
+       sd_debug("new child vid %"PRIx32" is added to parent %"PRIx32
+                " correctly", vid, parent_vid);
+
+out:
+       /* correct children from orphan list */
+
+       list_for_each_entry(vdi, &vdi_family_temporal_orphans,
+                           child_list_node) {
+               if (vdi->parent_vid != vid)
+                       continue;
+
+               list_del(&vdi->child_list_node);
+               list_add_tail(&vdi->child_list_node, &new->child_list_head);
+               vdi->parent = new;
+
+               sd_debug("new vid %"PRIx32" rescued orphan vid %"PRIx32"",
+                        vid, vdi->vid);
+       }
+
+ret:
+       sd_mutex_unlock(&vdi_family_mutex);
+}
+
 /*
  * ec_max_data_strip represent max number of data strips in the cluster. When
  * nr_zones < it, we don't purge the stale objects because for erasure coding,
@@ -189,10 +303,12 @@ int get_req_copy_number(struct request *req)
        return nr_copies;
 }
 
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
-                 uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+static int do_add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
+                           uint8_t cp, uint8_t block_size_shift,
+                           uint32_t parent_vid, bool unordered)
 {
        struct vdi_state_entry *entry, *old;
+       bool already_exists = false;
 
        entry = xzalloc(sizeof(*entry));
        entry->vid = vid;
@@ -216,8 +332,8 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool 
snapshot,
                sd_mutex_unlock(&m);
        }
 
-       sd_debug("%" PRIx32 ", %d, %d, %"PRIu8,
-                vid, nr_copies, cp, block_size_shift);
+       sd_debug("%" PRIx32 ", %d, %d, %"PRIu8", %"PRIx32,
+                vid, nr_copies, cp, block_size_shift, parent_vid);
 
        sd_write_lock(&vdi_state_lock);
        old = vdi_state_insert(&vdi_state_root, entry);
@@ -238,13 +354,32 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool 
snapshot,
 
                        entry->parent_vid = parent_vid;
                }
+
+               already_exists = true;
        }
 
+       if (!already_exists)
+               update_vdi_family(parent_vid, entry, unordered);
+
        sd_rw_unlock(&vdi_state_lock);
 
        return SD_RES_SUCCESS;
 }
 
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
+                 uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+{
+       return do_add_vdi_state(vid, nr_copies, snapshot, cp, block_size_shift,
+                               parent_vid, false);
+}
+
+int add_vdi_state_unordered(uint32_t vid, int nr_copies, bool snapshot,
+                 uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+{
+       return do_add_vdi_state(vid, nr_copies, snapshot, cp, block_size_shift,
+                               parent_vid, true);
+}
+
 int fill_vdi_state_list(const struct sd_req *hdr,
                        struct sd_rsp *rsp, void *data)
 {
@@ -1389,7 +1524,8 @@ int vdi_create(const struct vdi_iocb *iocb, uint32_t 
*new_vid)
        if (info.snapid == 0)
                info.snapid = 1;
        *new_vid = info.free_bit;
-       ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+       ret = notify_vdi_add(*new_vid, iocb->nr_copies,
+                            iocb->base_vid == 0 ? info.vid : iocb->base_vid,
                             iocb->copy_policy, iocb->block_size_shift);
        if (ret != SD_RES_SUCCESS)
                return ret;
-- 
1.9.1

-- 
sheepdog mailing list
[email protected]
https://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to