Signed-off-by: FUJITA Tomonori <[email protected]>
---
 collie/collie.h          |   14 ++-
 collie/group.c           |   17 ++-
 collie/net.c             |    2 +-
 collie/vdi.c             |  278 ++++++++++++++++++++++++++++++++--------------
 include/bitops.h         |  132 ++++++++++++++++++++++
 include/meta.h           |   65 ++++++-----
 include/sheepdog_proto.h |    8 +-
 include/util.h           |    3 +-
 shepherd/shepherd.c      |   80 +++++++-------
 9 files changed, 428 insertions(+), 171 deletions(-)
 create mode 100644 include/bitops.h

diff --git a/collie/collie.h b/collie/collie.h
index 5cd2383..8829079 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -19,6 +19,7 @@
 #include "logger.h"
 #include "work.h"
 #include "net.h"
+#include "meta.h"
 
 #define SD_MSG_JOIN             0x01
 #define SD_MSG_VDI_OP           0x02
@@ -70,6 +71,8 @@ struct cluster_info {
        struct list_head vm_list;
        struct list_head pending_list;
 
+       DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+
        int nr_sobjs;
 };
 
@@ -79,12 +82,13 @@ int create_listen_port(int port, void *data);
 
 int init_store(char *dir);
 
-int add_vdi(char *buf, int len, uint64_t size,
-           uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
-           uint16_t flags);
+int add_vdi(char *data, int data_len, uint64_t size,
+           uint64_t *new_oid, uint64_t base_oid, uint32_t copies,
+           int is_snapshot);
+
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid);
 
-int lookup_vdi(char *filename, uint64_t * oid,
-              uint32_t tag, int do_lock, int *current);
+int read_vdis(char *data, int len, unsigned int *rsp_len);
 
 int make_super_object(struct sd_vdi_req *hdr);
 
diff --git a/collie/group.c b/collie/group.c
index a49c1be..4a2397b 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,6 +199,9 @@ void cluster_queue_request(struct work *work, int idx)
 
                rsp->result = SD_RES_SUCCESS;
                break;
+       case SD_OP_READ_VDIS:
+               rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+               break;
        default:
                /* forward request to group */
                goto forward;
@@ -506,21 +509,20 @@ static void vdi_op(struct vdi_op_message *msg)
        const struct sd_vdi_req *hdr = &msg->req;
        struct sd_vdi_rsp *rsp = &msg->rsp;
        void *data = msg->data;
-       int ret = SD_RES_SUCCESS, is_current;
+       int ret = SD_RES_SUCCESS;
        uint64_t oid = 0;
 
        switch (hdr->opcode) {
        case SD_OP_NEW_VDI:
                ret = add_vdi(data, hdr->data_length, hdr->vdi_size, &oid,
-                             hdr->base_oid, hdr->tag, hdr->copies, hdr->flags);
+                             hdr->base_oid, hdr->copies,
+                             hdr->snapid);
                break;
        case SD_OP_LOCK_VDI:
        case SD_OP_GET_VDI_INFO:
-               ret = lookup_vdi(data, &oid, hdr->tag, 1, &is_current);
+               ret = lookup_vdi(data, hdr->data_length, &oid, hdr->snapid);
                if (ret != SD_RES_SUCCESS)
                        break;
-               if (is_current)
-                       rsp->flags = SD_VDI_RSP_FLAG_CURRENT;
                break;
        case SD_OP_RELEASE_VDI:
                break;
@@ -556,7 +558,12 @@ static void vdi_op_done(struct vdi_op_message *msg)
 
        switch (hdr->opcode) {
        case SD_OP_NEW_VDI:
+       {
+               unsigned long nr = (rsp->oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
+               vprintf(SDOG_INFO "done %d %ld %" PRIx64 "\n", ret, nr, rsp->oid);
+               set_bit(nr, sys->vdi_inuse);
                break;
+       }
        case SD_OP_LOCK_VDI:
                if (lookup_vm(&sys->vm_list, (char *)data)) {
                        ret = SD_RES_VDI_LOCKED;
diff --git a/collie/net.c b/collie/net.c
index 04f9547..749c33d 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -89,10 +89,10 @@ static void queue_request(struct request *req)
        case SD_OP_MAKE_FS:
        case SD_OP_SHUTDOWN:
        case SD_OP_STAT_CLUSTER:
+       case SD_OP_READ_VDIS:
                req->work.fn = cluster_queue_request;
                break;
        case SD_OP_SO:
-       case SD_OP_SO_NEW_VDI:
        case SD_OP_SO_LOOKUP_VDI:
        case SD_OP_SO_READ_VDIS:
        case SD_OP_SO_STAT:
diff --git a/collie/vdi.c b/collie/vdi.c
index 5904488..25cc83c 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -16,17 +16,23 @@
 #include "meta.h"
 #include "collie.h"
 
+
 /* TODO: should be performed atomically */
-static int create_inode_obj(struct sheepdog_node_list_entry *entries,
-                           int nr_nodes, uint64_t epoch, int copies,
-                           uint64_t oid, uint64_t size, uint64_t base_oid)
+static int create_vdi_obj(char *name, uint64_t new_oid, uint64_t size,
+                         uint64_t base_oid, uint64_t cur_oid, uint32_t copies,
+                         uint32_t snapid, int is_snapshot)
 {
-       struct sheepdog_inode inode, base;
+       struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+       /* we are not called concurrently */
+       static struct sheepdog_inode new, base, cur;
        struct timeval tv;
-       int ret;
+       int ret, nr_nodes;
+       unsigned long block_size = SD_DATA_OBJ_SIZE;
+
+       nr_nodes = build_node_list(&sys->sd_node_list, entries);
 
        if (base_oid) {
-               ret = read_object(entries, nr_nodes, epoch,
+               ret = read_object(entries, nr_nodes, sys->epoch,
                                  base_oid, (char *)&base, sizeof(base), 0,
                                  copies);
                if (ret < 0)
@@ -35,26 +41,45 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
 
        gettimeofday(&tv, NULL);
 
-       memset(&inode, 0, sizeof(inode));
+       if (is_snapshot) {
+               if (cur_oid != base_oid) {
+                       vprintf(SDOG_INFO "tree snapshot %s %" PRIx64 " %" PRIx64 "\n",
+                               name, cur_oid, base_oid);
+
+                       ret = read_object(entries, nr_nodes, sys->epoch,
+                                         cur_oid, (char *)&cur, sizeof(cur), 0,
+                                         copies);
+                       if (ret < 0) {
+                               vprintf(SDOG_ERR "failed\n");
+                               return SD_RES_BASE_VDI_READ;
+                       }
+
+                       cur.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+               } else
+                       base.snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+       }
+
+       memset(&new, 0, sizeof(new));
 
-       inode.oid = oid;
-       inode.vdi_size = size;
-       inode.block_size = SD_DATA_OBJ_SIZE;
-       inode.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
-       inode.nr_copies = copies;
+       strncpy(new.name, name, sizeof(new.name));
+       new.oid = new_oid;
+       new.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000;
+       new.vdi_size = size;
+       new.copy_policy = 0;
+       new.nr_copies = copies;
+       new.block_size_shift = find_next_bit(&block_size, BITS_PER_LONG, 0);
+       new.snap_id = snapid;
 
        if (base_oid) {
                int i;
 
-               eprintf("%zd %zd\n", sizeof(inode.data_oid),
-                       ARRAY_SIZE(base.child_oid));
-               inode.parent_oid = base_oid;
-               memcpy(inode.data_oid, base.data_oid,
+               new.parent_oid = base_oid;
+               memcpy(new.data_oid, base.data_oid,
                       MAX_DATA_OBJS * sizeof(uint64_t));
 
                for (i = 0; i < ARRAY_SIZE(base.child_oid); i++) {
                        if (!base.child_oid[i]) {
-                               base.child_oid[i] = oid;
+                               base.child_oid[i] = new_oid;
                                break;
                        }
                }
@@ -62,120 +87,203 @@ static int create_inode_obj(struct sheepdog_node_list_entry *entries,
                if (i == ARRAY_SIZE(base.child_oid))
                        return SD_RES_NO_BASE_VDI;
 
+       }
+
+       if (is_snapshot && cur_oid != base_oid) {
+               ret = write_object(entries, nr_nodes, sys->epoch,
+                                  cur_oid, (char *)&cur, sizeof(cur), 0,
+                                  copies, 0);
+               if (ret < 0) {
+                       vprintf(SDOG_ERR "failed\n");
+                       return SD_RES_BASE_VDI_READ;
+               }
+       }
+
+       if (base_oid) {
                ret = write_object(entries, nr_nodes,
-                                  epoch, base_oid, (char *)&base,
+                                  sys->epoch, base_oid, (char *)&base,
                                   sizeof(base), 0, copies, 0);
-               if (ret < 0)
+               if (ret < 0) {
+                       vprintf(SDOG_ERR "failed\n");
                        return SD_RES_BASE_VDI_WRITE;
+               }
        }
 
-       ret = write_object(entries, nr_nodes, epoch,
-                          oid, (char *)&inode, sizeof(inode), 0, copies, 1);
+       ret = write_object(entries, nr_nodes, sys->epoch,
+                          new_oid, (char *)&new, sizeof(new), 0, copies, 1);
        if (ret < 0)
                return SD_RES_VDI_WRITE;
 
        return ret;
 }
 
-/*
- * TODO: handle larger buffer
- */
-int add_vdi(char *name, int len, uint64_t size,
-           uint64_t *added_oid, uint64_t base_oid, uint32_t tag, int copies,
-           uint16_t flags)
+static int find_first_vdi(unsigned long start, unsigned long end,
+                         char *name, int namelen, uint32_t snapid, uint64_t *oid,
+                         unsigned long *deleted_nr, uint32_t *next_snap) /* probe slots start..end (downward) for a vdi called name */
 {
        struct sheepdog_node_list_entry entries[SD_MAX_NODES];
+       static struct sheepdog_inode inode;
+       unsigned long i; /* slot currently being probed */
        int nr_nodes, nr_reqs;
-       uint64_t oid = 0;
        int ret;
-       struct sd_so_req req;
-       struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
-
-       memset(&req, 0, sizeof(req));
 
        nr_nodes = build_node_list(&sys->sd_node_list, entries);
 
-       dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
-               base_oid);
-
        nr_reqs = sys->nr_sobjs;
        if (nr_reqs > nr_nodes)
                nr_reqs = nr_nodes;
 
-       memset(&req, 0, sizeof(req));
-
-       eprintf("%d %d\n", copies, sys->nr_sobjs);
-       /* qemu doesn't specify the copies, then we use the default. */
-       if (!copies)
-               copies = sys->nr_sobjs;
-
-       req.opcode = SD_OP_SO_NEW_VDI;
-       req.copies = copies;
-       req.tag = tag;
-       req.flags |= flags;
+       for (i = start + 1; i-- > end; ) { /* i is unsigned: "i >= end" never fails when end == 0 */
+               ret = read_object(entries, nr_nodes, sys->epoch,
+                                 bit_to_oid(i), (char *)&inode, sizeof(inode), 0,
+                                 nr_reqs);
+               if (ret < 0)
+                       return SD_RES_EIO;
 
-       ret = exec_reqs(entries, nr_nodes, sys->epoch,
-                       SD_DIR_OID, (struct sd_req *)&req, name, len, 0,
-                       nr_reqs, nr_reqs);
+               if (inode.name[0] == '\0') {
+                       *deleted_nr = i;
+                       continue; /* deleted */
+               }
 
-       if (ret < 0)
-               return rsp->result;
+               if (!strncmp(inode.name, name, sizeof(inode.name))) { /* whole-name match; strlen(inode.name) only matched a prefix */
+                       if (snapid && snapid != inode.snap_id)
+                               continue;
 
-       oid = rsp->oid;
-       *added_oid = oid;
+                       *next_snap = inode.snap_id + 1;
+                       *oid = inode.oid;
+                       return SD_RES_SUCCESS;
+               }
+       }
+       return SD_RES_NO_VDI;
+}
 
-       dprintf("%s (%d) %" PRIu64 ", base: %" PRIu64 "\n", name, len, size,
-               oid);
 
-       ret = create_inode_obj(entries, nr_nodes, sys->epoch, copies,
-                              oid, size, base_oid);
+static int do_lookup_vdi(char *name, int namelen, uint64_t *oid, uint32_t snapid,
+                        uint32_t *next_snapid,
+                        unsigned long *right_nr,  unsigned long *deleted_nr)
+{
+       int ret;
+       unsigned long nr, start_nr;
+
+       start_nr = fnv_64a_buf(name, namelen, FNV1A_64_INIT) & (SD_NR_VDIS - 1);
+
+       vprintf(SDOG_INFO "looking for %s %d, %lx\n", name, namelen, start_nr);
+
+       /* bitmap search from the hash point */
+       nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, start_nr);
+       *right_nr = nr;
+       if (nr == start_nr) {
+               return SD_RES_NO_VDI;
+       } else if (nr < SD_NR_VDIS) {
+       right_side:
+               /* look up on the right side of the hash point */
+               ret = find_first_vdi(nr - 1, start_nr, name, namelen, snapid, oid,
+                                    deleted_nr, next_snapid);
+               return ret;
+       } else {
+               /* round up... bitmap search from the head of the bitmap */
+               nr = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, 0);
+               *right_nr = nr;
+               if (nr >= SD_NR_VDIS)
+                       return SD_RES_FULL_VDI;
+               else if (nr) {
+                       /* look up on the left side of the hash point */
+                       ret = find_first_vdi(nr - 1, 0, name, namelen, snapid, oid,
+                                            deleted_nr, next_snapid);
+                       if (ret == SD_RES_NO_VDI)
+                               ; /* we need to go to the right side */
+                       else
+                               return ret;
+               }
 
-       return ret;
+               nr = SD_NR_VDIS;
+               goto right_side;
+       }
 }
 
-int del_vdi(char *name, int len)
+int lookup_vdi(char *data, int data_len, uint64_t *oid, uint32_t snapid)
 {
-       return 0;
+       char *name = data;
+       uint32_t dummy0;
+       unsigned long dummy1, dummy2;
+
+       if (data_len != SD_MAX_VDI_LEN)
+               return SD_RES_INVALID_PARMS;
+
+       return do_lookup_vdi(name, strlen(name), oid, snapid,
+                            &dummy0, &dummy1, &dummy2);
 }
 
-int lookup_vdi(char *filename, uint64_t * oid, uint32_t tag, int do_lock,
-              int *current)
+int add_vdi(char *data, int data_len, uint64_t size,
+           uint64_t *new_oid, uint64_t base_oid, uint32_t copies, int is_snapshot)
 {
-       struct sheepdog_node_list_entry entries[SD_MAX_NODES];
-       int nr_nodes, nr_reqs;
+       uint64_t cur_oid;
+       uint32_t next_snapid;
+       unsigned long nr, deleted_nr = SD_NR_VDIS, right_nr = SD_NR_VDIS;
        int ret;
-       struct sd_so_req req;
-       struct sd_so_rsp *rsp = (struct sd_so_rsp *)&req;
+       char *name;
 
-       memset(&req, 0, sizeof(req));
+       if (data_len != SD_MAX_VDI_LEN)
+               return SD_RES_INVALID_PARMS;
 
-       nr_nodes = build_node_list(&sys->sd_node_list, entries);
+       name = data;
 
-       *current = 0;
+       ret = do_lookup_vdi(name, strlen(name), &cur_oid, 0, &next_snapid,
+                           &right_nr, &deleted_nr);
 
-       dprintf("looking for %s %zd\n", filename, strlen(filename));
+       if (is_snapshot) {
+               if (ret != SD_RES_SUCCESS) {
+                       if (ret == SD_RES_NO_VDI)
+                               vprintf(SDOG_CRIT "we dont's have %s\n", name);
+                       return ret;
+               }
+               nr = right_nr;
+       } else {
+               /* we already have the same vdi or met other errors. */
+               if (ret != SD_RES_NO_VDI) {
+                       if (ret == SD_RES_SUCCESS)
+                               ret = SD_RES_VDI_EXIST;
+                       return ret;
+               }
 
-       nr_reqs = sys->nr_sobjs;
-       if (nr_reqs > nr_nodes)
-               nr_reqs = nr_nodes;
+               if (deleted_nr == SD_NR_VDIS)
+                       nr = right_nr;
+               else
+                       nr = deleted_nr; /* we can recycle a deleted vdi */
 
-       memset(&req, 0, sizeof(req));
+               next_snapid = 1;
+       }
 
-       req.opcode = SD_OP_SO_LOOKUP_VDI;
-       req.tag = tag;
+       *new_oid = bit_to_oid(nr);
 
-       ret = exec_reqs(entries, nr_nodes, sys->epoch,
-                       SD_DIR_OID, (struct sd_req *)&req, filename, strlen(filename), 0,
-                       nr_reqs, 1);
+       vprintf(SDOG_INFO "we create a new vdi, %d %s (%zd) %" PRIu64 ", oid: %"
+               PRIx64 ", base %" PRIx64 ", cur %" PRIx64 " \n",
+               is_snapshot, name, strlen(name), size, *new_oid, base_oid, cur_oid);
 
-       *oid = rsp->oid;
-       if (rsp->flags & SD_VDI_RSP_FLAG_CURRENT)
-               *current = 1;
+       if (!copies) {
+               vprintf(SDOG_WARNING "qemu doesn't specify the copies... %d\n",
+                       sys->nr_sobjs);
+               copies = sys->nr_sobjs;
+       }
 
-       dprintf("looking for %s %lx\n", filename, *oid);
+       ret = create_vdi_obj(name, *new_oid, size, base_oid, cur_oid, copies,
+                            next_snapid, is_snapshot);
 
-       if (ret < 0)
-               return rsp->result;
+       return ret;
+}
+
+int del_vdi(char *name, int len)
+{
+       return 0;
+}
+
+int read_vdis(char *data, int len, unsigned int *rsp_len)
+{
+       if (len != sizeof(sys->vdi_inuse))
+               return SD_RES_INVALID_PARMS;
+
+       memcpy(data, sys->vdi_inuse, sizeof(sys->vdi_inuse));
+       *rsp_len = sizeof(sys->vdi_inuse);
 
        return SD_RES_SUCCESS;
 }
diff --git a/include/bitops.h b/include/bitops.h
new file mode 100644
index 0000000..e3191dd
--- /dev/null
+++ b/include/bitops.h
@@ -0,0 +1,132 @@
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_PER_BYTE          8
+#define BITS_TO_LONGS(nr)      DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define DECLARE_BITMAP(name,bits) \
+       unsigned long name[BITS_TO_LONGS(bits)]
+
+#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
+
+#define ffz(x)  __ffs(~(x))
+
+static inline unsigned long __ffs(unsigned long word)
+{
+       int num = 0;
+
+       if (BITS_PER_LONG == 64) {
+               if ((word & 0xffffffff) == 0) {
+                       num += 32;
+                       word >>= 32;
+               }
+       }
+
+       if ((word & 0xffff) == 0) {
+               num += 16;
+               word >>= 16;
+       }
+       if ((word & 0xff) == 0) {
+               num += 8;
+               word >>= 8;
+       }
+       if ((word & 0xf) == 0) {
+               num += 4;
+               word >>= 4;
+       }
+       if ((word & 0x3) == 0) {
+               num += 2;
+               word >>= 2;
+       }
+       if ((word & 0x1) == 0)
+               num += 1;
+       return num;
+}
+
+#define BITOP_WORD(nr)         ((nr) / BITS_PER_LONG)
+
+static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+                                unsigned long offset)
+{
+       const unsigned long *p = addr + BITOP_WORD(offset);
+       unsigned long result = offset & ~(BITS_PER_LONG-1);
+       unsigned long tmp;
+
+       if (offset >= size)
+               return size;
+       size -= result;
+       offset %= BITS_PER_LONG;
+       if (offset) {
+               tmp = *(p++);
+               tmp |= ~0UL >> (BITS_PER_LONG - offset);
+               if (size < BITS_PER_LONG)
+                       goto found_first;
+               if (~tmp)
+                       goto found_middle;
+               size -= BITS_PER_LONG;
+               result += BITS_PER_LONG;
+       }
+       while (size & ~(BITS_PER_LONG-1)) {
+               if (~(tmp = *(p++)))
+                       goto found_middle;
+               result += BITS_PER_LONG;
+               size -= BITS_PER_LONG;
+       }
+       if (!size)
+               return result;
+       tmp = *p;
+
+found_first:
+       tmp |= ~0UL << size;
+       if (tmp == ~0UL)        /* Are any bits zero? */
+               return result + size;   /* Nope. */
+found_middle:
+       return result + ffz(tmp);
+}
+
+static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       const unsigned long *p = addr + BITOP_WORD(offset);
+       unsigned long result = offset & ~(BITS_PER_LONG-1);
+       unsigned long tmp;
+
+       if (offset >= size)
+               return size;
+       size -= result;
+       offset %= BITS_PER_LONG;
+       if (offset) {
+               tmp = *(p++);
+               tmp &= (~0UL << offset);
+               if (size < BITS_PER_LONG)
+                       goto found_first;
+               if (tmp)
+                       goto found_middle;
+               size -= BITS_PER_LONG;
+               result += BITS_PER_LONG;
+       }
+       while (size & ~(BITS_PER_LONG-1)) {
+               if ((tmp = *(p++)))
+                       goto found_middle;
+               result += BITS_PER_LONG;
+               size -= BITS_PER_LONG;
+       }
+       if (!size)
+               return result;
+       tmp = *p;
+
+found_first:
+       tmp &= (~0UL >> (BITS_PER_LONG - size));
+       if (tmp == 0UL)         /* Are any bits set? */
+               return result + size;   /* Nope. */
+found_middle:
+       return result + __ffs(tmp);
+}
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+       return ((1UL << (nr % BITS_PER_LONG)) &
+               (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
diff --git a/include/meta.h b/include/meta.h
index 67d2b11..5b296b2 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -21,33 +21,22 @@
 /*
  * Object ID rules
  *
- *  0 - 17 (18 bits): data object
- * 17 - 55 (37 bits): inode object
- * 56 - 63 ( 8 bits): PGID
- *
- * each VDI can use 2^18 data objects.
+ *  0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): vdi object space
+ * 56 - 62 ( 7 bits): reserved vdi object space
+ * 63 - 63 ( 1 bit ): set if vdi
  */
 
-#define DATA_SPACE_SHIFT 18
-
+#define VDI_SPACE   24
+#define VDI_SPACE_SHIFT   32
+#define VDI_BIT (UINT64_C(1) << 63)
 #define DEAFAULT_NR_COPIES 1
+#define SD_MAX_VDI_LEN 256
+#define MAX_DATA_OBJS (1ULL << 20)
+#define MAX_CHILDREN 1024
 
-static inline uint64_t oid_to_ino(uint64_t inode_oid)
-{
-       return (inode_oid >> DATA_SPACE_SHIFT) & ((UINT64_C(1) << 37) - 1);
-}
-
-static inline int is_data_obj_writeable(uint64_t inode_oid, uint64_t data_oid)
-{
-       return oid_to_ino(inode_oid) == oid_to_ino(data_oid);
-}
-
-static inline int is_data_obj(uint64_t oid)
-{
-       return oid & ((UINT64_C(1) << DATA_SPACE_SHIFT) - 1);
-}
-
-#define SHEEPDOG_SUPER_OBJ_SIZE (UINT64_C(1) << 12)
+#define SD_NR_VDIS   (1U << 24)
 
 #define FLAG_CURRENT 1
 
@@ -63,19 +52,37 @@ struct sheepdog_vdi_info {
        char tag[SD_MAX_VDI_LEN];
 };
 
-#define MAX_DATA_OBJS (1 << 18)
-#define MAX_CHILDREN 1024
-
 struct sheepdog_inode {
+       char name[SD_MAX_VDI_LEN];
        uint64_t oid;
        uint64_t ctime;
+       uint64_t snap_ctime;
        uint64_t vdi_size;
-       uint64_t block_size;
-       uint32_t copy_policy;
-       uint32_t nr_copies;
+       uint16_t copy_policy;
+       uint8_t  nr_copies;
+       uint8_t  block_size_shift;
+       uint32_t snap_id;
        uint64_t parent_oid;
        uint64_t child_oid[MAX_CHILDREN];
        uint64_t data_oid[MAX_DATA_OBJS];
 };
 
+static inline int is_data_obj_writeable(struct sheepdog_inode *inode, int idx)
+{
+       return (inode->oid >> VDI_SPACE_SHIFT) ==
+               (inode->data_oid[idx] >> VDI_SPACE_SHIFT);
+}
+
+static inline int is_data_obj(uint64_t oid)
+{
+       return !(VDI_BIT & oid);
+}
+
+#define NR_VDIS (1U << VDI_SPACE) /* one slot per value of the VDI_SPACE-bit vdi object space */
+
+static inline uint64_t bit_to_oid(unsigned long nr)
+{
+       return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
+}
+
 #endif
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9863aa3..b6afbe1 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -20,8 +20,6 @@
 #define SD_MAX_NODES 1024
 #define SD_MAX_VMS   4096
 
-#define SD_MAX_VDI_LEN 256
-
 /* -> vmon */
 
 #define SD_OP_NEW_VDI        0x11
@@ -36,6 +34,7 @@
 #define SD_OP_GET_EPOCH      0x23
 #define SD_OP_SHUTDOWN       0x24
 #define SD_OP_READ_EPOCH     0x25
+#define SD_OP_READ_VDIS      0x26
 
 #define SD_OP_DEBUG_INC_NVER 0xA0
 #define SD_OP_DEBUG_SET_NODE 0xA1
@@ -96,6 +95,7 @@
 #define SD_RES_SHUTDOWN      0x18 /* Sheepdog is shutting down */
 #define SD_RES_NO_MEM        0x19 /* Cannot allocate memory */
 #define SD_RES_INCONSISTENT_EPOCHS  0x1A /* There is inconsistency between epochs */
+#define SD_RES_FULL_VDI      0x1B /* we already have the maximum vdis */
 
 #define SD_VDI_RSP_FLAG_CURRENT 0x01
 
@@ -206,10 +206,10 @@ struct sd_vdi_req {
        uint32_t        id;
        uint32_t        data_length;
        uint64_t        base_oid;
-       uint64_t        tag;
        uint64_t        vdi_size;
        uint32_t        copies;
-       uint32_t        pad[1];
+       uint32_t        snapid;
+       uint32_t        pad[2];
 };
 
 struct sd_vdi_rsp {
diff --git a/include/util.h b/include/util.h
index 4c10670..b107e30 100644
--- a/include/util.h
+++ b/include/util.h
@@ -3,9 +3,10 @@
 
 #include <string.h>
 
+#include "bitops.h"
+
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 #define __cpu_to_be16(x) bswap_16(x)
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 5d89710..0d7cecb 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -100,6 +100,11 @@ static struct sheepdog_node_list_entry *node_list_entries;
 static int nr_nodes;
 static unsigned master_idx;
 
+static int is_current(struct sheepdog_inode *i)
+{
+       return !i->snap_ctime;
+}
+
 static char *size_to_str(uint64_t size, char *str, int str_size)
 {
        char *units[] = {"MB", "GB", "TB", "PB", "EB", "ZB", "YB"};
@@ -407,53 +412,46 @@ static int shutdown_sheepdog(void)
 typedef void (*vdi_parser_func_t)(uint64_t oid, char *name, uint32_t tag, uint32_t flags,
                                  struct sheepdog_inode *i, void *data);
 
-/*
- * TODO: handle larger buffer
- */
+
+
 int parse_vdi(vdi_parser_func_t func, void *data)
 {
-       struct sheepdog_vdi_info *ent;
-       char *buf;
-       int rest, ret;
-       struct sheepdog_inode i;
-       struct sd_so_req req;
+       int ret, fd;
+       unsigned long nr;
+       static struct sheepdog_inode i;
+       struct sd_req req;
+       static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+       unsigned int rlen, wlen = 0;
 
-       memset(&req, 0, sizeof(req));
+       fd = connect_to("localhost", sdport);
+       if (fd < 0)
+               return fd;
 
-       buf = zalloc(DIR_BUF_LEN);
-       if (!buf)
-               return 1;
+       memset(&req, 0, sizeof(req));
 
-       req.opcode = SD_OP_SO_READ_VDIS;
+       req.opcode = SD_OP_READ_VDIS;
+       req.data_length = sizeof(vdi_inuse);
+       req.epoch = node_list_version;
 
-       ret = exec_reqs(node_list_entries, nr_nodes, node_list_version,
-                       SD_DIR_OID, (struct sd_req *)&req, buf, 0, DIR_BUF_LEN,
-                       nr_nodes, 1);
+       rlen = sizeof(vdi_inuse);
+       ret = exec_req(fd, &req, vdi_inuse, &wlen, &rlen);
+       close(fd);
 
-       if (ret < 0) {
-               ret = 1;
-               goto out;
-       }
+       if (ret < 0)
+               return ret;
 
-       ent = (struct sheepdog_vdi_info *)buf;
-       rest = ret;
-       while (rest > 0) {
-               if (!ent->name_len)
-                       break;
+       for (nr = 0; nr < SD_NR_VDIS; nr++) {
+               if (!test_bit(nr, vdi_inuse))
+                       continue;
 
                ret = read_object(node_list_entries, nr_nodes, node_list_version,
-                                 ent->oid, (void *)&i, sizeof(i), 0, nr_nodes);
+                                 bit_to_oid(nr), (void *)&i, sizeof(i), 0, nr_nodes);
 
                if (ret == sizeof(i))
-                       func(ent->oid, ent->name, ent->id, ent->flags, &i, data);
+                       func(i.oid, i.name, i.snap_id, 0, &i, data);
 
-               ent++;
-               rest -= sizeof(*ent);
        }
 
-out:
-       free(buf);
-
        return 0;
 }
 
@@ -499,7 +497,7 @@ static void print_graph_tree(uint64_t oid, char *name, uint32_t tag,
               "time: %8s",
               name, tag, size_str, date, time);
 
-       if (info->highlight && (flags & FLAG_CURRENT))
+       if (info->highlight && is_current(i))
                printf("\", color=\"red\"];\n");
        else
                printf("\"];\n");
@@ -548,9 +546,9 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
        if (info->name && strcmp(name, info->name))
                return;
 
-       if (flags & FLAG_CURRENT) {
+       if (is_current(i))
                strcpy(buf, "(You Are Here)");
-       } else {
+       else {
                ti = i->ctime >> 32;
                localtime_r(&ti, &tm);
 
@@ -559,7 +557,7 @@ static void print_vdi_tree(uint64_t oid, char *name, uint32_t tag,
        }
 
        add_vdi_tree(name, buf, oid, i->parent_oid,
-                info->highlight && (flags & FLAG_CURRENT));
+                    info->highlight && is_current(i));
 }
 
 static int treeview_vdi(char *vdiname, int highlight)
@@ -599,7 +597,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
        for (idx = 0; idx < MAX_DATA_OBJS; idx++) {
                if (!i->data_oid[idx])
                        continue;
-               if (is_data_obj_writeable(i->data_oid[idx], oid))
+               if (is_data_obj_writeable(i, idx))
                        my_objs++;
                else
                        cow_objs++;
@@ -611,7 +609,7 @@ static void print_vdi_list(uint64_t oid, char *name, uint32_t tag,
 
        if (!data || strcmp(name, data) == 0) {
                printf("%c %-8s %5d %7s %7s %7s %s  %9" PRIx64 "\n",
-                      flags & FLAG_CURRENT ? ' ' : 's', name, tag,
+                      is_current(i) ? ' ' : 's', name, tag,
                       vdi_size_str, my_objs_str, cow_objs_str, dbuf, oid);
        }
 }
@@ -630,7 +628,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
        struct vm_list_info *vli = (struct vm_list_info *)data;
        char vdi_size_str[8], my_objs_str[8], cow_objs_str[8];
 
-       if (!(flags & FLAG_CURRENT))
+       if (!is_current(inode))
                return;
 
        for (i = 0; i < vli->nr_vms; i++) {
@@ -643,7 +641,7 @@ static void print_vm_list(uint64_t oid, char *name, uint32_t tag,
        for (j = 0; j < MAX_DATA_OBJS; j++) {
                if (!inode->data_oid[j])
                        continue;
-               if (is_data_obj_writeable(inode->data_oid[j], oid))
+               if (is_data_obj_writeable(inode, j))
                        my_objs++;
                else
                        cow_objs++;
@@ -676,7 +674,7 @@ static void cal_total_vdi_size(uint64_t oid, char *name, uint32_t tag,
 {
        uint64_t *size = data;
 
-       if (flags & FLAG_CURRENT)
+       if (is_current(i))
                *size += i->vdi_size;
 }
 
-- 
1.7.0

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to