On Fri, 26 Mar 2010 10:17:19 +0900
FUJITA Tomonori <[email protected]> wrote:

> We use the super object to manage the list of the existing VDIs.
> 
> Unlike data objects, the super object is a directory and the directory
> is replicated on multiple nodes. I concluded that the super object
> recovery code is too tricky and complicated.
> 
> So this patchset removes the super object. We manage the list of the
> existing VDIs like p2p applications. At startup, the nodes build the
> list of VDIs.
> 
> TODO: restart support

Done. The patch below adds restart support; it can be applied on top of the patchset.

=
From: FUJITA Tomonori <[email protected]>
Subject: [PATCH] add reboot support without the super object

Signed-off-by: FUJITA Tomonori <[email protected]>
---
 collie/group.c      |   64 +++++++++++++++++++++++++++++++++++++++++++++++----
 collie/net.c        |    6 ++++-
 collie/store.c      |   52 +++++++++++++++++++++++++++++++++++-----
 include/meta.h      |    5 ++++
 lib/net.c           |    5 +++-
 shepherd/shepherd.c |    2 +
 6 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/collie/group.c b/collie/group.c
index 836de83..2067870 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,9 +199,6 @@ void cluster_queue_request(struct work *work, int idx)
 
                rsp->result = SD_RES_SUCCESS;
                break;
-       case SD_OP_READ_VDIS:
-               rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
-               break;
        default:
                /* forward request to group */
                goto forward;
@@ -431,6 +428,58 @@ static void join(struct join_message *msg)
                msg->cluster_status = sys->status;
 }
 
+static void get_vdi_bitmap_from_all(void)
+{
+       struct sd_req hdr;
+       struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+       int i, j, ret, nr_nodes, fd;
+       /* fixme: we need this until starting up. */
+       static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS);
+       struct sheepdog_node_list_entry entry[SD_MAX_NODES];
+       unsigned int rlen, wlen;
+       char host[128];
+
+       /*
+        * we don't need the proper order but this is the simplest
+        * way.
+        */
+       nr_nodes = build_node_list(&sys->sd_node_list, entry);
+
+       for (i = 0; i < nr_nodes; i++) {
+               if (!memcmp(&sys->this_node, &entry[i], sizeof(sys->this_node)))
+                       continue;
+
+               addr_to_str(host, sizeof(host), entry[i].addr, 0);
+
+               fd = connect_to(host, entry[i].port);
+               if (fd < 0) {
+                       vprintf(SDOG_ERR "can't get the vdi bitmap %s, %m\n", host);
+               }
+
+               vprintf(SDOG_ERR "get the vdi bitmap %d %s\n", i, host);
+
+               memset(&hdr, 0, sizeof(hdr));
+               hdr.opcode = SD_OP_READ_VDIS;
+               hdr.epoch = sys->epoch;
+               hdr.data_length = sizeof(tmp_vdi_inuse);
+               rlen = hdr.data_length;
+               wlen = 0;
+
+               ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse,
+                              &wlen, &rlen);
+
+               close(fd);
+
+               if (ret || rsp->result != SD_RES_SUCCESS) {
+                       vprintf(SDOG_ERR "can't get the vdi bitmap %d %d\n", ret,
+                               rsp->result);
+               }
+
+               for (j = 0; j < ARRAY_SIZE(sys->vdi_inuse); j++)
+                       sys->vdi_inuse[j] |= tmp_vdi_inuse[j];
+       }
+}
+
 static void update_cluster_info(struct join_message *msg)
 {
        int i;
@@ -498,9 +547,14 @@ out:
        if (sys->status == SD_STATUS_STARTUP && msg->cluster_status == SD_STATUS_OK)
                sys->epoch = get_latest_epoch();
 
-       if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS)
-               sys->status = msg->cluster_status;
+       if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS) {
+               if (msg->cluster_status == SD_STATUS_OK) {
+                       get_vdi_bitmap_from_all();
+                       set_global_nr_copies(sys->nr_sobjs);
+               }
 
+               sys->status = msg->cluster_status;
+       }
        return;
 }
 
diff --git a/collie/net.c b/collie/net.c
index 137790c..09b2452 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -54,6 +54,7 @@ static void queue_request(struct request *req)
                case SD_OP_MAKE_FS:
                case SD_OP_GET_NODE_LIST:
                case SD_OP_READ_EPOCH:
+               case SD_OP_READ_VDIS:
                        break;
                default:
                        if (sys->status == SD_STATUS_STARTUP)
@@ -88,9 +89,12 @@ static void queue_request(struct request *req)
        case SD_OP_MAKE_FS:
        case SD_OP_SHUTDOWN:
        case SD_OP_STAT_CLUSTER:
-       case SD_OP_READ_VDIS:
                req->work.fn = cluster_queue_request;
                break;
+       case SD_OP_READ_VDIS:
+               rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+               req->done(req);
+               return;
        default:
                eprintf("unknown operation %d\n", hdr->opcode);
                rsp->result = SD_RES_SYSTEM_ERROR;
diff --git a/collie/store.c b/collie/store.c
index 429124c..5c870b6 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -439,9 +439,6 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep
                        goto out;
                }
 
-               if (!is_data_obj(oid))
-                       break;
-
                if (hdr->flags & SD_FLAG_CMD_COW) {
                        dprintf("%" PRIu64 "\n", hdr->cow_oid);
 
@@ -567,7 +564,7 @@ void store_queue_request(struct work *work, int idx)
        ret = store_queue_request_local(req, buf, epoch);
 out:
        if (ret != SD_RES_SUCCESS) {
-               dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %x\n",
+               dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %d\n",
                        idx, opcode, oid, epoch, req_epoch, ret);
                rsp->result = ret;
        }
@@ -1075,6 +1072,8 @@ static int init_path(char *d, int *new)
 {
        int ret, retry = 0;
        struct stat s;
+
+       *new = 0;
 again:
        ret = stat(d, &s);
        if (ret) {
@@ -1123,12 +1122,50 @@ static int init_obj_path(char *base_path)
 
 static int init_epoch_path(char *base_path)
 {
-       int new;
+       int new, ret;
+       uint32_t epoch;
+       DIR *dir;
+       char path[1024];
+       struct dirent *dent;
+       uint64_t oid;
 
        epoch_path = zalloc(strlen(base_path) + strlen(EPOCH_PATH) + 1);
        sprintf(epoch_path, "%s" EPOCH_PATH, base_path);
 
-       return init_path(epoch_path, &new);
+       ret = init_path(epoch_path, &new);
+       if (new || ret)
+               return ret;
+
+       epoch = get_latest_epoch();
+
+       snprintf(path, sizeof(path), "%s/%08u", obj_path, epoch);
+
+       vprintf(SDOG_INFO "found the epoch dir, %s\n", path);
+
+       dir = opendir(path);
+       if (!dir) {
+               vprintf(SDOG_ERR "failed to open the epoch dir, %m\n");
+               return SD_RES_EIO;
+       }
+
+       while ((dent = readdir(dir))) {
+               if (!strcmp(dent->d_name, ".") ||
+                   !strcmp(dent->d_name, ".."))
+                       continue;
+
+               oid = strtoull(dent->d_name, NULL, 16);
+
+               if (is_data_obj(oid))
+                       continue;
+
+               vprintf(SDOG_DEBUG "found the vdi obj, %" PRIx64 " %lu\n",
+                       oid, oid_to_bit(oid));
+
+               set_bit(oid_to_bit(oid), sys->vdi_inuse);
+       }
+       closedir(dir);
+
+       return 0;
 }
 
 static int init_mnt_path(char *base_path)
@@ -1254,6 +1291,7 @@ static int global_nr_copies(uint32_t *copies, int set)
                }
        } else {
                if (ret != sizeof(*copies)) {
+                       eprintf("use 'user_xattr' option?\n");
                        return SD_RES_SYSTEM_ERROR;
                }
        }
@@ -1268,5 +1306,5 @@ int set_global_nr_copies(uint32_t copies)
 
 int get_global_nr_copies(uint32_t *copies)
 {
-       return global_nr_copies(copies, 1);
+       return global_nr_copies(copies, 0);
 }
diff --git a/include/meta.h b/include/meta.h
index 99fc38a..338f660 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -70,4 +70,9 @@ static inline uint64_t bit_to_oid(unsigned long nr)
        return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
 }
 
+static inline unsigned long oid_to_bit(uint64_t oid)
+{
+       return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
+}
+
 #endif
diff --git a/lib/net.c b/lib/net.c
index c85ee2d..ff261e5 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -412,8 +412,11 @@ int read_object(struct sheepdog_node_list_entry *e,
                addr_to_str(name, sizeof(name), e[n].addr, 0);
 
                fd = connect_to(name, e[n].port);
-               if (fd < 0)
+               if (fd < 0) {
+                       printf("%s(%d): %s, %m\n", __func__, __LINE__,
+                              name);
                        return -1;
+               }
 
                memset(&hdr, 0, sizeof(hdr));
                hdr.epoch = node_version;
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 0d7cecb..55a4fe0 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -449,6 +449,8 @@ int parse_vdi(vdi_parser_func_t func, void *data)
 
                if (ret == sizeof(i))
                        func(i.oid, i.name, i.snap_id, 0, &i, data);
+               else
+                       printf("error %lu %" PRIx64 ", %d\n", nr, bit_to_oid(nr), ret);
 
        }
 
-- 
1.7.0

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to