[sheepdog] [PATCH 2/2] optimize epoch_log structure to reduce network and memory overhead

Ruoyu Mon, 07 Jul 2014 05:44:28 -0700

The current epoch_log structure contains a fixed-size nodes array
(SD_MAX_NODES entries) that is used to sync nodes and epochs in
the cluster. This is simple, but it is a potential performance
issue because each epoch log occupies nearly 500 KBytes. If the
cluster membership changes frequently, the epoch is incremented
frequently as well, so unless the epoch log is made smaller the
performance will go from bad to worse.


Although the maximum number of nodes is 6144, only a few of them
are used in practice. Therefore, the first solution is to use a
zero-length array and let the client (dog) and the server (sheep)
negotiate an appropriate number of supported nodes: dog starts
with room for 32 nodes per epoch log and doubles it each time
sheep replies with SD_RES_BUFFER_SMALL. This uses much less
memory and runs much faster than before.

Signed-off-by: Ruoyu <lian...@ucweb.com>
---
 dog/alter.c              |  3 +++
 dog/cluster.c            | 40 +++++++++++++++++++++++++++++-----------
 dog/dog.c                |  1 +
 dog/vdi.c                | 37 ++++++++++++++++++++++++++++---------
 include/internal_proto.h |  2 +-
 include/sheepdog_proto.h |  3 +++
 sheep/group.c            |  8 +++++++-
 sheep/ops.c              | 47 +++++++++++++++++++++++++++++++----------------
 sheep/store.c            |  4 ++--
 9 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/dog/alter.c b/dog/alter.c
index 7af7f9f..9801cc2 100644
--- a/dog/alter.c
+++ b/dog/alter.c
@@ -63,8 +63,11 @@ static int alter_cluster_copy(int argc, char **argv)
 
        log_length = sd_epoch * sizeof(struct epoch_log);
        logs = xmalloc(log_length);
+
        sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
        hdr.data_length = log_length;
+       hdr.epoch_log.support_nodes = 0;
+
        ret = dog_exec_req(&sd_nid, &hdr, logs);
        if (ret < 0)
                goto failure;
diff --git a/dog/cluster.c b/dog/cluster.c
index 69ec07c..4731767 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
        return EXIT_SUCCESS;
 }
 
-static void print_nodes(const struct epoch_log *logs, int epoch)
+static void print_nodes(const struct epoch_log *logs, uint16_t flags)
 {
        int i, nr_disk;
        const struct sd_node *entry;
 
-       for (i = 0; i < logs[epoch].nr_nodes; i++) {
-               entry = logs[epoch].nodes + i;
-               if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
+       for (i = 0; i < logs->nr_nodes; i++) {
+               entry = logs->nodes + i;
+               if (flags & SD_CLUSTER_FLAG_DISKMODE) {
                        for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
                                if (entry->disks[nr_disk].disk_id == 0)
                                        break;
@@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
        int i, ret;
        struct sd_req hdr;
        struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-       struct epoch_log *logs;
+       struct epoch_log *logs, *log;
+       char *next_log;
        int nr_logs, log_length;
        time_t ti, ct;
        struct tm tm;
        char time_str[128];
+       uint16_t support_nodes;
 
-       log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+       support_nodes = DEFAULT_SUPPORT_NODES;
+       log_length = sd_epoch * (sizeof(struct epoch_log)
+                       + support_nodes * sizeof(struct sd_node));
        logs = xmalloc(log_length);
 
+retry:
        sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
        hdr.data_length = log_length;
+       hdr.epoch_log.support_nodes = support_nodes;
 
        ret = dog_exec_req(&sd_nid, &hdr, logs);
        if (ret < 0)
                goto error;
+       if (rsp->result == SD_RES_BUFFER_SMALL) {
+               support_nodes *= 2;
+               log_length = sd_epoch * (sizeof(struct epoch_log)
+                               + support_nodes * sizeof(struct sd_node));
+               logs = xrealloc(logs, log_length);
+               goto retry;
+       }
 
        /* show cluster status */
        if (!raw_output)
@@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
                printf("Epoch Time           Version\n");
        }
 
-       nr_logs = rsp->data_length / sizeof(struct epoch_log);
+       nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+                       + support_nodes * sizeof(struct sd_node));
+       next_log = (char *)logs;
        for (i = 0; i < nr_logs; i++) {
-
-               ti = logs[i].time;
+               log = (struct epoch_log *)next_log;
+               ti = log->time;
                if (raw_output) {
                        snprintf(time_str, sizeof(time_str), "%" PRIu64, 
(uint64_t) ti);
                } else {
@@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
                        strftime(time_str, sizeof(time_str), "%Y-%m-%d 
%H:%M:%S", &tm);
                }
 
-               printf(raw_output ? "%s %d" : "%s %6d", time_str, 
logs[i].epoch);
+               printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
                printf(" [");
-               print_nodes(logs, i);
+               print_nodes(log, logs->flags);
                printf("]\n");
+               next_log = (char *)log->nodes
+                               + support_nodes * sizeof(struct sd_node);
        }
 
        free(logs);
diff --git a/dog/dog.c b/dog/dog.c
index 46992ec..fda7906 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -119,6 +119,7 @@ int update_node_list(int max_nodes)
 
        sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
        hdr.data_length = log_length;
+       hdr.epoch_log.support_nodes = 0;
 
        ret = dog_exec_req(&sd_nid, &hdr, logs);
        if (ret < 0)
diff --git a/dog/vdi.c b/dog/vdi.c
index 49a2139..5fd0b7b 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -964,47 +964,64 @@ static int do_track_object(uint64_t oid, uint8_t 
nr_copies)
        struct sd_req hdr;
        struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
        const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
-       struct epoch_log *logs;
+       struct epoch_log *logs, *log;
+       char *next_log;
        int nr_logs, log_length;
+       uint16_t support_nodes;
 
-       log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+       support_nodes = DEFAULT_SUPPORT_NODES;
+       log_length = sd_epoch * (sizeof(struct epoch_log)
+                       + support_nodes * sizeof(struct sd_node));
        logs = xmalloc(log_length);
 
+retry:
        sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
        hdr.data_length = log_length;
+       hdr.epoch_log.support_nodes = support_nodes;
 
        ret = dog_exec_req(&sd_nid, &hdr, logs);
        if (ret < 0)
                goto error;
 
+       if (rsp->result == SD_RES_BUFFER_SMALL) {
+               support_nodes *= 2;
+               log_length = sd_epoch * (sizeof(struct epoch_log)
+                               + support_nodes * sizeof(struct sd_node));
+               logs = xrealloc(logs, log_length);
+               goto retry;
+       }
        if (rsp->result != SD_RES_SUCCESS) {
                printf("%s\n", sd_strerror(rsp->result));
                goto error;
        }
 
-       nr_logs = rsp->data_length / sizeof(struct epoch_log);
+       nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+                       + support_nodes * sizeof(struct sd_node));
+       next_log = (char *)logs;
        for (i = nr_logs - 1; i >= 0; i--) {
                struct rb_root vroot = RB_ROOT;
                struct rb_root nroot = RB_ROOT;
 
+               log = (struct epoch_log *)next_log;
                printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
-                      oid, logs[i].epoch, nr_copies);
+                      oid, log->epoch, nr_copies);
                printf("---------------------------------------------------\n");
 
                /*
                 * When # of nodes is less than nr_copies, we only print
                 * remaining nodes that holds all the remaining copies.
                 */
-               if (logs[i].nr_nodes < nr_copies) {
-                       for (j = 0; j < logs[i].nr_nodes; j++) {
-                               const struct node_id *n = &logs[i].nodes[j].nid;
+               if (log->nr_nodes < nr_copies) {
+                       for (j = 0; j < log->nr_nodes; j++) {
+                               const struct node_id *n = &log->nodes[j].nid;
 
                                printf("%s\n", addr_to_str(n->addr, n->port));
                        }
                        continue;
                }
-               for (int k = 0; k < logs[i].nr_nodes; k++)
-                       rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+               for (int k = 0; k < log->nr_nodes; k++)
+                       rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
                if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
                        disks_to_vnodes(&nroot, &vroot);
                else
@@ -1016,6 +1033,8 @@ static int do_track_object(uint64_t oid, uint8_t 
nr_copies)
                        printf("%s\n", addr_to_str(n->addr, n->port));
                }
                rb_destroy(&vroot, struct sd_vnode, rb);
+               next_log = (char *)log->nodes
+                               + support_nodes * sizeof(struct sd_node);
        }
 
        free(logs);
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 7ec2872..ad4d822 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -211,7 +211,7 @@ struct epoch_log {
        uint8_t  __pad[3];
        uint16_t flags;
        char drv_name[STORE_LEN];
-       struct sd_node nodes[SD_MAX_NODES];
+       struct sd_node nodes[0];
 };
 
 struct vdi_op_message {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 76fad51..1355ecb 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -180,6 +180,9 @@ struct sd_req {
                        uint8_t         addr[16];
                        uint16_t        port;
                } node_addr;
+               struct {
+                       uint16_t        support_nodes;
+               } epoch_log;
 
                uint32_t                __pad[8];
        };
diff --git a/sheep/group.c b/sheep/group.c
index adfd798..27e3574 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -352,7 +352,7 @@ error:
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
                          time_t *timestamp, struct vnode_info *vinfo)
 {
-       char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+       char *buf = xzalloc(len + sizeof(time_t));
        const struct sd_node *node;
        int ret;
 
@@ -369,6 +369,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node 
*nodes, int len,
                hdr.obj.tgt_epoch = epoch;
                hdr.epoch = sys_epoch();
                ret = sheep_exec_req(&node->nid, &hdr, buf);
+               if (ret == SD_RES_BUFFER_SMALL) {
+                       free(buf);
+                       return -2;
+               }
                if (ret != SD_RES_SUCCESS)
                        continue;
 
@@ -377,6 +381,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node 
*nodes, int len,
                if (timestamp)
                        memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
 
+               free(buf);
                return nodes_len / sizeof(struct sd_node);
        }
 
@@ -384,6 +389,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node 
*nodes, int len,
         * If no node has targeted epoch log, return 0 here to at least
         * allow reading older epoch logs.
         */
+       free(buf);
        return 0;
 }
 
diff --git a/sheep/ops.c b/sheep/ops.c
index fb26077..12957d2 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -438,15 +438,19 @@ static int local_stat_cluster(struct request *req)
 {
        struct sd_rsp *rsp = &req->rp;
        struct epoch_log *elog;
+       char *next_elog;
        int i, max_elogs;
        uint32_t epoch;
+       uint16_t support_nodes = req->rq.epoch_log.support_nodes;
 
        if (req->vinfo == NULL) {
                sd_debug("cluster is not started up");
                goto out;
        }
 
-       max_elogs = req->rq.data_length / sizeof(*elog);
+       max_elogs = req->rq.data_length / (sizeof(*elog)
+                       + support_nodes * sizeof(struct sd_node));
+       next_elog = (char *)req->data;
        epoch = get_latest_epoch();
        for (i = 0; i < max_elogs; i++) {
                int nr_nodes;
@@ -454,7 +458,7 @@ static int local_stat_cluster(struct request *req)
                if (epoch <= 0)
                        break;
 
-               elog = (struct epoch_log *)req->data + i;
+               elog = (struct epoch_log *)next_elog;
                memset(elog, 0, sizeof(*elog));
 
                /* some filed only need to store in first elog */
@@ -469,20 +473,29 @@ static int local_stat_cluster(struct request *req)
                }
 
                elog->epoch = epoch;
-               nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
-                                                        sizeof(elog->nodes),
-                                                        (time_t *)&elog->time);
-               if (nr_nodes == -1)
-                       nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
-                                                        sizeof(elog->nodes),
-                                                        (time_t *)&elog->time,
-                                                        req->vinfo);
-               assert(nr_nodes >= 0);
-               assert(nr_nodes <= SD_MAX_NODES);
-               elog->nr_nodes = nr_nodes;
-
-
-               rsp->data_length += sizeof(*elog);
+               if (support_nodes > 0) {
+                       nr_nodes = epoch_log_read_with_timestamp(
+                                       epoch, elog->nodes,
+                                       support_nodes * sizeof(struct sd_node),
+                                       (time_t *)&elog->time);
+                       if (nr_nodes == -1)
+                               nr_nodes = epoch_log_read_remote(
+                                       epoch, elog->nodes,
+                                       support_nodes * sizeof(struct sd_node),
+                                       (time_t *)&elog->time,
+                                       req->vinfo);
+                       if (nr_nodes == -2)
+                               return SD_RES_BUFFER_SMALL;
+                       assert(nr_nodes >= 0);
+                       assert(nr_nodes <= SD_MAX_NODES);
+                       elog->nr_nodes = nr_nodes;
+               } else
+                       elog->nr_nodes = 0;
+
+               next_elog = (char *)elog->nodes
+                               + support_nodes * sizeof(struct sd_node);
+               rsp->data_length += sizeof(*elog)
+                               + support_nodes * sizeof(struct sd_node);
                epoch--;
        }
 out:
@@ -520,6 +533,8 @@ static int local_get_epoch(struct request *req)
                                        &timestamp);
        if (nr_nodes == -1)
                return SD_RES_NO_TAG;
+       if (nr_nodes == -2)
+               return SD_RES_BUFFER_SMALL;
 
        nodes_len = nr_nodes * sizeof(struct sd_node);
        memcpy((void *)((char *)req->data + nodes_len), &timestamp,
diff --git a/sheep/store.c b/sheep/store.c
index eee88c7..70fddb8 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -63,8 +63,8 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node 
*nodes, int len,
        }
 
        if (len < epoch_stat.st_size - sizeof(*timestamp)) {
-               sd_err("invalid epoch %"PRIu32" log", epoch);
-               goto err;
+               close(fd);
+               return -2;
        }
 
        ret = xread(fd, nodes, epoch_stat.st_size - sizeof(*timestamp));
-- 
1.8.3.2


-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

[sheepdog] [PATCH 2/2] optimize epoch_log structure to reduce network and memory overhead

Reply via email to