The current epoch_log contains a long nodes array that is used to sync nodes and epochs in the cluster. This is simple, but there is a potential performance issue because each epoch log occupies nearly 500 KBytes. If the cluster membership changes frequently, the epoch is incremented frequently as well. If we don't find a way to address this, performance will go from bad to worse.
Although the maximum number of nodes is 6144, we only use a few of them. Therefore, the first solution is to use a zero-length array and have the client (dog) and the server (sheep) negotiate an appropriate number of supported nodes. This approach uses much less memory and runs much faster than before. Signed-off-by: Ruoyu <lian...@ucweb.com> --- dog/alter.c | 3 +++ dog/cluster.c | 40 +++++++++++++++++++++++++++++----------- dog/dog.c | 1 + dog/vdi.c | 37 ++++++++++++++++++++++++++++--------- include/internal_proto.h | 2 +- include/sheepdog_proto.h | 3 +++ sheep/group.c | 8 +++++++- sheep/ops.c | 47 +++++++++++++++++++++++++++++++---------------- sheep/store.c | 4 ++-- 9 files changed, 105 insertions(+), 40 deletions(-) diff --git a/dog/alter.c b/dog/alter.c index 7af7f9f..9801cc2 100644 --- a/dog/alter.c +++ b/dog/alter.c @@ -63,8 +63,11 @@ static int alter_cluster_copy(int argc, char **argv) log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); + sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; + hdr.epoch_log.support_nodes = 0; + ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) goto failure; diff --git a/dog/cluster.c b/dog/cluster.c index 69ec07c..4731767 100644 --- a/dog/cluster.c +++ b/dog/cluster.c @@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv) return EXIT_SUCCESS; } -static void print_nodes(const struct epoch_log *logs, int epoch) +static void print_nodes(const struct epoch_log *logs, uint16_t flags) { int i, nr_disk; const struct sd_node *entry; - for (i = 0; i < logs[epoch].nr_nodes; i++) { - entry = logs[epoch].nodes + i; - if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) { + for (i = 0; i < logs->nr_nodes; i++) { + entry = logs->nodes + i; + if (flags & SD_CLUSTER_FLAG_DISKMODE) { for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) { if (entry->disks[nr_disk].disk_id == 0) break; @@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv) int i, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct 
sd_rsp *)&hdr; - struct epoch_log *logs; + struct epoch_log *logs, *log; + char *next_log; int nr_logs, log_length; time_t ti, ct; struct tm tm; char time_str[128]; + uint16_t support_nodes; - log_length = sd_epoch * sizeof(struct epoch_log); +#define DEFAULT_SUPPORT_NODES 32 + support_nodes = DEFAULT_SUPPORT_NODES; + log_length = sd_epoch * (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); logs = xmalloc(log_length); +retry: sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; + hdr.epoch_log.support_nodes = support_nodes; ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) goto error; + if (rsp->result == SD_RES_BUFFER_SMALL) { + support_nodes *= 2; + log_length = sd_epoch * (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); + logs = xrealloc(logs, log_length); + goto retry; + } /* show cluster status */ if (!raw_output) @@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv) printf("Epoch Time Version\n"); } - nr_logs = rsp->data_length / sizeof(struct epoch_log); + nr_logs = rsp->data_length / (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); + next_log = (char *)logs; for (i = 0; i < nr_logs; i++) { - - ti = logs[i].time; + log = (struct epoch_log *)next_log; + ti = log->time; if (raw_output) { snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti); } else { @@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv) strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm); } - printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch); + printf(raw_output ? 
"%s %d" : "%s %6d", time_str, log->epoch); printf(" ["); - print_nodes(logs, i); + print_nodes(log, logs->flags); printf("]\n"); + next_log = (char *)log->nodes + + support_nodes * sizeof(struct sd_node); } free(logs); diff --git a/dog/dog.c b/dog/dog.c index 46992ec..fda7906 100644 --- a/dog/dog.c +++ b/dog/dog.c @@ -119,6 +119,7 @@ int update_node_list(int max_nodes) sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; + hdr.epoch_log.support_nodes = 0; ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) diff --git a/dog/vdi.c b/dog/vdi.c index 49a2139..5fd0b7b 100644 --- a/dog/vdi.c +++ b/dog/vdi.c @@ -964,47 +964,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; const struct sd_vnode *vnode_buf[SD_MAX_COPIES]; - struct epoch_log *logs; + struct epoch_log *logs, *log; + char *next_log; int nr_logs, log_length; + uint16_t support_nodes; - log_length = sd_epoch * sizeof(struct epoch_log); +#define DEFAULT_SUPPORT_NODES 32 + support_nodes = DEFAULT_SUPPORT_NODES; + log_length = sd_epoch * (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); logs = xmalloc(log_length); +retry: sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; + hdr.epoch_log.support_nodes = support_nodes; ret = dog_exec_req(&sd_nid, &hdr, logs); if (ret < 0) goto error; + if (rsp->result == SD_RES_BUFFER_SMALL) { + support_nodes *= 2; + log_length = sd_epoch * (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); + logs = xrealloc(logs, log_length); + goto retry; + } if (rsp->result != SD_RES_SUCCESS) { printf("%s\n", sd_strerror(rsp->result)); goto error; } - nr_logs = rsp->data_length / sizeof(struct epoch_log); + nr_logs = rsp->data_length / (sizeof(struct epoch_log) + + support_nodes * sizeof(struct sd_node)); + next_log = (char *)logs; for (i = nr_logs - 1; i >= 0; i--) { struct rb_root vroot = RB_ROOT; struct rb_root nroot = RB_ROOT; + log = 
(struct epoch_log *)next_log; printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n", - oid, logs[i].epoch, nr_copies); + oid, log->epoch, nr_copies); printf("---------------------------------------------------\n"); /* * When # of nodes is less than nr_copies, we only print * remaining nodes that holds all the remaining copies. */ - if (logs[i].nr_nodes < nr_copies) { - for (j = 0; j < logs[i].nr_nodes; j++) { - const struct node_id *n = &logs[i].nodes[j].nid; + if (log->nr_nodes < nr_copies) { + for (j = 0; j < log->nr_nodes; j++) { + const struct node_id *n = &log->nodes[j].nid; printf("%s\n", addr_to_str(n->addr, n->port)); } continue; } - for (int k = 0; k < logs[i].nr_nodes; k++) - rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp); + for (int k = 0; k < log->nr_nodes; k++) + rb_insert(&nroot, &log->nodes[k], rb, node_cmp); if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) disks_to_vnodes(&nroot, &vroot); else @@ -1016,6 +1033,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) printf("%s\n", addr_to_str(n->addr, n->port)); } rb_destroy(&vroot, struct sd_vnode, rb); + next_log = (char *)log->nodes + + support_nodes * sizeof(struct sd_node); } free(logs); diff --git a/include/internal_proto.h b/include/internal_proto.h index 7ec2872..ad4d822 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -211,7 +211,7 @@ struct epoch_log { uint8_t __pad[3]; uint16_t flags; char drv_name[STORE_LEN]; - struct sd_node nodes[SD_MAX_NODES]; + struct sd_node nodes[0]; }; struct vdi_op_message { diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 76fad51..1355ecb 100644 --- a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -180,6 +180,9 @@ struct sd_req { uint8_t addr[16]; uint16_t port; } node_addr; + struct { + uint16_t support_nodes; + } epoch_log; uint32_t __pad[8]; }; diff --git a/sheep/group.c b/sheep/group.c index adfd798..27e3574 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -352,7 +352,7 @@ error: int 
epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, time_t *timestamp, struct vnode_info *vinfo) { - char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)]; + char *buf = xzalloc(len + sizeof(time_t)); const struct sd_node *node; int ret; @@ -369,6 +369,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, hdr.obj.tgt_epoch = epoch; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&node->nid, &hdr, buf); + if (ret == SD_RES_BUFFER_SMALL) { + free(buf); + return -2; + } if (ret != SD_RES_SUCCESS) continue; @@ -377,6 +381,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, if (timestamp) memcpy(timestamp, buf + nodes_len, sizeof(*timestamp)); + free(buf); return nodes_len / sizeof(struct sd_node); } @@ -384,6 +389,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, * If no node has targeted epoch log, return 0 here to at least * allow reading older epoch logs. */ + free(buf); return 0; } diff --git a/sheep/ops.c b/sheep/ops.c index fb26077..12957d2 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -438,15 +438,19 @@ static int local_stat_cluster(struct request *req) { struct sd_rsp *rsp = &req->rp; struct epoch_log *elog; + char *next_elog; int i, max_elogs; uint32_t epoch; + uint16_t support_nodes = req->rq.epoch_log.support_nodes; if (req->vinfo == NULL) { sd_debug("cluster is not started up"); goto out; } - max_elogs = req->rq.data_length / sizeof(*elog); + max_elogs = req->rq.data_length / (sizeof(*elog) + + support_nodes * sizeof(struct sd_node)); + next_elog = (char *)req->data; epoch = get_latest_epoch(); for (i = 0; i < max_elogs; i++) { int nr_nodes; @@ -454,7 +458,7 @@ static int local_stat_cluster(struct request *req) if (epoch <= 0) break; - elog = (struct epoch_log *)req->data + i; + elog = (struct epoch_log *)next_elog; memset(elog, 0, sizeof(*elog)); /* some filed only need to store in first elog */ @@ -469,20 +473,29 @@ static int 
local_stat_cluster(struct request *req) } elog->epoch = epoch; - nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes, - sizeof(elog->nodes), - (time_t *)&elog->time); - if (nr_nodes == -1) - nr_nodes = epoch_log_read_remote(epoch, elog->nodes, - sizeof(elog->nodes), - (time_t *)&elog->time, - req->vinfo); - assert(nr_nodes >= 0); - assert(nr_nodes <= SD_MAX_NODES); - elog->nr_nodes = nr_nodes; - - - rsp->data_length += sizeof(*elog); + if (support_nodes > 0) { + nr_nodes = epoch_log_read_with_timestamp( + epoch, elog->nodes, + support_nodes * sizeof(struct sd_node), + (time_t *)&elog->time); + if (nr_nodes == -1) + nr_nodes = epoch_log_read_remote( + epoch, elog->nodes, + support_nodes * sizeof(struct sd_node), + (time_t *)&elog->time, + req->vinfo); + if (nr_nodes == -2) + return SD_RES_BUFFER_SMALL; + assert(nr_nodes >= 0); + assert(nr_nodes <= SD_MAX_NODES); + elog->nr_nodes = nr_nodes; + } else + elog->nr_nodes = 0; + + next_elog = (char *)elog->nodes + + support_nodes * sizeof(struct sd_node); + rsp->data_length += sizeof(*elog) + + support_nodes * sizeof(struct sd_node); epoch--; } out: @@ -520,6 +533,8 @@ static int local_get_epoch(struct request *req) &timestamp); if (nr_nodes == -1) return SD_RES_NO_TAG; + if (nr_nodes == -2) + return SD_RES_BUFFER_SMALL; nodes_len = nr_nodes * sizeof(struct sd_node); memcpy((void *)((char *)req->data + nodes_len), &timestamp, diff --git a/sheep/store.c b/sheep/store.c index eee88c7..70fddb8 100644 --- a/sheep/store.c +++ b/sheep/store.c @@ -63,8 +63,8 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len, } if (len < epoch_stat.st_size - sizeof(*timestamp)) { - sd_err("invalid epoch %"PRIu32" log", epoch); - goto err; + close(fd); + return -2; } ret = xread(fd, nodes, epoch_stat.st_size - sizeof(*timestamp)); -- 1.8.3.2 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog