On Wed, Jul 30, 2014 at 02:26:20PM +0800, Ruoyu wrote: > Current epoch_log contains a long nodes array to sync nodes and > epoch in the cluster. It is simple, but there is a potential > performance issue because each epoch log occupies nearly 500 > KBytes. If the cluster members change frequently, epoch is lifted > frequently, more and more unused data is transfered between > client and server. If we don't find a way, the performance will > go from bad to worse. > > Although the max node number is 6144, we only use a few of them. > Therefore, the first solution is using a zero-length array, > client (dog) and server (sheep) will negotiate an appropriate > supported node number. This way will spend much less memory and > will run much faster than before. > > v3: > - epoch_log_read series functions are changed to propagate error > to upper layer. > > v2: > - internal data structure is changed.
You need put change log under Signed-off-by line > Signed-off-by: Ruoyu <lian...@ucweb.com> > --- <-- Here we put change log for a single patch, otherwise put it in the introduction patch (git format-patch --cover-letter xxx). > > dog/cluster.c | 40 +++++++++++++++++++++++---------- > dog/vdi.c | 37 +++++++++++++++++++++++-------- > include/internal_proto.h | 2 +- > include/sheepdog_proto.h | 1 + > sheep/group.c | 57 > +++++++++++++++++++++++++----------------------- > sheep/ops.c | 55 +++++++++++++++++++++++++++------------------- > sheep/sheep_priv.h | 10 +++++---- > sheep/store.c | 25 ++++++++++++--------- > 8 files changed, 143 insertions(+), 84 deletions(-) > > diff --git a/dog/cluster.c b/dog/cluster.c > index 188d4f4..508b65a 100644 > --- a/dog/cluster.c > +++ b/dog/cluster.c > @@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv) > return EXIT_SUCCESS; > } > > -static void print_nodes(const struct epoch_log *logs, int epoch) > +static void print_nodes(const struct epoch_log *logs, uint16_t flags) > { > int i, nr_disk; > const struct sd_node *entry; > > - for (i = 0; i < logs[epoch].nr_nodes; i++) { > - entry = logs[epoch].nodes + i; > - if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) { > + for (i = 0; i < logs->nr_nodes; i++) { > + entry = logs->nodes + i; > + if (flags & SD_CLUSTER_FLAG_DISKMODE) { > for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) { > if (entry->disks[nr_disk].disk_id == 0) > break; > @@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv) > int i, ret; > struct sd_req hdr; > struct sd_rsp *rsp = (struct sd_rsp *)&hdr; > - struct epoch_log *logs; > + struct epoch_log *logs, *log; > + char *next_log; > int nr_logs, log_length; > time_t ti, ct; > struct tm tm; > char time_str[128]; > + uint32_t support_nodes; > > - log_length = sd_epoch * sizeof(struct epoch_log); > +#define DEFAULT_SUPPORT_NODES 32 It is used by mutiple files, so define it in .h file. By the way, why not use sd_nodes_nr directly? > + support_nodes = DEFAULT_SUPPORT_NODES; > + log_length = sd_epoch * (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > logs = xmalloc(log_length); > > +retry: > sd_init_req(&hdr, SD_OP_STAT_CLUSTER); > hdr.data_length = log_length; > + hdr.cluster.support_nodes = support_nodes; > > ret = dog_exec_req(&sd_nid, &hdr, logs); > if (ret < 0) > goto error; > + if (rsp->result == SD_RES_BUFFER_SMALL) { > + support_nodes *= 2; > + log_length = sd_epoch * (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > + logs = xrealloc(logs, log_length); > + goto retry; > + } > > /* show cluster status */ > if (!raw_output) > @@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv) > printf("Epoch Time Version\n"); > } > > - nr_logs = rsp->data_length / sizeof(struct epoch_log); > + nr_logs = rsp->data_length / (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > + next_log = (char *)logs; > for (i = 0; i < nr_logs; i++) { > - > - ti = logs[i].time; > + log = (struct epoch_log *)next_log; > + ti = log->time; > if (raw_output) { > snprintf(time_str, sizeof(time_str), "%" PRIu64, > (uint64_t) ti); > } else { > @@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv) > strftime(time_str, sizeof(time_str), "%Y-%m-%d > %H:%M:%S", &tm); > } > > - printf(raw_output ? "%s %d" : "%s %6d", time_str, > logs[i].epoch); > + printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch); > printf(" ["); > - print_nodes(logs, i); > + print_nodes(log, logs->flags); > printf("]\n"); > + next_log = (char *)log->nodes > + + support_nodes * sizeof(struct sd_node); > } > > free(logs); > diff --git a/dog/vdi.c b/dog/vdi.c > index 2e3f7b3..6f0b748 100644 > --- a/dog/vdi.c > +++ b/dog/vdi.c > @@ -1096,47 +1096,64 @@ static int do_track_object(uint64_t oid, uint8_t > nr_copies) > struct sd_req hdr; > struct sd_rsp *rsp = (struct sd_rsp *)&hdr; > const struct sd_vnode *vnode_buf[SD_MAX_COPIES]; > - struct epoch_log *logs; > + struct epoch_log *logs, *log; > + char *next_log; > int nr_logs, log_length; > + uint32_t support_nodes; > > - log_length = sd_epoch * sizeof(struct epoch_log); > +#define DEFAULT_SUPPORT_NODES 32 > + support_nodes = DEFAULT_SUPPORT_NODES; > + log_length = sd_epoch * (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > logs = xmalloc(log_length); > > +retry: > sd_init_req(&hdr, SD_OP_STAT_CLUSTER); > hdr.data_length = log_length; > + hdr.cluster.support_nodes = support_nodes; > > ret = dog_exec_req(&sd_nid, &hdr, logs); > if (ret < 0) > goto error; > > + if (rsp->result == SD_RES_BUFFER_SMALL) { > + support_nodes *= 2; > + log_length = sd_epoch * (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > + logs = xrealloc(logs, log_length); > + goto retry; > + } > if (rsp->result != SD_RES_SUCCESS) { > printf("%s\n", sd_strerror(rsp->result)); > goto error; > } > > - nr_logs = rsp->data_length / sizeof(struct epoch_log); > + nr_logs = rsp->data_length / (sizeof(struct epoch_log) > + + support_nodes * sizeof(struct sd_node)); > + next_log = (char *)logs; > for (i = nr_logs - 1; i >= 0; i--) { > struct rb_root vroot = RB_ROOT; > struct rb_root nroot = RB_ROOT; > > + log = (struct epoch_log *)next_log; > printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n", > - oid, logs[i].epoch, nr_copies); > + oid, log->epoch, nr_copies); > printf("---------------------------------------------------\n"); > > /* > * When # of nodes is less than nr_copies, we only print > * remaining nodes that holds all the remaining copies. > */ > - if (logs[i].nr_nodes < nr_copies) { > - for (j = 0; j < logs[i].nr_nodes; j++) { > - const struct node_id *n = &logs[i].nodes[j].nid; > + if (log->nr_nodes < nr_copies) { > + for (j = 0; j < log->nr_nodes; j++) { > + const struct node_id *n = &log->nodes[j].nid; > > printf("%s\n", addr_to_str(n->addr, n->port)); > } > continue; > } > - for (int k = 0; k < logs[i].nr_nodes; k++) > - rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp); > + for (int k = 0; k < log->nr_nodes; k++) > + rb_insert(&nroot, &log->nodes[k], rb, node_cmp); > if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) > disks_to_vnodes(&nroot, &vroot); > else > @@ -1148,6 +1165,8 @@ static int do_track_object(uint64_t oid, uint8_t > nr_copies) > printf("%s\n", addr_to_str(n->addr, n->port)); > } > rb_destroy(&vroot, struct sd_vnode, rb); > + next_log = (char *)log->nodes > + + support_nodes * sizeof(struct sd_node); > } > > free(logs); > diff --git a/include/internal_proto.h b/include/internal_proto.h > index 37afb46..d61b5a5 100644 > --- a/include/internal_proto.h > +++ b/include/internal_proto.h > @@ -221,7 +221,7 @@ struct epoch_log { > uint8_t __pad[3]; > uint16_t flags; > char drv_name[STORE_LEN]; > - struct sd_node nodes[SD_MAX_NODES]; > + struct sd_node nodes[0]; > }; > > struct vdi_op_message { > diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h > index 7cfdccb..8b26a8b 100644 > --- a/include/sheepdog_proto.h > +++ b/include/sheepdog_proto.h > @@ -165,6 +165,7 @@ struct sd_req { > uint8_t copy_policy; > uint16_t flags; > uint32_t tag; > + uint32_t support_nodes; support_nodes is not good. nodes_nr is simpler. -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog