At Fri, 19 Dec 2014 10:07:23 +0900, Saeki Masaki wrote: > > In the current sheepdog, vnodes is recalculated at the time of > node increase or decrease. > > In the auto recovery, first get the object from the other node, > finally delete the object do not need. > During auto recovery run, available disk decreases. > In the worst case, it exhaust available disk. > > Add the following new commands and options. > 1. option to specify vnodes in sheep. (-V, --vnodes) > - The old days it has been implemented as -v(--vnodes) option. > - For now -v option is used to print version. so add in -V (large). > $ sheep -V 100 /var/lib/sheepdog > If -V is specified, vnodes strategy of sheep is 'fixed' > (default value is 'auto') > > 2. option to dog cluster format with vnode fixed. (-V, --fixedvnodes) > $ dog cluster format -V > If 'fixed' and 'auto' vnodes strategy are mixed, > cluster format command fails. > (different sheep of vnodes strategy can not be mixed in the cluster) > > 3. dog command to change the vnodes > $ dog node vnode set <vnodes> > After changing the vnodes, new epoch are created and auto recovery will > start. > > If you want to operate the vnodes fixed, > it is necessary to manage the vnodes with capacity of > the data store in each node. > So you should use this option carefully. > > For example of using fixed vnodes strategy: > > 1) start sheep with fixed vnodes strategy. > $ sheep -V 100 /var/lib/sheepdog > $ sheep -V 110 /var/lib/sheepdog > $ dog node list > Id Host:Port V-Nodes Zone > 0 172.16.4.205:7000 100 1812140204 > 1 172.16.4.206:7000 120 1828917420 > > 2) format the cluster with fixed vnodes strategy. > $ dog cluster format -V > > 3) check vnodes strategy of cluster. 
> $ dog cluster info -v > Cluster status: running, auto-recovery enabled > Cluster store: plain with 3 redundancy policy > Cluster vnodes strategy: fixed > Cluster vnode mode: node > Cluster created at Wed Dec 17 18:20:10 2014 > > Epoch Time Version [Host:Port:V-Nodes,,,] > 2014-12-17 18:20:10 1 [172.16.4.205:7000:100, 172.16.4.206:7000:120] > > 4) change of vnodes. > $ dog node vnodes set 140 > $ dog node list > Id Host:Port V-Nodes Zone > 0 172.16.4.205:7000 140 1812140204 > 1 172.16.4.206:7000 120 1828917420
Saeki-san, thanks a lot for this patch. The change looks good to me, but it needs to be rebased onto the latest master. Could you rebase and send v2? In addition, I have some comments, mainly about minor coding style issues. I'd be glad if you could address them in v2. > > Signed-off-by Masaki Saeki <[email protected]> You need ':' between Signed-off-by and your name. > > --- > dog/cluster.c | 82 > ++++++++++++++++++++++++++++++++++++++-------- > dog/node.c | 67 +++++++++++++++++++++++++++++++++++++ > include/internal_proto.h | 3 ++ > include/sheep.h | 8 ++++ > include/sheepdog_proto.h | 2 + > sheep/config.c | 15 ++++++++- > sheep/group.c | 62 ++++++++++++++++++++++++++++++++--- > sheep/ops.c | 82 > ++++++++++++++++++++++++++++++++++++++++++++++ > sheep/sheep.c | 31 ++++++++++++++++- > 9 files changed, 330 insertions(+), 22 deletions(-) > > diff --git a/dog/cluster.c b/dog/cluster.c > index 20f190b..c92141e 100644 > --- a/dog/cluster.c > +++ b/dog/cluster.c > @@ -15,6 +15,7 @@ > #include <sys/time.h> > > #include "dog.h" > +#include "sheep.h" > #include "farm/farm.h" > > static struct sd_option cluster_options[] = { > @@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = { > "do not serve write request if number of nodes is not sufficient"}, > {'z', "block_size_shift", true, "specify the shift num of default" > " data object size"}, > + {'V', "fixedvnodes", false, "disable automatic vnodes calculation"}, > { 0, NULL, false, NULL }, > }; > > @@ -38,6 +40,7 @@ static struct cluster_cmd_data { > bool force; > bool strict; > char name[STORE_LEN]; > + bool fixed_vnodes; > } cluster_cmd_data; > > #define DEFAULT_STORE "plain" > @@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv) > struct timeval tv; > char store_name[STORE_LEN]; > static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); > + struct sd_node *n; > + > + rb_for_each_entry(n, &sd_nroot, rb) { > + struct sd_req info_req; > + struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req; > + struct cluster_info cinfo; 
> + > + sd_init_req(&info_req, SD_OP_CLUSTER_INFO); > + info_req.data_length = sizeof(cinfo); > + ret = dog_exec_req(&n->nid, &info_req, &cinfo); > + if (ret < 0) { > + sd_err("Fail to execute request"); > + return EXIT_FAILURE; > + } > + if (info_rsp->result != SD_RES_SUCCESS) { > + sd_err("%s", sd_strerror(info_rsp->result)); > + return EXIT_FAILURE; > + } > + > + if (n->nr_vnodes != 0) { > + if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES) > + && cluster_cmd_data.fixed_vnodes) { > + sd_err("Can not apply the option of '-V', " > + "because there are vnode strategy of > sheep " > + "is auto in the cluster"); > + return EXIT_FAILURE; > + } else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES) > + && !cluster_cmd_data.fixed_vnodes) { > + sd_err("Need to specify the option of '-V', " > + "because there are vnode strategy of > sheep " > + "is fixed in the cluster"); > + return EXIT_FAILURE; > + } > + } > + } > > if (cluster_cmd_data.copies > sd_nodes_nr) { > char info[1024]; > @@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv) > hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE; > #endif > > + if (cluster_cmd_data.fixed_vnodes) > + hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES; > + else > + hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES; > + > printf("using backend %s store\n", store_name); > ret = dog_exec_req(&sd_nid, &hdr, store_name); > if (ret < 0) > @@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs, > uint16_t flags) > if (entry->disks[nr_disk].disk_id == 0) > break; > } > - printf("%s%s(%d)", > - (i == 0) ? "" : ", ", > - addr_to_str(entry->nid.addr, entry->nid.port), > - nr_disk); > + printf("%s%s:%d(%d)", > + (i == 0) ? "" : ", ", > + addr_to_str(entry->nid.addr, entry->nid.port), > + entry->nr_vnodes, nr_disk); > } else > - printf("%s%s", > - (i == 0) ? "" : ", ", > - addr_to_str(entry->nid.addr, entry->nid.port)); > + printf("%s%s:%d", > + (i == 0) ? 
"" : ", ", > + addr_to_str(entry->nid.addr, entry->nid.port), > + entry->nr_vnodes); > } > } > > @@ -232,6 +276,15 @@ retry: > } > printf("%s with %s redundancy policy\n", > logs->drv_name, copy); > + > + /* show vnode strategy */ > + if (!raw_output) > + printf("Cluster vnodes strategy: "); > + if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > + printf("auto\n"); > + else > + printf("fixed\n"); > + > } else > printf("%s\n", sd_strerror(rsp->result)); > > @@ -239,15 +292,16 @@ retry: > if (!raw_output) > printf("Cluster vnode mode: "); > if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) > - printf("disk"); > + printf("disk\n"); > else > - printf("node"); > + printf("node\n"); > } > > if (!raw_output && rsp->data_length > 0) { > ct = logs[0].ctime >> 32; > - printf("\nCluster created at %s\n", ctime(&ct)); > - printf("Epoch Time Version\n"); > + printf("Cluster created at %s\n", ctime(&ct)); > + printf("Epoch Time Version > [Host:Port:V-Nodes,,,]"); The above change will break existing tests. Could you create a patch for updating tests? You can send it as another patch. 
> + printf("\n"); > } > > nr_logs = rsp->data_length / (sizeof(struct epoch_log) > @@ -761,7 +815,7 @@ failure: > static struct subcommand cluster_cmd[] = { > {"info", NULL, "aprhvT", "show cluster information", > NULL, CMD_NEED_NODELIST, cluster_info, cluster_options}, > - {"format", NULL, "bctaphzT", "create a Sheepdog store", > + {"format", NULL, "bctaphzTV", "create a Sheepdog store", > NULL, CMD_NEED_NODELIST, cluster_format, cluster_options}, > {"shutdown", NULL, "aphT", "stop Sheepdog", > NULL, 0, cluster_shutdown, cluster_options}, > @@ -823,9 +877,9 @@ static int cluster_parser(int ch, const char *opt) > " Please set shift bit larger than 20"); > exit(EXIT_FAILURE); > } > - > cluster_cmd_data.block_size_shift = block_size_shift; > - > + case 'V': > + cluster_cmd_data.fixed_vnodes = true; > break; > } > > diff --git a/dog/node.c b/dog/node.c > index a4e9142..b9d441a 100644 > --- a/dog/node.c > +++ b/dog/node.c > @@ -625,6 +625,71 @@ static int node_log(int argc, char **argv) > return do_generic_subcommand(node_log_cmd, argc, argv); > } > > +static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes) > +{ > + int ret = 0; > + struct sd_req hdr; > + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; > + > + sd_init_req(&hdr, SD_OP_SET_VNODES); > + hdr.flags = SD_FLAG_CMD_WRITE; > + hdr.data_length = sizeof(nr_vnodes); > + > + ret = dog_exec_req(nid, &hdr, nr_vnodes); > + if (ret < 0) > + return EXIT_SYSFAIL; > + > + if (rsp->result != SD_RES_SUCCESS) > + return EXIT_FAILURE; > + > + return ret; > +} > + > +static int node_vnodes_set(int argc, char **argv) > +{ > + int ret = 0; > + char *p; > + int32_t nr_vnodes = strtol(argv[optind], &p, 10); > + > + if (argv[optind] == p || nr_vnodes < 1 || nr_vnodes > UINT16_MAX > + || *p != '\0') { > + sd_err("Invalid number of vnodes '%s': must be an integer " > + "between 1 and %u", > + argv[optind], UINT16_MAX); > + exit(EXIT_USAGE); > + } > + > + ret = do_vnodes_set(&sd_nid, &nr_vnodes); > + > + switch (ret) { > + 
case EXIT_FAILURE: > + case EXIT_SYSFAIL: > + sd_err("Failed to execute request"); > + ret = -1; > + break; > + case EXIT_SUCCESS: > + /* do nothing */ > + break; > + default: > + sd_err("unknown return code of do_vnodes_set(): %d", ret); > + ret = -1; > + break; > + } > + > + return ret; > +} > + > +static struct subcommand node_vnodes_cmd[] = { > + {"set", "<num of vnodes>", NULL, "set new vnodes", > + NULL, CMD_NEED_ARG, node_vnodes_set}, > + {NULL}, > +}; > + > +static int node_vnodes(int argc, char **argv) > +{ > + return do_generic_subcommand(node_vnodes_cmd, argc, argv); > +} > + > static struct subcommand node_cmd[] = { > {"kill", "<node id>", "aprhlT", "kill node", NULL, > CMD_NEED_NODELIST, node_kill, node_options}, > @@ -640,6 +705,8 @@ static struct subcommand node_cmd[] = { > 0, node_stat, node_options}, > {"log", NULL, "aphT", "show or set log level of the node", node_log_cmd, > CMD_NEED_ARG, node_log}, > + {"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd, > + CMD_NEED_ARG, node_vnodes}, > {NULL,}, > }; > > diff --git a/include/internal_proto.h b/include/internal_proto.h > index 3f5d77f..f280d6d 100644 > --- a/include/internal_proto.h > +++ b/include/internal_proto.h > @@ -111,6 +111,8 @@ > #define SD_OP_VDI_STATE_SNAPSHOT_CTL 0xC7 > #define SD_OP_INODE_COHERENCE 0xC8 > #define SD_OP_READ_DEL_VDIS 0xC9 > +#define SD_OP_SET_VNODES 0xCC > +#define SD_OP_GET_VNODES 0xCD > > /* internal flags for hdr.flags, must be above 0x80 */ > #define SD_FLAG_CMD_RECOVERY 0x0080 > @@ -143,6 +145,7 @@ > > #define SD_CLUSTER_FLAG_STRICT 0x0001 /* Strict mode for write > */ > #define SD_CLUSTER_FLAG_DISKMODE 0x0002 /* Disk mode for cluster */ > +#define SD_CLUSTER_FLAG_AUTO_VNODES 0x0004 /* Cluster vnodes strategy */ > > enum sd_status { > SD_STATUS_OK = 1, > diff --git a/include/sheep.h b/include/sheep.h > index 22524c1..fe6f066 100644 > --- a/include/sheep.h > +++ b/include/sheep.h > @@ -149,6 +149,9 @@ static inline const char *sd_strerror(int 
err) > "IO has halted as there are not enough living nodes", > [SD_RES_READONLY] = "Object is read-only", > [SD_RES_INODE_INVALIDATED] = "Inode object is invalidated", > + [SD_RES_INVALID_VNODES_STRATEGY] = > + "Invalid cluster vnodes strategy", > + [SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode", > > /* from internal_proto.h */ > [SD_RES_OLD_NODE_VER] = "Request has an old epoch", > @@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct > cluster_info *cinfo) > return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0; > } > > +static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo) > +{ > + return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0; > +} > + > #endif > diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h > index 4f0c48c..28ededd 100644 > --- a/include/sheepdog_proto.h > +++ b/include/sheepdog_proto.h > @@ -86,6 +86,8 @@ > #define SD_RES_INCOMPLETE 0x1B /* Object (in kv) is incomplete uploading > */ > #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide > status, not ready for operation */ > #define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is > invalidated, refreshing is required */ > +#define SD_RES_GATEWAY_MODE 0x1E /* Target node is gateway mode */ > +#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */ > > /* errors above 0x80 are sheepdog-internal */ > > diff --git a/sheep/config.c b/sheep/config.c > index 383a1ed..4a1e600 100644 > --- a/sheep/config.c > +++ b/sheep/config.c > @@ -62,7 +62,12 @@ static int get_cluster_config(struct cluster_info *cinfo) > { > cinfo->ctime = config.ctime; > cinfo->nr_copies = config.copies; > - cinfo->flags = config.flags; > + if (config.ctime > 0) { > + cinfo->flags = config.flags; > + } else { > + cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) | > + (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES); > + } Could you eliminate the braces? 
In sheepdog coding style, the braces of if, for, and while statements should be removed when the body consists of a single statement. # It seems that script/checkpatch.pl doesn't work well for this style... > cinfo->copy_policy = config.copy_policy; > memcpy(cinfo->store, config.store, sizeof(config.store)); > > @@ -121,6 +126,14 @@ int init_config_file(void) > } > > reload: > + if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) != > + (sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES) > + && !sys->gateway_only > + && config.ctime > 0) { > + sd_err("Designation of before a restart and a vnodes option is > different."); > + return -1; > + } > + > ret = 0; > get_cluster_config(&sys->cinfo); > if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) != > diff --git a/sheep/group.c b/sheep/group.c > index 095b7c5..b33e514 100644 > --- a/sheep/group.c > +++ b/sheep/group.c > @@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root > *nroot) > vnode_info->nr_nodes++; > } > > - recalculate_vnodes(&vnode_info->nroot); > + if (is_cluster_autovnodes(&sys->cinfo)) > + recalculate_vnodes(&vnode_info->nroot); > > if (is_cluster_diskmode(&sys->cinfo)) > disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot); > @@ -1098,6 +1099,20 @@ static bool cluster_join_check(const struct > cluster_info *cinfo) > if (!cluster_ctime_check(cinfo)) > return false; > > + if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) { > + if (!is_cluster_autovnodes(&sys->cinfo) > + && is_cluster_autovnodes(cinfo)) { > + sd_err("failed to join for vnodes strategy unmatch. " > + " cluster:fixed, joined:auto"); > + return false; > + } else if (is_cluster_autovnodes(&sys->cinfo) > + && !is_cluster_autovnodes(cinfo)) { > + sd_err("failed to join for vnodes strategy unmatch. " > + " cluster:auto, joined:fixed"); > + return false; > + } > + } > + > /* > * Sheepdog's recovery code assumes every node have the same epoch > * history. 
But we don't check epoch history of joining node because: > @@ -1119,6 +1134,14 @@ main_fn void sd_accept_handler(const struct sd_node > *joined, > { > const struct cluster_info *cinfo = opaque; > struct sd_node *n; > + uint16_t flags; > + > + if (node_is_local(joined) && sys->gateway_only > + && sys->cinfo.ctime <= 0) { > + flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES; > + } else { > + flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES; > + } The brace problem, too. > > if (node_is_local(joined) && !cluster_join_check(cinfo)) { > sd_err("failed to join Sheepdog"); > @@ -1127,6 +1150,9 @@ main_fn void sd_accept_handler(const struct sd_node > *joined, > > cluster_info_copy(&sys->cinfo, cinfo); > > + sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES; > + sys->cinfo.flags |= flags; > + > sd_debug("join %s", node_to_str(joined)); > rb_for_each_entry(n, nroot, rb) { > sd_debug("%s", node_to_str(n)); > @@ -1191,7 +1217,7 @@ main_fn void sd_leave_handler(const struct sd_node > *left, > remove_node_from_participants(&left->nid); > } > > -static void update_node_size(struct sd_node *node) > +static void update_node_info(struct sd_node *node) > { > struct vnode_info *cur_vinfo = get_vnode_info(); > struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp); > @@ -1199,6 +1225,11 @@ static void update_node_size(struct sd_node *node) > if (unlikely(!n)) > panic("can't find %s", node_to_str(node)); > n->space = node->space; > + > + if (!is_cluster_autovnodes(&sys->cinfo)) { > + n->nr_vnodes = node->nr_vnodes; > + } The brace problem. That's all. Thanks. Hitoshi -- sheepdog mailing list [email protected] https://lists.wpkg.org/mailman/listinfo/sheepdog
