At Fri, 19 Dec 2014 10:07:23 +0900,
Saeki Masaki wrote:
> 
> In the current sheepdog, vnodes is recalculated at the time of
> node increase or decrease.
> 
> In the auto recovery, first get the object from the other node,
> finally delete the object do not need.
> During auto recovery run, available disk decreases.
> In the worst case, it exhaust available disk.
> 
> Add the following new commands and options.
> 1. option to specify vnodes in sheep. (-V, --vnodes)
>   - The old days it has been implemented as -v(--vnodes) option.
>   - For now -v option is used to print version. so add in -V (large).
> $ sheep -V 100 /var/lib/sheepdog
>   If -V is specified, vnodes strategy of sheep is 'fixed'
>   (default value is 'auto')
> 
> 2. option to dog cluster format with vnode fixed. (-V, --fixedvnodes)
>   $ dog cluster format -V
>   If 'fixed' and 'auto' vnodes strategy are mixed,
>   cluster format command fails.
>   (different sheep of vnodes strategy can not be mixed in the cluster)
> 
> 3. dog command to change the vnodes
>   $ dog node vnode set <vnodes>
>   After changing the vnodes, new epoch are created and auto recovery will 
> start.
> 
> If you want to operate the vnodes fixed,
> it is necessary to manage the vnodes with capacity of
> the data store in each node.
> So you should use this option carefully.
> 
> For example of using fixed vnodes strategy:
> 
> 1) start sheep with fixed vnodes strategy.
> $ sheep -V 100 /var/lib/sheepdog
> $ sheep -V 110 /var/lib/sheepdog
> $ dog node list
>   Id   Host:Port         V-Nodes       Zone
>    0   172.16.4.205:7000        100 1812140204
>    1   172.16.4.206:7000        120 1828917420
> 
> 2) format the cluster with fixed vnodes strategy.
> $ dog cluster format -V
> 
> 3) check vnodes strategy of cluster.
> $ dog cluster info -v
> Cluster status: running, auto-recovery enabled
> Cluster store: plain with 3 redundancy policy
> Cluster vnodes strategy: fixed
> Cluster vnode mode: node
> Cluster created at Wed Dec 17 18:20:10 2014
> 
> Epoch Time           Version [Host:Port:V-Nodes,,,]
> 2014-12-17 18:20:10      1 [172.16.4.205:7000:100, 172.16.4.206:7000:120]
> 
> 4) change of vnodes.
> $ dog node vnodes set 140
> $ dog node list
>   Id   Host:Port         V-Nodes       Zone
>    0   172.16.4.205:7000        140 1812140204
>    1   172.16.4.206:7000        120 1828917420

Saeki-san, thanks a lot for this patch. The change looks good to me,
but it needs to be rebased onto the latest master. Could you rebase and
send v2? In addition, I have some comments, mainly about trivial
coding style issues. I'd be glad if you could address them in v2.

> 
> Signed-off-by Masaki Saeki <[email protected]>

You need a ':' between Signed-off-by and your name.

> 
> ---
>  dog/cluster.c            |   82 
> ++++++++++++++++++++++++++++++++++++++--------
>  dog/node.c               |   67 +++++++++++++++++++++++++++++++++++++
>  include/internal_proto.h |    3 ++
>  include/sheep.h          |    8 ++++
>  include/sheepdog_proto.h |    2 +
>  sheep/config.c           |   15 ++++++++-
>  sheep/group.c            |   62 ++++++++++++++++++++++++++++++++---
>  sheep/ops.c              |   82 
> ++++++++++++++++++++++++++++++++++++++++++++++
>  sheep/sheep.c            |   31 ++++++++++++++++-
>  9 files changed, 330 insertions(+), 22 deletions(-)
> 
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 20f190b..c92141e 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -15,6 +15,7 @@
>  #include <sys/time.h>
>  
>  #include "dog.h"
> +#include "sheep.h"
>  #include "farm/farm.h"
>  
>  static struct sd_option cluster_options[] = {
> @@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
>        "do not serve write request if number of nodes is not sufficient"},
>       {'z', "block_size_shift", true, "specify the shift num of default"
>             " data object size"},
> +     {'V', "fixedvnodes", false, "disable automatic vnodes calculation"},
>       { 0, NULL, false, NULL },
>  };
>  
> @@ -38,6 +40,7 @@ static struct cluster_cmd_data {
>       bool force;
>       bool strict;
>       char name[STORE_LEN];
> +     bool fixed_vnodes;
>  } cluster_cmd_data;
>  
>  #define DEFAULT_STORE        "plain"
> @@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv)
>       struct timeval tv;
>       char store_name[STORE_LEN];
>       static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
> +     struct sd_node *n;
> +
> +     rb_for_each_entry(n, &sd_nroot, rb) {
> +             struct sd_req info_req;
> +             struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req;
> +             struct cluster_info cinfo;
> +
> +             sd_init_req(&info_req, SD_OP_CLUSTER_INFO);
> +             info_req.data_length = sizeof(cinfo);
> +             ret = dog_exec_req(&n->nid, &info_req, &cinfo);
> +             if (ret < 0) {
> +                     sd_err("Fail to execute request");
> +                     return EXIT_FAILURE;
> +             }
> +             if (info_rsp->result != SD_RES_SUCCESS) {
> +                     sd_err("%s", sd_strerror(info_rsp->result));
> +                     return EXIT_FAILURE;
> +             }
> +
> +             if (n->nr_vnodes != 0) {
> +                     if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +                             && cluster_cmd_data.fixed_vnodes) {
> +                             sd_err("Can not apply the option of '-V', "
> +                                     "because there are vnode strategy of 
> sheep "
> +                                     "is auto in the cluster");
> +                             return EXIT_FAILURE;
> +                     } else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +                             && !cluster_cmd_data.fixed_vnodes) {
> +                             sd_err("Need to specify the option of '-V', "
> +                                     "because there are vnode strategy of 
> sheep "
> +                                     "is fixed in the cluster");
> +                             return EXIT_FAILURE;
> +                     }
> +             }
> +     }
>  
>       if (cluster_cmd_data.copies > sd_nodes_nr) {
>               char info[1024];
> @@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv)
>       hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE;
>  #endif
>  
> +     if (cluster_cmd_data.fixed_vnodes)
> +             hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
> +     else
> +             hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
> +
>       printf("using backend %s store\n", store_name);
>       ret = dog_exec_req(&sd_nid, &hdr, store_name);
>       if (ret < 0)
> @@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs, 
> uint16_t flags)
>                               if (entry->disks[nr_disk].disk_id == 0)
>                                       break;
>                       }
> -                     printf("%s%s(%d)",
> -                            (i == 0) ? "" : ", ",
> -                            addr_to_str(entry->nid.addr, entry->nid.port),
> -                            nr_disk);
> +                     printf("%s%s:%d(%d)",
> +                             (i == 0) ? "" : ", ",
> +                             addr_to_str(entry->nid.addr, entry->nid.port),
> +                                     entry->nr_vnodes, nr_disk);
>               } else
> -                     printf("%s%s",
> -                            (i == 0) ? "" : ", ",
> -                            addr_to_str(entry->nid.addr, entry->nid.port));
> +                     printf("%s%s:%d",
> +                             (i == 0) ? "" : ", ",
> +                             addr_to_str(entry->nid.addr, entry->nid.port),
> +                                     entry->nr_vnodes);
>       }
>  }
>  
> @@ -232,6 +276,15 @@ retry:
>                       }
>                       printf("%s with %s redundancy policy\n",
>                              logs->drv_name, copy);
> +
> +                     /* show vnode strategy */
> +                     if (!raw_output)
> +                             printf("Cluster vnodes strategy: ");
> +                     if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +                             printf("auto\n");
> +                     else
> +                             printf("fixed\n");
> +
>               } else
>                       printf("%s\n", sd_strerror(rsp->result));
>  
> @@ -239,15 +292,16 @@ retry:
>               if (!raw_output)
>                       printf("Cluster vnode mode: ");
>               if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
> -                     printf("disk");
> +                     printf("disk\n");
>               else
> -                     printf("node");
> +                     printf("node\n");
>       }
>  
>       if (!raw_output && rsp->data_length > 0) {
>               ct = logs[0].ctime >> 32;
> -             printf("\nCluster created at %s\n", ctime(&ct));
> -             printf("Epoch Time           Version\n");
> +             printf("Cluster created at %s\n", ctime(&ct));
> +             printf("Epoch Time           Version
>               [Host:Port:V-Nodes,,,]");

The above change will break the existing tests. Could you create a patch
that updates the tests? You can send it as a separate patch.

> +             printf("\n");
>       }
>  
>       nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> @@ -761,7 +815,7 @@ failure:
>  static struct subcommand cluster_cmd[] = {
>       {"info", NULL, "aprhvT", "show cluster information",
>        NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
> -     {"format", NULL, "bctaphzT", "create a Sheepdog store",
> +     {"format", NULL, "bctaphzTV", "create a Sheepdog store",
>        NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
>       {"shutdown", NULL, "aphT", "stop Sheepdog",
>        NULL, 0, cluster_shutdown, cluster_options},
> @@ -823,9 +877,9 @@ static int cluster_parser(int ch, const char *opt)
>                       " Please set shift bit larger than 20");
>                       exit(EXIT_FAILURE);
>               }
> -
>               cluster_cmd_data.block_size_shift = block_size_shift;
> -
> +     case 'V':
> +             cluster_cmd_data.fixed_vnodes = true;
>               break;
>       }
>  
> diff --git a/dog/node.c b/dog/node.c
> index a4e9142..b9d441a 100644
> --- a/dog/node.c
> +++ b/dog/node.c
> @@ -625,6 +625,71 @@ static int node_log(int argc, char **argv)
>       return do_generic_subcommand(node_log_cmd, argc, argv);
>  }
>  
> +static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes)
> +{
> +     int ret = 0;
> +     struct sd_req hdr;
> +     struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> +
> +     sd_init_req(&hdr, SD_OP_SET_VNODES);
> +     hdr.flags = SD_FLAG_CMD_WRITE;
> +     hdr.data_length = sizeof(nr_vnodes);
> +
> +     ret = dog_exec_req(nid, &hdr, nr_vnodes);
> +     if (ret < 0)
> +             return EXIT_SYSFAIL;
> +
> +     if (rsp->result != SD_RES_SUCCESS)
> +             return EXIT_FAILURE;
> +
> +     return ret;
> +}
> +
> +static int node_vnodes_set(int argc, char **argv)
> +{
> +     int ret = 0;
> +     char *p;
> +     int32_t nr_vnodes = strtol(argv[optind], &p, 10);
> +
> +     if (argv[optind] == p || nr_vnodes < 1 || nr_vnodes > UINT16_MAX
> +             || *p != '\0') {
> +             sd_err("Invalid number of vnodes '%s': must be an integer "
> +                     "between 1 and %u",
> +                     argv[optind], UINT16_MAX);
> +             exit(EXIT_USAGE);
> +     }
> +
> +     ret = do_vnodes_set(&sd_nid, &nr_vnodes);
> +
> +     switch (ret) {
> +     case EXIT_FAILURE:
> +     case EXIT_SYSFAIL:
> +             sd_err("Failed to execute request");
> +             ret = -1;
> +             break;
> +     case EXIT_SUCCESS:
> +             /* do nothing */
> +             break;
> +     default:
> +             sd_err("unknown return code of do_vnodes_set(): %d", ret);
> +             ret = -1;
> +             break;
> +     }
> +
> +     return ret;
> +}
> +
> +static struct subcommand node_vnodes_cmd[] = {
> +     {"set", "<num of vnodes>", NULL, "set new vnodes",
> +      NULL, CMD_NEED_ARG, node_vnodes_set},
> +     {NULL},
> +};
> +
> +static int node_vnodes(int argc, char **argv)
> +{
> +     return do_generic_subcommand(node_vnodes_cmd, argc, argv);
> +}
> +
>  static struct subcommand node_cmd[] = {
>       {"kill", "<node id>", "aprhlT", "kill node", NULL,
>        CMD_NEED_NODELIST, node_kill, node_options},
> @@ -640,6 +705,8 @@ static struct subcommand node_cmd[] = {
>        0, node_stat, node_options},
>       {"log", NULL, "aphT", "show or set log level of the node", node_log_cmd,
>        CMD_NEED_ARG, node_log},
> +     {"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd,
> +      CMD_NEED_ARG, node_vnodes},
>       {NULL,},
>  };
>  
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index 3f5d77f..f280d6d 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -111,6 +111,8 @@
>  #define SD_OP_VDI_STATE_SNAPSHOT_CTL  0xC7
>  #define SD_OP_INODE_COHERENCE 0xC8
>  #define SD_OP_READ_DEL_VDIS  0xC9
> +#define SD_OP_SET_VNODES 0xCC
> +#define SD_OP_GET_VNODES 0xCD
>  
>  /* internal flags for hdr.flags, must be above 0x80 */
>  #define SD_FLAG_CMD_RECOVERY 0x0080
> @@ -143,6 +145,7 @@
>  
>  #define SD_CLUSTER_FLAG_STRICT               0x0001 /* Strict mode for write 
> */
>  #define SD_CLUSTER_FLAG_DISKMODE     0x0002 /* Disk mode for cluster */
> +#define SD_CLUSTER_FLAG_AUTO_VNODES  0x0004 /* Cluster vnodes strategy */
>  
>  enum sd_status {
>       SD_STATUS_OK = 1,
> diff --git a/include/sheep.h b/include/sheep.h
> index 22524c1..fe6f066 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -149,6 +149,9 @@ static inline const char *sd_strerror(int err)
>                       "IO has halted as there are not enough living nodes",
>               [SD_RES_READONLY] = "Object is read-only",
>               [SD_RES_INODE_INVALIDATED] = "Inode object is invalidated",
> +             [SD_RES_INVALID_VNODES_STRATEGY] =
> +                     "Invalid cluster vnodes strategy",
> +             [SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode",
>  
>               /* from internal_proto.h */
>               [SD_RES_OLD_NODE_VER] = "Request has an old epoch",
> @@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct 
> cluster_info *cinfo)
>       return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0;
>  }
>  
> +static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo)
> +{
> +     return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0;
> +}
> +
>  #endif
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 4f0c48c..28ededd 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -86,6 +86,8 @@
>  #define SD_RES_INCOMPLETE    0x1B /* Object (in kv) is incomplete uploading 
> */
>  #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide 
> status, not ready for operation */
>  #define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is 
> invalidated, refreshing is required */
> +#define SD_RES_GATEWAY_MODE  0x1E /* Target node is gateway mode */
> +#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */
>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> diff --git a/sheep/config.c b/sheep/config.c
> index 383a1ed..4a1e600 100644
> --- a/sheep/config.c
> +++ b/sheep/config.c
> @@ -62,7 +62,12 @@ static int get_cluster_config(struct cluster_info *cinfo)
>  {
>       cinfo->ctime = config.ctime;
>       cinfo->nr_copies = config.copies;
> -     cinfo->flags = config.flags;
> +     if (config.ctime > 0) {
> +             cinfo->flags = config.flags;
> +     } else {
> +             cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) |
> +                     (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
> +     }

Could you eliminate the braces? In the sheepdog coding style, the braces
of if, for, and while statements should be removed when the body consists
of only a single statement.

# It seems that script/checkpatch.pl doesn't work well for this style...

>       cinfo->copy_policy = config.copy_policy;
>       memcpy(cinfo->store, config.store, sizeof(config.store));
>  
> @@ -121,6 +126,14 @@ int init_config_file(void)
>       }
>  
>  reload:
> +     if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) !=
> +                     (sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
> +             && !sys->gateway_only
> +             && config.ctime > 0) {
> +             sd_err("Designation of before a restart and a vnodes option is 
> different.");
> +             return -1;
> +     }
> +
>       ret = 0;
>       get_cluster_config(&sys->cinfo);
>       if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
> diff --git a/sheep/group.c b/sheep/group.c
> index 095b7c5..b33e514 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root 
> *nroot)
>               vnode_info->nr_nodes++;
>       }
>  
> -     recalculate_vnodes(&vnode_info->nroot);
> +     if (is_cluster_autovnodes(&sys->cinfo))
> +             recalculate_vnodes(&vnode_info->nroot);
>  
>       if (is_cluster_diskmode(&sys->cinfo))
>               disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
> @@ -1098,6 +1099,20 @@ static bool cluster_join_check(const struct 
> cluster_info *cinfo)
>       if (!cluster_ctime_check(cinfo))
>               return false;
>  
> +     if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) {
> +             if (!is_cluster_autovnodes(&sys->cinfo)
> +                     && is_cluster_autovnodes(cinfo)) {
> +                     sd_err("failed to join for vnodes strategy unmatch. "
> +                             " cluster:fixed, joined:auto");
> +                     return false;
> +             } else if (is_cluster_autovnodes(&sys->cinfo)
> +                     && !is_cluster_autovnodes(cinfo)) {
> +                     sd_err("failed to join for vnodes strategy unmatch. "
> +                             " cluster:auto, joined:fixed");
> +                     return false;
> +             }
> +     }
> +
>       /*
>        * Sheepdog's recovery code assumes every node have the same epoch
>        * history. But we don't check epoch history of joining node because:
> @@ -1119,6 +1134,14 @@ main_fn void sd_accept_handler(const struct sd_node 
> *joined,
>  {
>       const struct cluster_info *cinfo = opaque;
>       struct sd_node *n;
> +     uint16_t flags;
> +
> +     if (node_is_local(joined) && sys->gateway_only
> +             && sys->cinfo.ctime <= 0) {
> +             flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES;
> +     } else {
> +             flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES;
> +     }

The same brace problem applies here, too.

>  
>       if (node_is_local(joined) && !cluster_join_check(cinfo)) {
>               sd_err("failed to join Sheepdog");
> @@ -1127,6 +1150,9 @@ main_fn void sd_accept_handler(const struct sd_node 
> *joined,
>  
>       cluster_info_copy(&sys->cinfo, cinfo);
>  
> +     sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
> +     sys->cinfo.flags |= flags;
> +
>       sd_debug("join %s", node_to_str(joined));
>       rb_for_each_entry(n, nroot, rb) {
>               sd_debug("%s", node_to_str(n));
> @@ -1191,7 +1217,7 @@ main_fn void sd_leave_handler(const struct sd_node 
> *left,
>       remove_node_from_participants(&left->nid);
>  }
>  
> -static void update_node_size(struct sd_node *node)
> +static void update_node_info(struct sd_node *node)
>  {
>       struct vnode_info *cur_vinfo = get_vnode_info();
>       struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
> @@ -1199,6 +1225,11 @@ static void update_node_size(struct sd_node *node)
>       if (unlikely(!n))
>               panic("can't find %s", node_to_str(node));
>       n->space = node->space;
> +
> +     if (!is_cluster_autovnodes(&sys->cinfo)) {
> +             n->nr_vnodes = node->nr_vnodes;
> +     }

The same brace problem applies here.

That's all. Thanks.
Hitoshi
-- 
sheepdog mailing list
[email protected]
https://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to