At Fri, 19 Dec 2014 10:07:23 +0900,
Saeki Masaki wrote:
In the current sheepdog, vnodes are recalculated whenever a node
is added or removed.
During auto recovery, a node first gets objects from the other nodes,
and only at the end deletes the objects it no longer needs.
So while auto recovery is running, the available disk space decreases.
In the worst case, it exhausts the available disk space.
Add the following new commands and options.
1. option to specify vnodes in sheep. (-V, --vnodes)
- In the past, this was implemented as the -v (--vnodes) option.
- Now the -v option is used to print the version, so it is added as -V (uppercase).
$ sheep -V 100 /var/lib/sheepdog
If -V is specified, vnodes strategy of sheep is 'fixed'
(default value is 'auto')
2. option for dog cluster format to fix the vnodes. (-V, --fixedvnodes)
$ dog cluster format -V
If sheep with the 'fixed' and 'auto' vnodes strategies are mixed,
the cluster format command fails.
(sheep with different vnodes strategies cannot be mixed in the cluster)
3. dog command to change the vnodes
$ dog node vnodes set <vnodes>
After changing the vnodes, a new epoch is created and auto recovery will
start.
If you want to operate with fixed vnodes,
you need to manage the vnodes of each node yourself according to
the capacity of its data store.
So you should use this option carefully.
For example of using fixed vnodes strategy:
1) start sheep with fixed vnodes strategy.
$ sheep -V 100 /var/lib/sheepdog
$ sheep -V 120 /var/lib/sheepdog
$ dog node list
Id Host:Port V-Nodes Zone
0 172.16.4.205:7000 100 1812140204
1 172.16.4.206:7000 120 1828917420
2) format the cluster with fixed vnodes strategy.
$ dog cluster format -V
3) check vnodes strategy of cluster.
$ dog cluster info -v
Cluster status: running, auto-recovery enabled
Cluster store: plain with 3 redundancy policy
Cluster vnodes strategy: fixed
Cluster vnode mode: node
Cluster created at Wed Dec 17 18:20:10 2014
Epoch Time Version [Host:Port:V-Nodes,,,]
2014-12-17 18:20:10 1 [172.16.4.205:7000:100, 172.16.4.206:7000:120]
4) change of vnodes.
$ dog node vnodes set 140
$ dog node list
Id Host:Port V-Nodes Zone
0 172.16.4.205:7000 140 1812140204
1 172.16.4.206:7000 120 1828917420
Saeki-san, thanks a lot for this patch. The change seems good to me,
but it needs to be rebased onto the latest master. Could you rebase and
send v2? In addition, I have some comments, mainly about trivial
coding style issues. I'd be glad if you could address them in v2.
Signed-off-by Masaki Saeki <[email protected]>
You need ':' between Signed-off-by and your name.
---
dog/cluster.c | 82 ++++++++++++++++++++++++++++++++++++++--------
dog/node.c | 67 +++++++++++++++++++++++++++++++++++++
include/internal_proto.h | 3 ++
include/sheep.h | 8 ++++
include/sheepdog_proto.h | 2 +
sheep/config.c | 15 ++++++++-
sheep/group.c | 62 ++++++++++++++++++++++++++++++++---
sheep/ops.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++
sheep/sheep.c | 31 ++++++++++++++++-
9 files changed, 330 insertions(+), 22 deletions(-)
diff --git a/dog/cluster.c b/dog/cluster.c
index 20f190b..c92141e 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -15,6 +15,7 @@
#include <sys/time.h>
#include "dog.h"
+#include "sheep.h"
#include "farm/farm.h"
static struct sd_option cluster_options[] = {
@@ -27,6 +28,7 @@ static struct sd_option cluster_options[] = {
"do not serve write request if number of nodes is not sufficient"},
{'z', "block_size_shift", true, "specify the shift num of default"
" data object size"},
+ {'V', "fixedvnodes", false, "disable automatic vnodes calculation"},
{ 0, NULL, false, NULL },
};
@@ -38,6 +40,7 @@ static struct cluster_cmd_data {
bool force;
bool strict;
char name[STORE_LEN];
+ bool fixed_vnodes;
} cluster_cmd_data;
#define DEFAULT_STORE "plain"
@@ -87,6 +90,41 @@ static int cluster_format(int argc, char **argv)
struct timeval tv;
char store_name[STORE_LEN];
static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
+ struct sd_node *n;
+
+ rb_for_each_entry(n, &sd_nroot, rb) {
+ struct sd_req info_req;
+ struct sd_rsp *info_rsp = (struct sd_rsp *)&info_req;
+ struct cluster_info cinfo;
+
+ sd_init_req(&info_req, SD_OP_CLUSTER_INFO);
+ info_req.data_length = sizeof(cinfo);
+ ret = dog_exec_req(&n->nid, &info_req, &cinfo);
+ if (ret < 0) {
+ sd_err("Fail to execute request");
+ return EXIT_FAILURE;
+ }
+ if (info_rsp->result != SD_RES_SUCCESS) {
+ sd_err("%s", sd_strerror(info_rsp->result));
+ return EXIT_FAILURE;
+ }
+
+ if (n->nr_vnodes != 0) {
+ if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+ && cluster_cmd_data.fixed_vnodes) {
+ sd_err("Can not apply the option of '-V', "
+ "because there are vnode strategy of sheep
"
+ "is auto in the cluster");
+ return EXIT_FAILURE;
+ } else if (!(cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+ && !cluster_cmd_data.fixed_vnodes) {
+ sd_err("Need to specify the option of '-V', "
+ "because there are vnode strategy of sheep
"
+ "is fixed in the cluster");
+ return EXIT_FAILURE;
+ }
+ }
+ }
if (cluster_cmd_data.copies > sd_nodes_nr) {
char info[1024];
@@ -132,6 +170,11 @@ static int cluster_format(int argc, char **argv)
hdr.cluster.flags |= SD_CLUSTER_FLAG_DISKMODE;
#endif
+ if (cluster_cmd_data.fixed_vnodes)
+ hdr.cluster.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
+ else
+ hdr.cluster.flags |= SD_CLUSTER_FLAG_AUTO_VNODES;
+
printf("using backend %s store\n", store_name);
ret = dog_exec_req(&sd_nid, &hdr, store_name);
if (ret < 0)
@@ -160,14 +203,15 @@ static void print_nodes(const struct epoch_log *logs,
uint16_t flags)
if (entry->disks[nr_disk].disk_id == 0)
break;
}
- printf("%s%s(%d)",
- (i == 0) ? "" : ", ",
- addr_to_str(entry->nid.addr, entry->nid.port),
- nr_disk);
+ printf("%s%s:%d(%d)",
+ (i == 0) ? "" : ", ",
+ addr_to_str(entry->nid.addr, entry->nid.port),
+ entry->nr_vnodes, nr_disk);
} else
- printf("%s%s",
- (i == 0) ? "" : ", ",
- addr_to_str(entry->nid.addr, entry->nid.port));
+ printf("%s%s:%d",
+ (i == 0) ? "" : ", ",
+ addr_to_str(entry->nid.addr, entry->nid.port),
+ entry->nr_vnodes);
}
}
@@ -232,6 +276,15 @@ retry:
}
printf("%s with %s redundancy policy\n",
logs->drv_name, copy);
+
+ /* show vnode strategy */
+ if (!raw_output)
+ printf("Cluster vnodes strategy: ");
+ if (logs->flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+ printf("auto\n");
+ else
+ printf("fixed\n");
+
} else
printf("%s\n", sd_strerror(rsp->result));
@@ -239,15 +292,16 @@ retry:
if (!raw_output)
printf("Cluster vnode mode: ");
if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
- printf("disk");
+ printf("disk\n");
else
- printf("node");
+ printf("node\n");
}
if (!raw_output && rsp->data_length > 0) {
ct = logs[0].ctime >> 32;
- printf("\nCluster created at %s\n", ctime(&ct));
- printf("Epoch Time Version\n");
+ printf("Cluster created at %s\n", ctime(&ct));
+ printf("Epoch Time Version
[Host:Port:V-Nodes,,,]");
The above change will break existing tests. Could you create a patch
for updating tests? You can send it as another patch.
+ printf("\n");
}
nr_logs = rsp->data_length / (sizeof(struct epoch_log)
@@ -761,7 +815,7 @@ failure:
static struct subcommand cluster_cmd[] = {
{"info", NULL, "aprhvT", "show cluster information",
NULL, CMD_NEED_NODELIST, cluster_info, cluster_options},
- {"format", NULL, "bctaphzT", "create a Sheepdog store",
+ {"format", NULL, "bctaphzTV", "create a Sheepdog store",
NULL, CMD_NEED_NODELIST, cluster_format, cluster_options},
{"shutdown", NULL, "aphT", "stop Sheepdog",
NULL, 0, cluster_shutdown, cluster_options},
@@ -823,9 +877,9 @@ static int cluster_parser(int ch, const char *opt)
" Please set shift bit larger than 20");
exit(EXIT_FAILURE);
}
-
cluster_cmd_data.block_size_shift = block_size_shift;
-
+ case 'V':
+ cluster_cmd_data.fixed_vnodes = true;
break;
}
diff --git a/dog/node.c b/dog/node.c
index a4e9142..b9d441a 100644
--- a/dog/node.c
+++ b/dog/node.c
@@ -625,6 +625,71 @@ static int node_log(int argc, char **argv)
return do_generic_subcommand(node_log_cmd, argc, argv);
}
+static int do_vnodes_set(const struct node_id *nid, int *nr_vnodes)
+{
+ int ret = 0;
+ struct sd_req hdr;
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+ sd_init_req(&hdr, SD_OP_SET_VNODES);
+ hdr.flags = SD_FLAG_CMD_WRITE;
+ hdr.data_length = sizeof(nr_vnodes);
+
+ ret = dog_exec_req(nid, &hdr, nr_vnodes);
+ if (ret < 0)
+ return EXIT_SYSFAIL;
+
+ if (rsp->result != SD_RES_SUCCESS)
+ return EXIT_FAILURE;
+
+ return ret;
+}
+
+static int node_vnodes_set(int argc, char **argv)
+{
+ int ret = 0;
+ char *p;
+ int32_t nr_vnodes = strtol(argv[optind], &p, 10);
+
+ if (argv[optind] == p || nr_vnodes < 1 || nr_vnodes > UINT16_MAX
+ || *p != '\0') {
+ sd_err("Invalid number of vnodes '%s': must be an integer "
+ "between 1 and %u",
+ argv[optind], UINT16_MAX);
+ exit(EXIT_USAGE);
+ }
+
+ ret = do_vnodes_set(&sd_nid, &nr_vnodes);
+
+ switch (ret) {
+ case EXIT_FAILURE:
+ case EXIT_SYSFAIL:
+ sd_err("Failed to execute request");
+ ret = -1;
+ break;
+ case EXIT_SUCCESS:
+ /* do nothing */
+ break;
+ default:
+ sd_err("unknown return code of do_vnodes_set(): %d", ret);
+ ret = -1;
+ break;
+ }
+
+ return ret;
+}
+
+static struct subcommand node_vnodes_cmd[] = {
+ {"set", "<num of vnodes>", NULL, "set new vnodes",
+ NULL, CMD_NEED_ARG, node_vnodes_set},
+ {NULL},
+};
+
+static int node_vnodes(int argc, char **argv)
+{
+ return do_generic_subcommand(node_vnodes_cmd, argc, argv);
+}
+
static struct subcommand node_cmd[] = {
{"kill", "<node id>", "aprhlT", "kill node", NULL,
CMD_NEED_NODELIST, node_kill, node_options},
@@ -640,6 +705,8 @@ static struct subcommand node_cmd[] = {
0, node_stat, node_options},
{"log", NULL, "aphT", "show or set log level of the node", node_log_cmd,
CMD_NEED_ARG, node_log},
+ {"vnodes", "<num of vnodes>", "aph", "set new vnodes", node_vnodes_cmd,
+ CMD_NEED_ARG, node_vnodes},
{NULL,},
};
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 3f5d77f..f280d6d 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -111,6 +111,8 @@
#define SD_OP_VDI_STATE_SNAPSHOT_CTL 0xC7
#define SD_OP_INODE_COHERENCE 0xC8
#define SD_OP_READ_DEL_VDIS 0xC9
+#define SD_OP_SET_VNODES 0xCC
+#define SD_OP_GET_VNODES 0xCD
/* internal flags for hdr.flags, must be above 0x80 */
#define SD_FLAG_CMD_RECOVERY 0x0080
@@ -143,6 +145,7 @@
#define SD_CLUSTER_FLAG_STRICT 0x0001 /* Strict mode for write
*/
#define SD_CLUSTER_FLAG_DISKMODE 0x0002 /* Disk mode for cluster */
+#define SD_CLUSTER_FLAG_AUTO_VNODES 0x0004 /* Cluster vnodes strategy */
enum sd_status {
SD_STATUS_OK = 1,
diff --git a/include/sheep.h b/include/sheep.h
index 22524c1..fe6f066 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -149,6 +149,9 @@ static inline const char *sd_strerror(int err)
"IO has halted as there are not enough living nodes",
[SD_RES_READONLY] = "Object is read-only",
[SD_RES_INODE_INVALIDATED] = "Inode object is invalidated",
+ [SD_RES_INVALID_VNODES_STRATEGY] =
+ "Invalid cluster vnodes strategy",
+ [SD_RES_GATEWAY_MODE] = "Targeted node is gateway mode",
/* from internal_proto.h */
[SD_RES_OLD_NODE_VER] = "Request has an old epoch",
@@ -328,4 +331,9 @@ static inline bool is_cluster_diskmode(const struct
cluster_info *cinfo)
return (cinfo->flags & SD_CLUSTER_FLAG_DISKMODE) > 0;
}
+static inline bool is_cluster_autovnodes(const struct cluster_info *cinfo)
+{
+ return (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES) > 0;
+}
+
#endif
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 4f0c48c..28ededd 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -86,6 +86,8 @@
#define SD_RES_INCOMPLETE 0x1B /* Object (in kv) is incomplete uploading */
#define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide
status, not ready for operation */
#define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is
invalidated, refreshing is required */
+#define SD_RES_GATEWAY_MODE 0x1E /* Target node is gateway mode */
+#define SD_RES_INVALID_VNODES_STRATEGY 0x1F /* Invalid vnodes strategy */
/* errors above 0x80 are sheepdog-internal */
diff --git a/sheep/config.c b/sheep/config.c
index 383a1ed..4a1e600 100644
--- a/sheep/config.c
+++ b/sheep/config.c
@@ -62,7 +62,12 @@ static int get_cluster_config(struct cluster_info *cinfo)
{
cinfo->ctime = config.ctime;
cinfo->nr_copies = config.copies;
- cinfo->flags = config.flags;
+ if (config.ctime > 0) {
+ cinfo->flags = config.flags;
+ } else {
+ cinfo->flags = (config.flags & ~SD_CLUSTER_FLAG_AUTO_VNODES) |
+ (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
+ }
Could you eliminate the braces? In sheepdog coding style, the braces of
if, for, and while statements should be removed if their bodies don't have
two or more statements.
# it seems that script/checkpatch.pl doesn't work well for this style...
cinfo->copy_policy = config.copy_policy;
memcpy(cinfo->store, config.store, sizeof(config.store));
@@ -121,6 +126,14 @@ int init_config_file(void)
}
reload:
+ if ((config.flags & SD_CLUSTER_FLAG_AUTO_VNODES) !=
+ (sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
+ && !sys->gateway_only
+ && config.ctime > 0) {
+ sd_err("Designation of before a restart and a vnodes option is
different.");
+ return -1;
+ }
+
ret = 0;
get_cluster_config(&sys->cinfo);
if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
diff --git a/sheep/group.c b/sheep/group.c
index 095b7c5..b33e514 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -145,7 +145,8 @@ struct vnode_info *alloc_vnode_info(const struct rb_root
*nroot)
vnode_info->nr_nodes++;
}
- recalculate_vnodes(&vnode_info->nroot);
+ if (is_cluster_autovnodes(&sys->cinfo))
+ recalculate_vnodes(&vnode_info->nroot);
if (is_cluster_diskmode(&sys->cinfo))
disks_to_vnodes(&vnode_info->nroot, &vnode_info->vroot);
@@ -1098,6 +1099,20 @@ static bool cluster_join_check(const struct cluster_info
*cinfo)
if (!cluster_ctime_check(cinfo))
return false;
+ if (cinfo->ctime > 0 && sys->this_node.nr_vnodes != 0) {
+ if (!is_cluster_autovnodes(&sys->cinfo)
+ && is_cluster_autovnodes(cinfo)) {
+ sd_err("failed to join for vnodes strategy unmatch. "
+ " cluster:fixed, joined:auto");
+ return false;
+ } else if (is_cluster_autovnodes(&sys->cinfo)
+ && !is_cluster_autovnodes(cinfo)) {
+ sd_err("failed to join for vnodes strategy unmatch. "
+ " cluster:auto, joined:fixed");
+ return false;
+ }
+ }
+
/*
* Sheepdog's recovery code assumes every node have the same epoch
* history. But we don't check epoch history of joining node because:
@@ -1119,6 +1134,14 @@ main_fn void sd_accept_handler(const struct sd_node
*joined,
{
const struct cluster_info *cinfo = opaque;
struct sd_node *n;
+ uint16_t flags;
+
+ if (node_is_local(joined) && sys->gateway_only
+ && sys->cinfo.ctime <= 0) {
+ flags = cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES;
+ } else {
+ flags = sys->cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES;
+ }
The brace problem, too.
if (node_is_local(joined) && !cluster_join_check(cinfo)) {
sd_err("failed to join Sheepdog");
@@ -1127,6 +1150,9 @@ main_fn void sd_accept_handler(const struct sd_node
*joined,
cluster_info_copy(&sys->cinfo, cinfo);
+ sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES;
+ sys->cinfo.flags |= flags;
+
sd_debug("join %s", node_to_str(joined));
rb_for_each_entry(n, nroot, rb) {
sd_debug("%s", node_to_str(n));
@@ -1191,7 +1217,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
remove_node_from_participants(&left->nid);
}
-static void update_node_size(struct sd_node *node)
+static void update_node_info(struct sd_node *node)
{
struct vnode_info *cur_vinfo = get_vnode_info();
struct sd_node *n = rb_search(&cur_vinfo->nroot, node, rb, node_cmp);
@@ -1199,6 +1225,11 @@ static void update_node_size(struct sd_node *node)
if (unlikely(!n))
panic("can't find %s", node_to_str(node));
n->space = node->space;
+
+ if (!is_cluster_autovnodes(&sys->cinfo)) {
+ n->nr_vnodes = node->nr_vnodes;
+ }
The brace problem.
That's all. Thanks.
Hitoshi