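This patch handles local disk crashes and file system unmounts that occur while Sheepdog is running. When an access to a local object fails with SD_RES_EIO, the node sends an SD_MSG_LEAVE message to the cluster and from then on works only as a gateway.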
Signed-off-by: MORITA Kazutaka <[email protected]> --- sheep/group.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-- sheep/sdnet.c | 26 ++++++++++++++++++++++++ sheep/sheep_priv.h | 2 + sheep/store.c | 11 ++++++++- 4 files changed, 89 insertions(+), 5 deletions(-) diff --git a/sheep/group.c b/sheep/group.c index 6deb4a2..cb693ff 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -65,6 +65,10 @@ struct join_message { } nodes[SD_MAX_NODES]; }; +struct leave_message { + struct message_header header; +}; + struct vdi_op_message { struct message_header header; struct sd_vdi_req req; @@ -841,6 +845,9 @@ static void __sd_deliver_done(struct cpg_event *cevent) struct message_header *m; char name[128]; int do_recovery; + struct node *node; + struct sheepdog_node_list_entry e[SD_MAX_NODES]; + int nr; m = w->msg; @@ -849,13 +856,33 @@ static void __sd_deliver_done(struct cpg_event *cevent) case SD_MSG_JOIN: update_cluster_info((struct join_message *)m); break; + case SD_MSG_LEAVE: + node = find_node(&sys->sd_node_list, m->nodeid, m->pid); + if (node) { + sys->nr_vnodes = 0; + + list_del(&node->list); + free(node); + if (sys->status == SD_STATUS_OK) { + nr = get_ordered_sd_node_list(e); + dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr); + epoch_log_write(sys->epoch + 1, (char *)e, + nr * sizeof(struct sheepdog_node_list_entry)); + + sys->epoch++; + + update_epoch_store(sys->epoch); + } + } + break; default: eprintf("unknown message %d\n", m->op); break; } } - do_recovery = (m->state == DM_FIN && m->op == SD_MSG_JOIN); + do_recovery = (m->state == DM_FIN && + (m->op == SD_MSG_JOIN || m->op == SD_MSG_LEAVE)); dprintf("op: %d, state: %u, size: %d, from: %s\n", m->op, m->state, m->msg_length, @@ -1364,6 +1391,11 @@ do_retry: list_del(&cevent->cpg_event_list); if (is_io_request(req->rq.opcode)) { + int copies = sys->nr_sobjs; + + if (copies > req->nr_nodes) + copies = req->nr_nodes; + if (__is_access_to_recoverying_objects(req)) { if (req->rq.flags & 
SD_FLAG_CMD_DIRECT) { req->rp.result = SD_RES_NEW_NODE_VER; @@ -1383,9 +1415,9 @@ do_retry: sys->nr_outstanding_io++; if (is_access_local(req->entry, req->nr_vnodes, - ((struct sd_obj_req *)&req->rq)->oid, sys->nr_sobjs) || + ((struct sd_obj_req *)&req->rq)->oid, copies) || is_access_local(req->entry, req->nr_vnodes, - ((struct sd_obj_req *)&req->rq)->cow_oid, sys->nr_sobjs)) { + ((struct sd_obj_req *)&req->rq)->cow_oid, copies)) { int ret = check_epoch(req); if (ret != SD_RES_SUCCESS) { req->rp.result = ret; @@ -1628,3 +1660,20 @@ join_retry: register_event(fd, group_handler, NULL); return 0; } + +/* after this function is called, this node only works as a gateway */ +int leave_cluster(void) +{ + struct leave_message msg; + + memset(&msg, 0, sizeof(msg)); + msg.header.proto_ver = SD_SHEEP_PROTO_VER; + msg.header.op = SD_MSG_LEAVE; + msg.header.state = DM_FIN; + msg.header.msg_length = sizeof(msg); + msg.header.from = sys->this_node; + msg.header.nodeid = sys->this_nodeid; + msg.header.pid = sys->this_pid; + + return send_message(sys->handle, (struct message_header *)&msg); +} diff --git a/sheep/sdnet.c b/sheep/sdnet.c index 0251f71..089e7f6 100644 --- a/sheep/sdnet.c +++ b/sheep/sdnet.c @@ -59,6 +59,9 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes, if (oid == 0) return 0; + if (copies > nr_nodes) + copies = nr_nodes; + for (i = 0; i < copies; i++) { n = obj_to_sheep(e, nr_nodes, oid, i); @@ -82,6 +85,8 @@ static void setup_access_to_local_objects(struct request *req) copies = hdr->copies; if (!copies) copies = sys->nr_sobjs; + if (copies > req->nr_nodes) + copies = req->nr_nodes; if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies)) req->local_oid = hdr->oid; @@ -92,6 +97,10 @@ static void __done(struct work *work, int idx) struct request *req = container_of(work, struct request, work); struct sd_req *hdr = (struct sd_req *)&req->rq; int again = 0; + int copies = sys->nr_sobjs; + + if (copies > req->nr_nodes) + copies = 
req->nr_nodes; switch (hdr->opcode) { case SD_OP_NEW_VDI: @@ -151,6 +160,23 @@ static void __done(struct work *work, int idx) bmap->vdi_id = vdi_id; list_add(&bmap->list, &sys->consistent_obj_list); set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs); + } else if (is_access_local(req->entry, req->nr_vnodes, + ((struct sd_obj_req *)&req->rq)->oid, copies) && + req->rp.result == SD_RES_EIO) { + eprintf("leave from cluster\n"); + leave_cluster(); + + if (req->rq.flags & SD_FLAG_CMD_DIRECT) + /* hack to retry */ + req->rp.result = SD_RES_NETWORK_ERROR; + else { + req->rq.epoch = sys->epoch; + setup_ordered_sd_vnode_list(req); + setup_access_to_local_objects(req); + + list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings); + again = 1; + } } done: resume_pending_requests(); diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 82840de..e0be2cc 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -28,6 +28,7 @@ #define SD_MSG_JOIN 0x01 #define SD_MSG_VDI_OP 0x02 #define SD_MSG_MASTER_CHANGED 0x03 +#define SD_MSG_LEAVE 0x04 #define SD_STATUS_OK 0x00 #define SD_STATUS_WAIT_FOR_FORMAT 0x01 @@ -172,6 +173,7 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes, void resume_pending_requests(void); int create_cluster(int port); +int leave_cluster(void); void start_cpg_event_work(void); void store_queue_request(struct work *work, int idx); diff --git a/sheep/store.c b/sheep/store.c index 5f0de2a..6cb60de 100644 --- a/sheep/store.c +++ b/sheep/store.c @@ -489,9 +489,16 @@ static int ob_open(uint32_t epoch, uint64_t oid, int aflags, int *ret) fd = open(path, flags, def_fmode); if (fd < 0) { eprintf("failed to open %s, %s\n", path, strerror(errno)); - if (errno == ENOENT) + if (errno == ENOENT) { + struct stat s; + *ret = SD_RES_NO_OBJ; - else + if (stat(obj_path, &s) < 0) { + /* store directory is corrupted */ + eprintf("corrupted\n"); + *ret = SD_RES_EIO; + } + } else *ret = SD_RES_UNKNOWN; } else *ret = 0; -- 1.5.6.5 -- sheepdog mailing 
list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
