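This patch handles local disk crashes and file system unmounts that
occur while Sheepdog is running.  When a local object access fails with
SD_RES_EIO, the node sends an SD_MSG_LEAVE message and leaves the
cluster; after that it only works as a gateway.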

Signed-off-by: MORITA Kazutaka <[email protected]>
---
 sheep/group.c      |   55 +++++++++++++++++++++++++++++++++++++++++++++++++--
 sheep/sdnet.c      |   26 ++++++++++++++++++++++++
 sheep/sheep_priv.h |    2 +
 sheep/store.c      |   11 ++++++++-
 4 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 6deb4a2..cb693ff 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -65,6 +65,10 @@ struct join_message {
        } nodes[SD_MAX_NODES];
 };
 
+struct leave_message {
+       struct message_header header; /* only field: a leave message has no payload beyond the header */
+};
+
 struct vdi_op_message {
        struct message_header header;
        struct sd_vdi_req req;
@@ -841,6 +845,9 @@ static void __sd_deliver_done(struct cpg_event *cevent)
        struct message_header *m;
        char name[128];
        int do_recovery;
+       struct node *node;
+       struct sheepdog_node_list_entry e[SD_MAX_NODES];
+       int nr;
 
        m = w->msg;
 
@@ -849,13 +856,33 @@ static void __sd_deliver_done(struct cpg_event *cevent)
                case SD_MSG_JOIN:
                        update_cluster_info((struct join_message *)m);
                        break;
+               case SD_MSG_LEAVE:
+                       node = find_node(&sys->sd_node_list, m->nodeid, m->pid);
+                       if (node) {
+                               sys->nr_vnodes = 0;
+
+                               list_del(&node->list);
+                               free(node);
+                               if (sys->status == SD_STATUS_OK) {
+                                       nr = get_ordered_sd_node_list(e);
+                                       dprintf("update epoch, %d, %d\n", 
sys->epoch + 1, nr);
+                                       epoch_log_write(sys->epoch + 1, (char 
*)e,
+                                                       nr * sizeof(struct 
sheepdog_node_list_entry));
+
+                                       sys->epoch++;
+
+                                       update_epoch_store(sys->epoch);
+                               }
+                       }
+                       break;
                default:
                        eprintf("unknown message %d\n", m->op);
                        break;
                }
        }
 
-       do_recovery = (m->state == DM_FIN && m->op == SD_MSG_JOIN);
+       do_recovery = (m->state == DM_FIN &&
+                      (m->op == SD_MSG_JOIN || m->op == SD_MSG_LEAVE));
 
        dprintf("op: %d, state: %u, size: %d, from: %s\n",
                m->op, m->state, m->msg_length,
@@ -1364,6 +1391,11 @@ do_retry:
                list_del(&cevent->cpg_event_list);
 
                if (is_io_request(req->rq.opcode)) {
+                       int copies = sys->nr_sobjs;
+
+                       if (copies > req->nr_nodes)
+                               copies = req->nr_nodes;
+
                        if (__is_access_to_recoverying_objects(req)) {
                                if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
                                        req->rp.result = SD_RES_NEW_NODE_VER;
@@ -1383,9 +1415,9 @@ do_retry:
                        sys->nr_outstanding_io++;
 
                        if (is_access_local(req->entry, req->nr_vnodes,
-                                           ((struct sd_obj_req 
*)&req->rq)->oid, sys->nr_sobjs) ||
+                                           ((struct sd_obj_req 
*)&req->rq)->oid, copies) ||
                            is_access_local(req->entry, req->nr_vnodes,
-                                           ((struct sd_obj_req 
*)&req->rq)->cow_oid, sys->nr_sobjs)) {
+                                           ((struct sd_obj_req 
*)&req->rq)->cow_oid, copies)) {
                                int ret = check_epoch(req);
                                if (ret != SD_RES_SUCCESS) {
                                        req->rp.result = ret;
@@ -1628,3 +1660,20 @@ join_retry:
        register_event(fd, group_handler, NULL);
        return 0;
 }
+
+/* Build an SD_MSG_LEAVE message identifying this node and pass it to send_message(), whose result is returned; after this function is called, this node only works as a gateway */
+int leave_cluster(void)
+{
+       struct leave_message msg;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.header.proto_ver = SD_SHEEP_PROTO_VER;
+       msg.header.op = SD_MSG_LEAVE;
+       msg.header.state = DM_FIN;
+       msg.header.msg_length = sizeof(msg);
+       msg.header.from = sys->this_node;
+       msg.header.nodeid = sys->this_nodeid;
+       msg.header.pid = sys->this_pid;
+
+       return send_message(sys->handle, (struct message_header *)&msg);
+}
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 0251f71..089e7f6 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -59,6 +59,9 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int 
nr_nodes,
        if (oid == 0)
                return 0;
 
+       if (copies > nr_nodes)
+               copies = nr_nodes;
+
        for (i = 0; i < copies; i++) {
                n = obj_to_sheep(e, nr_nodes, oid, i);
 
@@ -82,6 +85,8 @@ static void setup_access_to_local_objects(struct request *req)
        copies = hdr->copies;
        if (!copies)
                copies = sys->nr_sobjs;
+       if (copies > req->nr_nodes)
+               copies = req->nr_nodes;
 
        if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies))
                req->local_oid = hdr->oid;
@@ -92,6 +97,10 @@ static void __done(struct work *work, int idx)
        struct request *req = container_of(work, struct request, work);
        struct sd_req *hdr = (struct sd_req *)&req->rq;
        int again = 0;
+       int copies = sys->nr_sobjs;
+
+       if (copies > req->nr_nodes)
+               copies = req->nr_nodes;
 
        switch (hdr->opcode) {
        case SD_OP_NEW_VDI:
@@ -151,6 +160,23 @@ static void __done(struct work *work, int idx)
                        bmap->vdi_id = vdi_id;
                        list_add(&bmap->list, &sys->consistent_obj_list);
                        set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs);
+               } else if (is_access_local(req->entry, req->nr_vnodes,
+                                          ((struct sd_obj_req 
*)&req->rq)->oid, copies) &&
+                          req->rp.result == SD_RES_EIO) {
+                       eprintf("leave from cluster\n");
+                       leave_cluster();
+
+                       if (req->rq.flags & SD_FLAG_CMD_DIRECT)
+                               /* hack to retry */
+                               req->rp.result = SD_RES_NETWORK_ERROR;
+                       else {
+                               req->rq.epoch = sys->epoch;
+                               setup_ordered_sd_vnode_list(req);
+                               setup_access_to_local_objects(req);
+
+                               list_add_tail(&cevent->cpg_event_list, 
&sys->cpg_event_siblings);
+                               again = 1;
+                       }
                }
 done:
                resume_pending_requests();
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 82840de..e0be2cc 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -28,6 +28,7 @@
 #define SD_MSG_JOIN             0x01
 #define SD_MSG_VDI_OP           0x02
 #define SD_MSG_MASTER_CHANGED   0x03
+#define SD_MSG_LEAVE            0x04
 
 #define SD_STATUS_OK                0x00
 #define SD_STATUS_WAIT_FOR_FORMAT   0x01
@@ -172,6 +173,7 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, 
int nr_nodes,
 void resume_pending_requests(void);
 
 int create_cluster(int port);
+int leave_cluster(void);
 
 void start_cpg_event_work(void);
 void store_queue_request(struct work *work, int idx);
diff --git a/sheep/store.c b/sheep/store.c
index 5f0de2a..6cb60de 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -489,9 +489,16 @@ static int ob_open(uint32_t epoch, uint64_t oid, int 
aflags, int *ret)
        fd = open(path, flags, def_fmode);
        if (fd < 0) {
                eprintf("failed to open %s, %s\n", path, strerror(errno));
-               if (errno == ENOENT)
+               if (errno == ENOENT) {
+                       struct stat s;
+
                        *ret = SD_RES_NO_OBJ;
-               else
+                       if (stat(obj_path, &s) < 0) {
+                               /* store directory is corrupted */
+                               eprintf("corrupted\n");
+                               *ret = SD_RES_EIO;
+                       }
+               } else
                        *ret = SD_RES_UNKNOWN;
        } else
                *ret = 0;
-- 
1.5.6.5

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to