From: Yunkai Zhang <[email protected]>

V2:
- use '-R, --repair' instead of '-F, --force_repair'
- do not connect to the target sheep directly
- define a member name instead of using __pad.
- update this commit log
---------------------------------------------- >8

Reading all of a vdi's objects from the cluster when checking them wastes a lot
of network bandwidth. Let's calculate the checksum of each object in the backend
and send only the checksum result to the collie client.

I also think repairing objects automatically is dangerous, as we don't know
which replica is correct. To give the user a chance to inspect the copies if
necessary, I add a new option: '-R, --repair'. By default, this command only
checks and does not repair (as the command name implies).

After adding the '-R' flag, the help looks like:
$ collie vdi check
Usage: collie vdi check [-R] [-s snapshot] [-a address] [-p port] [-h] <vdiname>
Options:
  -R, --repair            force repair object's copies (dangerous)
  -s, --snapshot          specify a snapshot id or tag name
  -a, --address           specify the daemon address (default: localhost)
  -p, --port              specify the daemon port
  -h, --help              display this help and exit

Here are some examples of executing this command:
* Success:
$ collie vdi check test.img
CHECKING VDI:test.img ...
PASSED

* Failure (no repair by default):
$ collie vdi check test.img
CHECKING VDI:test.img ...
Failed oid: 9c5e6800000001
>> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000
>> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001
>> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004
FAILED

With the output shown above, the user can check all copies of this object and
decide which one is correct (I plan to add a new option, '--oid', to
'collie vdi read' in another patch, so that the user can specify which copy of
the object is exported:
  $ collie vdi read test.img --oid 9c5e6800000001@127.0.0.1:7001 > foo.img
By testing foo.img, we can find out which copy is correct).

The user can force a repair by specifying the -R or --repair flag:
* Force repair:
$ collie vdi check -R test.img
CHECKING VDI:test.img ...
Failed oid: 9c5e6800000001
>> copy[0], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7000
>> copy[1], sha1: 46dbc769de60a508faf134c6d51926741c0e38fa, from: 127.0.0.1:7001
>> copy[2], sha1: c78ca69c4be7401b6d1f11a37a4cec4226e736cd, from: 127.0.0.1:7004
>> repairing ...
>> copy this object from 127.0.0.1:7000 => 127.0.0.1:7001
>> copy this object from 127.0.0.1:7000 => 127.0.0.1:7004
>> repair finished
REPAIRED

Signed-off-by: Yunkai Zhang <[email protected]>
---
 collie/common.c          |   6 +-
 collie/vdi.c             | 185 ++++++++++++++++++++++++++-------------------
 include/internal_proto.h |  22 +++++-
 include/sheep.h          |  16 ++++
 sheep/farm/farm.h        |   1 -
 sheep/farm/sha1_file.c   |  15 ----
 sheep/gateway.c          |  76 +++++++++++++++++++
 sheep/ops.c              | 191 ++++++++++++++++++++++++++++++++++++++++-------
 sheep/sheep_priv.h       |   4 +
 9 files changed, 390 insertions(+), 126 deletions(-)

diff --git a/collie/common.c b/collie/common.c
index f885c8c..83a2c3d 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -207,8 +207,7 @@ int send_light_req_get_response(struct sd_req *hdr, const 
char *host, int port)
        ret = exec_req(fd, hdr, NULL, &wlen, &rlen);
        close(fd);
        if (ret) {
-               fprintf(stderr, "failed to connect to  %s:%d\n",
-                       host, port);
+               dprintf("failed to connect to  %s:%d\n", host, port);
                return -1;
        }
 
@@ -229,8 +228,7 @@ int send_light_req(struct sd_req *hdr, const char *host, 
int port)
                return -1;
 
        if (ret != SD_RES_SUCCESS) {
-               fprintf(stderr, "Response's result: %s\n",
-                       sd_strerror(ret));
+               dprintf("Response's result: %s\n", sd_strerror(ret));
                return -1;
        }
 
diff --git a/collie/vdi.c b/collie/vdi.c
index d27b5af..b479efd 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -23,6 +23,7 @@ static struct sd_option vdi_options[] = {
        {'x', "exclusive", 0, "write in an exclusive mode"},
        {'d', "delete", 0, "delete a key"},
        {'C', "cache", 0, "enable object cache"},
+       {'R', "repair", 0, "force repair object's copies (dangerous)"},
 
        { 0, NULL, 0, NULL },
 };
@@ -35,6 +36,7 @@ struct vdi_cmd_data {
        int delete;
        int prealloc;
        int cache;
+       int repair;
 } vdi_cmd_data = { ~0, };
 
 struct get_vdi_info {
@@ -1320,126 +1322,143 @@ out:
        return ret;
 }
 
-static void *read_object_from(struct sd_vnode *vnode, uint64_t oid)
+static void get_obj_checksum_from(struct sd_vnode *vnode, uint64_t oid,
+                                 unsigned char *sha1)
 {
        struct sd_req hdr;
-       struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+       struct sd_checksum_rsp *rsp = (struct sd_checksum_rsp *)&hdr;
+       unsigned rlen, wlen;
        int fd, ret;
-       unsigned wlen = 0, rlen = SD_DATA_OBJ_SIZE;
-       char name[128];
-       void *buf;
-
-       buf = malloc(SD_DATA_OBJ_SIZE);
-       if (!buf) {
-               fprintf(stderr, "Failed to allocate memory\n");
-               exit(EXIT_SYSFAIL);
-       }
 
-       addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
-       fd = connect_to(name, vnode->nid.port);
+       fd = connect_to(sdhost, sdport);
        if (fd < 0) {
-               fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
-                       name, vnode->nid.port);
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to connect\n");
+               return;
        }
 
-       sd_init_req(&hdr, SD_OP_READ_PEER);
-       hdr.epoch = sd_epoch;
-       hdr.flags = 0;
-       hdr.data_length = rlen;
+       sd_init_req(&hdr, SD_OP_CALC_CHKSUM);
 
+       rlen = 0;
+       wlen = sizeof(vnode->nid);
+
+       hdr.epoch = sd_epoch;
        hdr.obj.oid = oid;
+       hdr.data_length = wlen;
+       hdr.flags = SD_FLAG_CMD_WRITE;
 
-       ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
+       ret = exec_req(fd, &hdr, &vnode->nid, &wlen, &rlen);
        close(fd);
-
        if (ret) {
-               fprintf(stderr, "Failed to execute request\n");
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid);
+               return;
        }
-
        if (rsp->result != SD_RES_SUCCESS) {
-               fprintf(stderr, "Failed to read, %s\n",
-                       sd_strerror(rsp->result));
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n",
+                       oid, sd_strerror(rsp->result));
+               return;
        }
-       return buf;
+
+       memcpy(sha1, rsp->sha1, SHA1_LEN);
+       return;
 }
 
-static void write_object_to(struct sd_vnode *vnode, uint64_t oid, void *buf)
+static int do_repair(uint64_t oid, struct node_id *src, struct node_id *dest)
 {
        struct sd_req hdr;
        struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+       unsigned rlen, wlen;
+       struct node_id nids[2];
        int fd, ret;
-       unsigned wlen = SD_DATA_OBJ_SIZE, rlen = 0;
-       char name[128];
 
-       addr_to_str(name, sizeof(name), vnode->nid.addr, 0);
-       fd = connect_to(name, vnode->nid.port);
+       fd = connect_to(sdhost, sdport);
        if (fd < 0) {
-               fprintf(stderr, "failed to connect to %s:%"PRIu32"\n",
-                       name, vnode->nid.port);
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to connect\n");
+               return SD_RES_EIO;
        }
 
-       sd_init_req(&hdr, SD_OP_WRITE_PEER);
+       sd_init_req(&hdr, SD_OP_REPAIR_OBJ);
+
+       rlen = 0;
+       wlen = sizeof(nids);
+
        hdr.epoch = sd_epoch;
-       hdr.flags = SD_FLAG_CMD_WRITE;
+       hdr.obj.oid = oid;
        hdr.data_length = wlen;
+       hdr.flags = SD_FLAG_CMD_WRITE;
 
-       hdr.obj.oid = oid;
+       nids[0] = *src;
+       nids[1] = *dest;
 
-       ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
+       ret = exec_req(fd, &hdr, nids, &wlen, &rlen);
        close(fd);
-
        if (ret) {
-               fprintf(stderr, "Failed to execute request\n");
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to repair oid:%"PRIx64"\n", oid);
+               return SD_RES_EIO;
        }
-
        if (rsp->result != SD_RES_SUCCESS) {
-               fprintf(stderr, "Failed to read, %s\n",
-                       sd_strerror(rsp->result));
-               exit(EXIT_FAILURE);
+               fprintf(stderr, "Failed to repair oid:%"PRIx64", %s\n",
+                       oid, sd_strerror(rsp->result));
+               return rsp->result;
        }
+
+       return SD_RES_SUCCESS;
 }
 
-/*
- * Fix consistency of the replica of oid.
- *
- * XXX: The fix is rather dumb, just read the first copy and write it
- * to other replica.
- */
-static void do_check_repair(uint64_t oid, int nr_copies)
+static int do_check_repair(uint64_t oid, int nr_copies)
 {
        struct sd_vnode *tgt_vnodes[nr_copies];
-       void *buf, *buf_cmp;
-       int i;
+       unsigned char sha1[SD_MAX_COPIES][SHA1_LEN];
+       char host[128];
+       int i, j;
 
        oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
-       buf = read_object_from(tgt_vnodes[0], oid);
-       for (i = 1; i < nr_copies; i++) {
-               buf_cmp = read_object_from(tgt_vnodes[i], oid);
-               if (memcmp(buf, buf_cmp, SD_DATA_OBJ_SIZE)) {
-                       free(buf_cmp);
-                       goto fix_consistency;
+       for (i = 0; i < nr_copies; i++)
+               get_obj_checksum_from(tgt_vnodes[i], oid, sha1[i]);
+
+       for (i = 0; i < nr_copies; i++) {
+               for (j = (i + 1); j < nr_copies; j++) {
+                       if (memcmp(sha1[i], sha1[j], SHA1_LEN))
+                               goto diff;
                }
-               free(buf_cmp);
        }
-       free(buf);
-       return;
+       return 0;
 
-fix_consistency:
-       for (i = 1; i < nr_copies; i++)
-               write_object_to(tgt_vnodes[i], oid, buf);
-       fprintf(stdout, "fix %"PRIx64" success\n", oid);
-       free(buf);
+diff:
+       fprintf(stderr, "Failed oid: %"PRIx64"\n", oid);
+       for (i = 0; i < nr_copies; i++) {
+               addr_to_str(host, sizeof(host), tgt_vnodes[i]->nid.addr, 0);
+               fprintf(stderr, ">> copy[%d], sha1: %s, from: %s:%d\n",
+                       i, sha1_to_hex(sha1[i]), host, tgt_vnodes[i]->nid.port);
+       }
+
+       if (!vdi_cmd_data.repair)
+               return -1;
+
+       /*
+        * Force repair the consistency of oid's replica
+        *
+        * FIXME: this fix is rather dumb, it just read the
+        * first copy and write it to other replica,
+        */
+       fprintf(stderr, ">> repairing ...\n");
+       addr_to_str(host, sizeof(host), tgt_vnodes[0]->nid.addr,
+                   tgt_vnodes[0]->nid.port);
+       for (i = 1; i < nr_copies; i++) {
+               char dest[128];
+               addr_to_str(dest, sizeof(dest), tgt_vnodes[i]->nid.addr,
+                           tgt_vnodes[i]->nid.port);
+               fprintf(stderr, ">> copy this object from %s => %s\n",
+                       host, dest);
+               do_repair(oid, &tgt_vnodes[0]->nid, &tgt_vnodes[i]->nid);
+       }
+       fprintf(stderr, ">> repair finished\n");
+       return -1;
 }
 
 static int check_repair_vdi(uint32_t vid)
 {
        struct sheepdog_inode *inode;
-       int ret;
+       int ret, failed = 0;
        uint64_t total, done = 0, oid;
        uint32_t idx = 0, dvid;
 
@@ -1448,22 +1467,29 @@ static int check_repair_vdi(uint32_t vid)
                fprintf(stderr, "Failed to allocate memory\n");
                return EXIT_SYSFAIL;
        }
+
        ret = sd_read_object(vid_to_vdi_oid(vid), inode, SD_INODE_SIZE, 0);
        if (ret != SD_RES_SUCCESS) {
                fprintf(stderr, "Failed to read an inode\n");
                return EXIT_FAILURE;
        }
 
+       if (do_check_repair(vid_to_vdi_oid(vid), inode->nr_copies))
+               failed = 1;
+
        total = inode->vdi_size;
        while(done < total) {
                dvid = inode->data_vdi_id[idx];
                if (dvid) {
                        oid = vid_to_data_oid(dvid, idx);
-                       do_check_repair(oid, inode->nr_copies);
+                       if (do_check_repair(oid, inode->nr_copies))
+                               failed = 1;
                }
                done += SD_DATA_OBJ_SIZE;
                idx++;
        }
+       if (failed)
+               return EXIT_FAILURE;
 
        return EXIT_SUCCESS;
 }
@@ -1482,11 +1508,15 @@ static int vdi_check(int argc, char **argv)
                goto out;
        }
 
+       printf("CHECKING VDI:%s ...\n", vdiname);
        ret = check_repair_vdi(vid);
-       if (ret != EXIT_SUCCESS)
+       if (ret != EXIT_SUCCESS) {
+               printf("%s\n",
+                      vdi_cmd_data.repair ? "REPAIRED" : "FAILED");
                goto out;
+       }
 
-       fprintf(stdout, "finish check&repair %s\n", vdiname);
+       printf("PASSED\n");
        return EXIT_SUCCESS;
 out:
        return ret;
@@ -1519,7 +1549,7 @@ out:
 }
 
 static struct subcommand vdi_cmd[] = {
-       {"check", "<vdiname>", "saph", "check and repair image's consistency",
+       {"check", "<vdiname>", "Rsaph", "check and repair image's consistency",
         NULL, SUBCMD_FLAG_NEED_NODELIST|SUBCMD_FLAG_NEED_THIRD_ARG,
         vdi_check, vdi_options},
        {"create", "<vdiname> <size>", "Paph", "create an image",
@@ -1599,6 +1629,9 @@ static int vdi_parser(int ch, char *opt)
        case 'C':
                vdi_cmd_data.cache = 1;
                break;
+       case 'R':
+               vdi_cmd_data.repair = 1;
+               break;
        }
 
        return 0;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 83d98f1..22a050a 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -61,8 +61,12 @@
 #define SD_OP_REMOVE_PEER    0xA6
 #define SD_OP_SET_CACHE_SIZE 0xA7
 #define SD_OP_ENABLE_RECOVER 0xA8
-#define SD_OP_DISABLE_RECOVER 0xA9
-#define SD_OP_INFO_RECOVER 0xAA
+#define SD_OP_DISABLE_RECOVER  0xA9
+#define SD_OP_INFO_RECOVER     0xAA
+#define SD_OP_CALC_CHKSUM      0xAB
+#define SD_OP_CALC_CHKSUM_PEER 0xAC
+#define SD_OP_REPAIR_OBJ       0xAD
+#define SD_OP_REPAIR_OBJ_PEER  0xAE
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
@@ -167,6 +171,20 @@ struct sd_node_rsp {
        uint64_t        store_free;
 };
 
+struct sd_checksum_rsp {
+       uint8_t         proto_ver;
+       uint8_t         opcode;
+       uint16_t        flags;
+       uint32_t        epoch;
+       uint32_t        id;
+       uint32_t        data_length;
+       uint32_t        result;
+       union {
+               uint8_t         sha1[SHA1_LEN];
+               uint32_t        __pad[7];
+       };
+};
+
 struct node_id {
        uint8_t addr[16];
        uint16_t port;
diff --git a/include/sheep.h b/include/sheep.h
index 719d18f..54b6eb3 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -279,4 +279,20 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, 
int nr,
        return nr_vnodes;
 }
 
+static inline char *sha1_to_hex(const unsigned char *sha1)
+{
+       static char buffer[50];
+       static const char hex[] = "0123456789abcdef";
+       char *buf = buffer;
+       int i;
+
+       for (i = 0; i < SHA1_LEN; i++) {
+               unsigned int val = *sha1++;
+               *buf++ = hex[val >> 4];
+               *buf++ = hex[val & 0xf];
+       }
+       buffer[2 * SHA1_LEN] = 0;
+       return buffer;
+}
+
 #endif
diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h
index 27e65cd..d0b635a 100644
--- a/sheep/farm/farm.h
+++ b/sheep/farm/farm.h
@@ -53,7 +53,6 @@ extern char farm_obj_dir[PATH_MAX];
 extern char *sha1_to_path(const unsigned char *sha1);
 extern int sha1_file_write(unsigned char *buf, unsigned len, unsigned char 
*outsha1);
 extern void *sha1_file_read(const unsigned char *sha1, struct sha1_file_hdr 
*hdr);
-extern char *sha1_to_hex(const unsigned char *sha1);
 extern int get_sha1_hex(const char *hex, unsigned char *sha1);
 extern int sha1_file_try_delete(const unsigned char *sha1);
 
diff --git a/sheep/farm/sha1_file.c b/sheep/farm/sha1_file.c
index b0abc16..3cd4f40 100644
--- a/sheep/farm/sha1_file.c
+++ b/sheep/farm/sha1_file.c
@@ -257,18 +257,3 @@ int get_sha1_hex(const char *hex, unsigned char *sha1)
        }
        return 0;
 }
-
-char *sha1_to_hex(const unsigned char *sha1)
-{
-       static char buffer[50];
-       static const char hex[] = "0123456789abcdef";
-       char *buf = buffer;
-       int i;
-
-       for (i = 0; i < SHA1_LEN; i++) {
-               unsigned int val = *sha1++;
-               *buf++ = hex[val >> 4];
-               *buf++ = hex[val & 0xf];
-       }
-       return buffer;
-}
diff --git a/sheep/gateway.c b/sheep/gateway.c
index bdbd08c..82ef145 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -316,3 +316,79 @@ int gateway_remove_obj(struct request *req)
 {
        return gateway_forward_request(req);
 }
+
+int gateway_calc_obj_chksum(struct request *req)
+{
+       struct sd_req fwdhdr, *hdr = &req->rq;
+       struct sd_checksum_rsp *rsp;
+       struct node_id *nid = (struct node_id *)req->data;
+       char host[128];
+       unsigned int rlen, wlen;
+       int fd, ret;
+
+       addr_to_str(host, sizeof(host), nid->addr, 0);
+
+       fd = connect_to(host, nid->port);
+       if (fd < 0) {
+               dprintf("Failed to connect\n");
+               return SD_RES_EIO;
+       }
+
+       rlen = 0;
+       wlen = 0;
+
+       memcpy(&fwdhdr, hdr, sizeof(fwdhdr));
+       fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode);
+       fwdhdr.flags = 0;
+       fwdhdr.data_length = 0;
+
+       rsp = (struct sd_checksum_rsp *)&fwdhdr;
+       ret = exec_req(fd, &fwdhdr, NULL, &wlen, &rlen);
+       close(fd);
+       if (ret)
+               return SD_RES_EIO;
+
+       if (rsp->result == SD_RES_SUCCESS) {
+               memcpy((struct sd_checksum_rsp *)&req->rp, rsp, sizeof(*rsp));
+               return SD_RES_SUCCESS;
+       }
+
+       return rsp->result;
+}
+
+/*
+ * Forward a REPAIR_OBJ request to the destination node as its peer opcode.
+ * req->data holds two node_ids: the source replica (index 0) followed by
+ * the destination (index 1); only the source id is sent on as payload.
+ */
+int gateway_repair_obj(struct request *req)
+{
+       struct sd_req fwdhdr, *hdr = &req->rq;
+       /* the reply is read back through the header passed to exec_req() */
+       struct sd_rsp *rsp = (struct sd_rsp *)&fwdhdr;
+       unsigned rlen, wlen;
+       struct node_id *src, *dest;
+       char host[128];
+       int fd, ret;
+
+       src = (struct node_id *)req->data;
+       dest = src + 1;
+       addr_to_str(host, sizeof(host), dest->addr, 0);
+       fd = connect_to(host, dest->port);
+       if (fd < 0) {
+               dprintf("Failed to connect\n");
+               return SD_RES_EIO;
+       }
+
+       rlen = 0;
+       wlen = sizeof(*src);
+       memcpy(&fwdhdr, hdr, sizeof(fwdhdr));
+       fwdhdr.opcode = gateway_to_peer_opcode(hdr->opcode);
+       fwdhdr.data_length = wlen;
+       fwdhdr.flags = SD_FLAG_CMD_WRITE;
+       ret = exec_req(fd, &fwdhdr, src, &wlen, &rlen);
+       close(fd);
+       if (ret)
+               return SD_RES_EIO;
+       return rsp->result;
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index 8ca8748..d0aac31 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -26,6 +26,7 @@
 #include "strbuf.h"
 #include "trace/trace.h"
 #include "util.h"
+#include "sha1.h"
 
 enum sd_op_type {
        SD_OP_TYPE_CLUSTER = 1, /* cluster operations */
@@ -625,6 +626,51 @@ static int local_kill_node(const struct sd_req *req, 
struct sd_rsp *rsp,
        return SD_RES_SUCCESS;
 }
 
+static inline int read_object_from(struct node_id *src, uint32_t epoch,
+                                  uint64_t oid, char *buf)
+{
+       struct sd_req hdr;
+       struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+       int fd, ret;
+       unsigned wlen, rlen;
+       char host[128];
+
+       addr_to_str(host, sizeof(host), src->addr, 0);
+
+       fd = connect_to(host, src->port);
+       if (fd < 0) {
+               dprintf("failed to connect to %s:%"PRIu32"\n",
+                       host, src->port);
+               return SD_RES_NETWORK_ERROR;
+       }
+
+       rlen = SD_DATA_OBJ_SIZE;
+       if (is_vdi_obj(oid))
+               rlen = SD_INODE_SIZE;
+       wlen = 0;
+
+       sd_init_req(&hdr, SD_OP_READ_PEER);
+       hdr.epoch = epoch;
+       hdr.data_length = rlen;
+
+       hdr.obj.oid = oid;
+       hdr.obj.offset = 0;
+
+       ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
+       close(fd);
+       if (ret) {
+               dprintf("Failed to execute request\n");
+               return SD_RES_NETWORK_ERROR;
+       }
+
+       if (rsp->result != SD_RES_SUCCESS) {
+               dprintf("Failed to read, %s\n", sd_strerror(rsp->result));
+               return SD_RES_NETWORK_ERROR;
+       }
+
+       return SD_RES_SUCCESS;
+}
+
 static int read_copy_from_replica(struct vnode_info *vnodes, uint32_t epoch,
                                  uint64_t oid, char *buf)
 {
@@ -666,9 +712,6 @@ static int read_copy_from_replica(struct vnode_info 
*vnodes, uint32_t epoch,
        rounded_rand = random() % nr_copies;
 
        for (i = 0; i < nr_copies; i++) {
-               unsigned wlen, rlen;
-               int fd;
-
                j = (i + rounded_rand) % nr_copies;
 
                /* bypass the local copy */
@@ -676,32 +719,9 @@ static int read_copy_from_replica(struct vnode_info 
*vnodes, uint32_t epoch,
                        continue;
 
                v = obj_vnodes[j];
-               addr_to_str(name, sizeof(name), v->nid.addr, 0);
-
-               fd = connect_to(name, v->nid.port);
-               if (fd < 0)
-                       continue;
-
-               rlen = SD_DATA_OBJ_SIZE;
-               wlen = 0;
 
-               sd_init_req(&hdr, SD_OP_READ_PEER);
-               hdr.epoch = epoch;
-               hdr.data_length = rlen;
-
-               hdr.obj.oid = oid;
-               hdr.obj.offset = 0;
-
-               ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
-
-               close(fd);
-
-               if (ret) {
-                       dprintf("%x, %x\n", ret, rsp->result);
-                       continue;
-               }
-
-               if (rsp->result == SD_RES_SUCCESS)
+               ret = read_object_from(&v->nid, epoch, oid, buf);
+               if (ret == SD_RES_SUCCESS)
                        break;
        }
 
@@ -748,6 +768,50 @@ out:
        return ret;
 }
 
+/*
+ * Read the whole local object named in the request and place its SHA1 in
+ * the sd_checksum_rsp overlaid on req->rp; no data payload is returned.
+ */
+int peer_calc_obj_chksum(struct request *req)
+{
+       struct sd_req *hdr = &req->rq;
+       struct sd_checksum_rsp *rsp = (struct sd_checksum_rsp *)&req->rp;
+       uint32_t epoch = hdr->epoch;
+       unsigned char sha1[SHA1_LEN];
+       struct siocb iocb;
+       struct sha1_ctx ctx;
+       void *buf;
+       int ret = SD_RES_NO_SPACE;      /* reported if valloc() fails */
+
+       if (sys->gateway_only)
+               return SD_RES_NO_OBJ;
+       hdr->data_length = SD_DATA_OBJ_SIZE;
+       if (is_vdi_obj(hdr->obj.oid))
+               hdr->data_length = SD_INODE_SIZE;
+       hdr->obj.offset = 0;
+       buf = valloc(hdr->data_length);
+       if (!buf)
+               goto out;       /* never hand a NULL buffer to the store */
+       memset(&iocb, 0, sizeof(iocb));
+       iocb.epoch = epoch;
+       iocb.flags = hdr->flags;
+       iocb.buf = buf;
+       iocb.length = hdr->data_length;
+       iocb.offset = hdr->obj.offset;
+       ret = sd_store->read(hdr->obj.oid, &iocb);
+       if (ret != SD_RES_SUCCESS)
+               goto out;
+       sha1_init(&ctx);
+       sha1_update(&ctx, buf, hdr->data_length);
+       sha1_final(&ctx, sha1);
+       memcpy(&rsp->sha1, sha1, SHA1_LEN);
+out:
+       req->data = NULL;
+       req->data_length = 0;
+       hdr->data_length = 0;
+       rsp->data_length = 0;
+       free(buf);
+       return ret;
+}
+
 static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
                void *data, int create)
 {
@@ -777,6 +841,50 @@ static int do_write_obj(struct siocb *iocb, struct sd_req 
*hdr, uint32_t epoch,
        return ret;
 }
 
+/*
+ * Fetch an object from the source replica named in req->data and store it
+ * locally through do_write_obj() with the create flag set.
+ */
+int peer_repair_obj(struct request *req)
+{
+       struct sd_req whdr, *hdr = &req->rq;
+       uint32_t epoch = hdr->epoch;
+       uint64_t oid = hdr->obj.oid;
+       struct node_id *src;
+       struct siocb iocb;
+       char host[128];
+       int ret, len;
+       void *buf;
+
+       if (sys->gateway_only)
+               return SD_RES_NO_OBJ;
+
+       src = (struct node_id *)req->data;
+       addr_to_str(host, sizeof(host), src->addr, src->port);
+       dprintf("oid:%"PRIx64", from:%s\n", oid, host);
+
+       len = is_vdi_obj(oid) ? SD_INODE_SIZE : SD_DATA_OBJ_SIZE;
+       buf = valloc(len);
+       if (!buf) {
+               eprintf("can not allocate memory\n");
+               return SD_RES_NO_SPACE;
+       }
+
+       ret = read_object_from(src, epoch, oid, buf);
+       if (ret != SD_RES_SUCCESS)
+               goto out;
+       memset(&whdr, 0, sizeof(whdr));         /* also zeroes obj.offset */
+       memset(&iocb, 0, sizeof(iocb));
+       whdr.data_length = len;
+       whdr.obj.oid = oid;
+       whdr.epoch = epoch;
+       iocb.epoch = epoch;
+       ret = do_write_obj(&iocb, &whdr, epoch, buf, 1);
+out:
+       free(buf);      /* assumes do_write_obj() does not retain buf */
+       return ret;
+}
+
 int peer_write_obj(struct request *req)
 {
        struct sd_req *hdr = &req->rq;
@@ -1084,11 +1192,36 @@ static struct sd_op_template sd_ops[] = {
                .type = SD_OP_TYPE_CLUSTER,
                .process_main = cluster_disable_recover,
        },
+
        [SD_OP_INFO_RECOVER] = {
                .name = "INFO_RECOVER",
                .type = SD_OP_TYPE_LOCAL,
                .process_main = local_info_recover,
        },
+
+       [SD_OP_CALC_CHKSUM] = {
+               .name = "CALC_CHKSUM",
+               .type = SD_OP_TYPE_GATEWAY,
+               .process_work = gateway_calc_obj_chksum,
+       },
+
+       [SD_OP_CALC_CHKSUM_PEER] = {
+               .name = "CALC_CHKSUM_PEER",
+               .type = SD_OP_TYPE_PEER,
+               .process_work = peer_calc_obj_chksum,
+       },
+
+       [SD_OP_REPAIR_OBJ] = {
+               .name = "REPAIR_OBJ",
+               .type = SD_OP_TYPE_GATEWAY,
+               .process_work = gateway_repair_obj,
+       },
+
+       [SD_OP_REPAIR_OBJ_PEER] = {
+               .name = "REPAIR_OBJ_PEER",
+               .type = SD_OP_TYPE_PEER,
+               .process_work = peer_repair_obj,
+       },
 };
 
 struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1174,6 +1307,8 @@ static int map_table[] = {
        [SD_OP_READ_OBJ] = SD_OP_READ_PEER,
        [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
        [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+       [SD_OP_CALC_CHKSUM] = SD_OP_CALC_CHKSUM_PEER,
+       [SD_OP_REPAIR_OBJ] = SD_OP_REPAIR_OBJ_PEER,
 };
 
 int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 857cf87..48e076c 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -319,12 +319,16 @@ int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
 int gateway_create_and_write_obj(struct request *req);
 int gateway_remove_obj(struct request *req);
+int gateway_calc_obj_chksum(struct request *req);
+int gateway_repair_obj(struct request *req);
 
 /* backend store */
 int peer_read_obj(struct request *req);
 int peer_write_obj(struct request *req);
 int peer_create_and_write_obj(struct request *req);
 int peer_remove_obj(struct request *req);
+int peer_calc_obj_chksum(struct request *req);
+int peer_repair_obj(struct request *req);
 
 /* object_cache */
 
-- 
1.7.11.2

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to