On 2014-06-16 18:26, Liu Yuan wrote:
On Mon, Jun 16, 2014 at 04:58:19PM +0800, Ruoyu wrote:
In the object list preparation stage, every node fetches the entire
object list from every node in the cluster and then screens out the
objects that don't belong to itself. This behavior transfers quite a
lot of unnecessary object IDs over the network.

This patch resolves the issue by:
1. fetching only the objects that belong to the requesting node;
2. not sending the get-object-list request to the local node itself;
3. no longer screening out objects, which step 1 makes unnecessary;
    merging the object lists fetched from different nodes, however,
    is still essential.

Suppose three nodes a, b, c with 3 copies and one object N. Then

a tries to get the object list from b and c, and gets N from both b and c,
because N belongs to this node. So we'll have a duplicate N if you don't
screen out the objects.
I mean there is no need to screen out objects that don't belong to this
node. Screening out duplicated objects, that is, merging, is still needed.
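
To make the merge step concrete, here is a minimal standalone sketch
(an illustration, not part of the patch): plain bsearch()/qsort() stand
in for sheepdog's xbsearch()/obj_cmp, each node's incoming list is
assumed duplicate-free, and oid 3, fetched from both b and c, is
appended only once.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int obj_cmp(const void *a, const void *b)
{
        uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

        return x < y ? -1 : x > y ? 1 : 0;
}

/* Append oids[0..nr) to list[0..*count), skipping oids already merged.
 * Only the previously merged prefix is searched, as in the patch. */
static void merge_oids(uint64_t *list, size_t *count,
                       const uint64_t *oids, size_t nr)
{
        size_t old_count = *count;

        for (size_t i = 0; i < nr; i++) {
                if (bsearch(&oids[i], list, old_count,
                            sizeof(uint64_t), obj_cmp))
                        continue; /* already scheduled to be recovered */
                list[(*count)++] = oids[i];
        }
        /* keep the list sorted so the next call can bsearch it */
        qsort(list, *count, sizeof(uint64_t), obj_cmp);
}

int main(void)
{
        uint64_t list[8];
        size_t count = 0;
        /* object N (oid 3) is held by both b and c */
        uint64_t from_b[] = { 1, 3 }, from_c[] = { 3, 5 };

        merge_oids(list, &count, from_b, 2);
        merge_oids(list, &count, from_c, 2);
        for (size_t i = 0; i < count; i++)
                printf("%llu\n", (unsigned long long)list[i]);
        return 0; /* prints 1, 3, 5 -- no duplicate 3 */
}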

Signed-off-by: Ruoyu <lian...@ucweb.com>
---
  include/sheepdog_proto.h  |  4 ++++
  sheep/object_list_cache.c | 38 +++++++++++++++++++++++++++++++++-----
  sheep/recovery.c          | 41 ++++++++++++++++++-----------------------
  sheep/sheep_priv.h        |  6 ++++++
  4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 8b5834b..76fad51 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -176,6 +176,10 @@ struct sd_req {
                        uint32_t        generation;
                        uint32_t        count;
                } ref;
+               struct {
+                       uint8_t         addr[16];
+                       uint16_t        port;
+               } node_addr;
uint32_t __pad[8];
        };
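
A quick side check on the new field (a standalone sketch, not from the
patch): the anonymous union in sd_req is padded by uint32_t __pad[8],
i.e. 32 bytes, so the added member must not exceed that. Assuming a
stand-in struct with the same layout, C11's static_assert can verify it:

#include <assert.h>
#include <stdint.h>

/* stand-in for the anonymous node_addr member added above */
struct node_addr {
        uint8_t  addr[16];      /* 16 bytes: IPv6, or an embedded IPv4 */
        uint16_t port;
};

/* 16 + 2 bytes (18 after alignment) fits in the 32-byte pad */
static_assert(sizeof(struct node_addr) <= 8 * sizeof(uint32_t),
              "node_addr must fit in the sd_req union pad");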
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index eefa40a..9a7073d 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -97,8 +97,17 @@ int objlist_cache_insert(uint64_t oid)
int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
  {
-       int nr = 0;
+       int i = 0, j, copies;
        struct objlist_cache_entry *entry;
+       struct node_id peer_nid;
+       struct request *req = container_of(hdr, struct request, rq);
+       struct vnode_info *peer_vinfo = req->vinfo;
+       const struct sd_vnode *vnodes[SD_MAX_COPIES];
+       int last = 0, end = 4096;
+       uint64_t *oids = xmalloc(end * sizeof(uint64_t));
+
+       memcpy(peer_nid.addr, hdr->node_addr.addr, sizeof(peer_nid.addr));
+       peer_nid.port = hdr->node_addr.port;
/* first try getting the cached buffer with only a read lock held */
        sd_read_lock(&obj_list_cache.lock);
@@ -116,19 +125,38 @@ int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
                                obj_list_cache.cache_size * sizeof(uint64_t));
rb_for_each_entry(entry, &obj_list_cache.root, node) {
-               obj_list_cache.buf[nr++] = entry->oid;
+               obj_list_cache.buf[i++] = entry->oid;
        }
out:
-       if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
+       /* Screen out objects that don't belong to that node */
+       for (i = 0; i < obj_list_cache.cache_size; i++) {
+               copies = get_obj_copy_number(obj_list_cache.buf[i],
+                               peer_vinfo->nr_zones);
+               oid_to_vnodes(obj_list_cache.buf[i],
+                               &peer_vinfo->vroot, copies, vnodes);
+               for (j = 0; j < copies; j++) {
+                       if (!vnode_is_peer(vnodes[j], &peer_nid))
+                               continue;
+                       oids[last++] = obj_list_cache.buf[i];
+                       if (last >= end) {
+                               end *= 2;
+                               oids = xrealloc(oids, end * sizeof(uint64_t));
+                       }
+               }
+       }
+
+       if (hdr->data_length < last * sizeof(uint64_t)) {
                sd_rw_unlock(&obj_list_cache.lock);
                sd_err("GET_OBJ_LIST buffer too small");
+               free(oids);
                return SD_RES_BUFFER_SMALL;
        }
-       rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
-       memcpy(data, obj_list_cache.buf, rsp->data_length);
+       rsp->data_length = last * sizeof(uint64_t);
+       memcpy(data, oids, rsp->data_length);
        sd_rw_unlock(&obj_list_cache.lock);
+       free(oids);
        return SD_RES_SUCCESS;
  }
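
For reference, the screen-and-grow pattern used in get_obj_list() above,
as a standalone sketch: start with room for 4096 oids and double the
capacity whenever the output fills up. belongs_to_peer() is a
hypothetical stand-in for the get_obj_copy_number()/oid_to_vnodes()/
vnode_is_peer() sequence, and plain malloc()/realloc() replace
xmalloc()/xrealloc().

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* hypothetical stand-in: does any replica vnode of oid map to the
 * requesting peer?  The real code walks the oid_to_vnodes() output
 * and calls vnode_is_peer() on each entry. */
static bool belongs_to_peer(uint64_t oid)
{
        return oid % 2 == 0; /* placeholder predicate */
}

/* Copy the oids owned by the peer out of all[0..nr), growing the
 * output buffer by doubling, as get_obj_list() does. */
static uint64_t *screen_oids(const uint64_t *all, size_t nr, size_t *out_nr)
{
        size_t last = 0, end = 4096;
        uint64_t *oids = malloc(end * sizeof(uint64_t));

        if (!oids)
                return NULL;
        for (size_t i = 0; i < nr; i++) {
                if (!belongs_to_peer(all[i]))
                        continue;
                oids[last++] = all[i];
                if (last >= end) { /* full: double the capacity */
                        uint64_t *p;

                        end *= 2;
                        p = realloc(oids, end * sizeof(uint64_t));
                        if (!p) {
                                free(oids);
                                return NULL;
                        }
                        oids = p;
                }
        }
        *out_nr = last;
        return oids;
}

int main(void)
{
        uint64_t all[] = { 1, 2, 3, 4 };
        size_t nr;
        uint64_t *mine = screen_oids(all, 4, &nr); /* -> { 2, 4 } */

        free(mine);
        return nr == 2 ? 0 : 1;
}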
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4648966..dfd27bb 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -978,6 +978,9 @@ retry:
        sd_init_req(&hdr, SD_OP_GET_OBJ_LIST);
        hdr.data_length = buf_size;
        hdr.epoch = epoch;
+       memcpy(hdr.node_addr.addr, sys->this_node.nid.addr,
+                       sizeof(hdr.node_addr.addr));
+       hdr.node_addr.port = sys->this_node.nid.port;
        ret = sheep_exec_req(&e->nid, &hdr, buf);
switch (ret) {
@@ -997,40 +1000,28 @@ retry:
        }
*nr_oids = rsp->data_length / sizeof(uint64_t);
-       sd_debug("%zu", *nr_oids);
+       sd_debug("%s: %zu objects to be fetched",
+                       addr_to_str(e->nid.addr, e->nid.port), *nr_oids);
        return buf;
  }
-/* Screen out objects that don't belong to this node */
-static void screen_object_list(struct recovery_list_work *rlw,
+/* Merge object lists fetched from other nodes */
+static void merge_object_list(struct recovery_list_work *rlw,
                               uint64_t *oids, size_t nr_oids)
  {
-       struct recovery_work *rw = &rlw->base;
-       const struct sd_vnode *vnodes[SD_MAX_COPIES];
        uint64_t old_count = rlw->count;
-       uint64_t nr_objs;
-       uint64_t i, j;
+       uint64_t i;
for (i = 0; i < nr_oids; i++) {
                if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp))
                        /* the object is already scheduled to be recovered */
                        continue;
-               nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
-
-               oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
-               for (j = 0; j < nr_objs; j++) {
-                       if (!vnode_is_local(vnodes[j]))
-                               continue;
-
-                       rlw->oids[rlw->count++] = oids[i];
-                       /* enlarge the list buffer if full */
-                       if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
-                               list_buffer_size *= 2;
-                               rlw->oids = xrealloc(rlw->oids,
-                                                    list_buffer_size);
-                       }
-                       break;
+               rlw->oids[rlw->count++] = oids[i];
+               /* enlarge the list buffer if full */
+               if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
+                       list_buffer_size *= 2;
+                       rlw->oids = xrealloc(rlw->oids, list_buffer_size);
                }
        }
@@ -1064,6 +1055,10 @@ again:
                size_t nr_oids;
                struct sd_node *node = nodes + i;
+               /* No need to fetch from local node */
+               if (node_is_local(node))
+                       continue;
We have to fetch from the local node too, because we need to get the
complete object list.

Suppose we have 2 nodes and 2 objects with 1 copy:

a[O1], b[O2]. If a doesn't try to get the object list from itself, O1 will
be lost, and b will lose O2 for the same reason.
I wonder why a[O1] would be lost, since it is still on node a.

Thanks
Yuan


--
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog
