On Tue, Jun 17, 2014 at 06:08:40PM +0800, Ruoyu wrote:
> In the stage of preparing the object list, every node fetches the
> entire object list from every node of the cluster, and then screens
> out the objects that do not belong to it. This transfers a large
> number of unnecessary object IDs over the network, since each node
> keeps only the small fraction of every received list that maps back
> to itself.
> 
> This patch resolves the issue as follows:
> 1. The receiver node prepares only the objects that belong to the
>    sender node.
> 2. The sender node no longer screens out objects, because that is
>    already done by the receiver; it only needs to merge the object
>    lists fetched from the different nodes (both steps are sketched
>    below).
> 
> Signed-off-by: Ruoyu <lian...@ucweb.com>
> ---
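> 
> For illustration, here is a distilled sketch of the receiver-side
> filter implemented in get_obj_list() below. belongs_to_peer() is a
> placeholder for the real lookup chain (get_obj_copy_number() +
> oid_to_vnodes() + vnode_is_peer()); the growable result buffer
> mirrors the patch:
> 
>     #include <stdbool.h>
>     #include <stdint.h>
>     #include <stdlib.h>
> 
>     /* Placeholder: does this oid map to the requesting peer? */
>     static bool belongs_to_peer(uint64_t oid)
>     {
>             return oid % 3 == 0;    /* toy stand-in for vnode lookup */
>     }
> 
>     /* Keep only the oids that belong to the peer; caller frees. */
>     static uint64_t *filter_for_peer(const uint64_t *cache, size_t nr,
>                                      size_t *nr_out)
>     {
>             size_t last = 0, end = 4096;
>             uint64_t *oids = malloc(end * sizeof(uint64_t));
> 
>             if (!oids)
>                     return NULL;
>             for (size_t i = 0; i < nr; i++) {
>                     if (!belongs_to_peer(cache[i]))
>                             continue;
>                     oids[last++] = cache[i];
>                     if (last >= end) {  /* double the buffer if full */
>                             uint64_t *grown;
> 
>                             end *= 2;
>                             grown = realloc(oids, end * sizeof(uint64_t));
>                             if (!grown) {
>                                     free(oids);
>                                     return NULL;
>                             }
>                             oids = grown;
>                     }
>             }
>             *nr_out = last;
>             return oids;
>     }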
>  include/sheepdog_proto.h  |  4 ++++
>  sheep/object_list_cache.c | 38 +++++++++++++++++++++++++++++++++-----
>  sheep/recovery.c          | 37 ++++++++++++++-----------------------
>  sheep/sheep_priv.h        |  6 ++++++
>  4 files changed, 57 insertions(+), 28 deletions(-)
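> 
> And a sketch of the sender-side merge that replaces the screening in
> recovery.c: only the first old_nr entries of the list are sorted, so
> duplicates are looked up there with a binary search, as
> merge_object_list() does with xbsearch(). Buffer growth is elided
> here, and the caller is assumed to have reserved enough room:
> 
>     #include <stdint.h>
>     #include <stdlib.h>
> 
>     static int obj_cmp(const void *a, const void *b)
>     {
>             const uint64_t x = *(const uint64_t *)a;
>             const uint64_t y = *(const uint64_t *)b;
> 
>             return x < y ? -1 : x > y;
>     }
> 
>     /* Append oids fetched from one node, skipping those already
>      * scheduled for recovery.  Returns the new entry count. */
>     static size_t merge_oids(uint64_t *list, size_t count, size_t old_nr,
>                              const uint64_t *oids, size_t nr_oids)
>     {
>             for (size_t i = 0; i < nr_oids; i++) {
>                     if (bsearch(&oids[i], list, old_nr,
>                                 sizeof(uint64_t), obj_cmp))
>                             continue;   /* duplicate, skip */
>                     list[count++] = oids[i];
>             }
>             return count;
>     }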
> 
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 8b5834b..76fad51 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -176,6 +176,10 @@ struct sd_req {
>                       uint32_t        generation;
>                       uint32_t        count;
>               } ref;
> +             struct {
> +                     uint8_t         addr[16];
> +                     uint16_t        port;
> +             } node_addr;
>  
>               uint32_t                __pad[8];
>       };
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index eefa40a..9a7073d 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -97,8 +97,17 @@ int objlist_cache_insert(uint64_t oid)
>  
>  int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>  {
> -     int nr = 0;
> +     int i = 0, j, copies;
>       struct objlist_cache_entry *entry;
> +     struct node_id peer_nid;
> +     struct request *req = container_of(hdr, struct request, rq);
> +     struct vnode_info *peer_vinfo = req->vinfo;
> +     const struct sd_vnode *vnodes[SD_MAX_COPIES];
> +     int last = 0, end = 4096;
> +     uint64_t *oids = xmalloc(end * sizeof(uint64_t));
> +
> +     memcpy(peer_nid.addr, hdr->node_addr.addr, sizeof(peer_nid.addr));
> +     peer_nid.port = hdr->node_addr.port;
>  
>       /* first try getting the cached buffer with only a read lock held */
>       sd_read_lock(&obj_list_cache.lock);
> @@ -116,19 +125,38 @@ int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data)
>                               obj_list_cache.cache_size * sizeof(uint64_t));
>  
>       rb_for_each_entry(entry, &obj_list_cache.root, node) {
> -             obj_list_cache.buf[nr++] = entry->oid;
> +             obj_list_cache.buf[i++] = entry->oid;
>       }
>  
>  out:
> -     if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
> +     /* Screen out objects that don't belong to that node */
> +     for (i = 0; i < obj_list_cache.cache_size; i++) {
> +             copies = get_obj_copy_number(obj_list_cache.buf[i],
> +                             peer_vinfo->nr_zones);
> +             oid_to_vnodes(obj_list_cache.buf[i],
> +                             &peer_vinfo->vroot, copies, vnodes);
> +             for (j = 0; j < copies; j++) {
> +                     if (!vnode_is_peer(vnodes[j], &peer_nid))
> +                             continue;
> +                     oids[last++] = obj_list_cache.buf[i];
> +                     if (last >= end) {
> +                             end *= 2;
> +                             oids = xrealloc(oids, end * sizeof(uint64_t));
> +                     }
> +             }
> +     }
> +
> +     if (hdr->data_length < last * sizeof(uint64_t)) {
>               sd_rw_unlock(&obj_list_cache.lock);
>               sd_err("GET_OBJ_LIST buffer too small");
> +             free(oids);
>               return SD_RES_BUFFER_SMALL;
>       }
>  
> -     rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
> -     memcpy(data, obj_list_cache.buf, rsp->data_length);
> +     rsp->data_length = last * sizeof(uint64_t);
> +     memcpy(data, oids, rsp->data_length);
>       sd_rw_unlock(&obj_list_cache.lock);
> +     free(oids);
>       return SD_RES_SUCCESS;
>  }
>  
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index 4648966..ea67b5f 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -978,6 +978,9 @@ retry:
>       sd_init_req(&hdr, SD_OP_GET_OBJ_LIST);
>       hdr.data_length = buf_size;
>       hdr.epoch = epoch;
> +     memcpy(hdr.node_addr.addr, sys->this_node.nid.addr,
> +                     sizeof(hdr.node_addr.addr));
> +     hdr.node_addr.port = sys->this_node.nid.port;
>       ret = sheep_exec_req(&e->nid, &hdr, buf);
>  
>       switch (ret) {
> @@ -997,40 +1000,28 @@ retry:
>       }
>  
>       *nr_oids = rsp->data_length / sizeof(uint64_t);
> -     sd_debug("%zu", *nr_oids);
> +     sd_debug("%s: %zu objects to be fetched",
> +                     addr_to_str(e->nid.addr, e->nid.port), *nr_oids);
>       return buf;
>  }
>  
> -/* Screen out objects that don't belong to this node */
> -static void screen_object_list(struct recovery_list_work *rlw,
> +/* Merge object lists fetched from other nodes */
> +static void merge_object_list(struct recovery_list_work *rlw,
>                              uint64_t *oids, size_t nr_oids)
>  {
> -     struct recovery_work *rw = &rlw->base;
> -     const struct sd_vnode *vnodes[SD_MAX_COPIES];
>       uint64_t old_count = rlw->count;
> -     uint64_t nr_objs;
> -     uint64_t i, j;
> +     uint64_t i;
>  
>       for (i = 0; i < nr_oids; i++) {
>               if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp))
>                       /* the object is already scheduled to be recovered */
>                       continue;
>  
> -             nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
> -
> -             oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
> -             for (j = 0; j < nr_objs; j++) {
> -                     if (!vnode_is_local(vnodes[j]))
> -                             continue;
> -
> -                     rlw->oids[rlw->count++] = oids[i];
> -                     /* enlarge the list buffer if full */
> -                     if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
> -                             list_buffer_size *= 2;
> -                             rlw->oids = xrealloc(rlw->oids,
> -                                                  list_buffer_size);
> -                     }
> -                     break;
> +             rlw->oids[rlw->count++] = oids[i];
> +             /* enlarge the list buffer if full */
> +             if (rlw->count == list_buffer_size / sizeof(uint64_t)) {
> +                     list_buffer_size *= 2;
> +                     rlw->oids = xrealloc(rlw->oids, list_buffer_size);
>               }
>       }
>  
> @@ -1072,7 +1063,7 @@ again:
>               oids = fetch_object_list(node, rw->epoch, &nr_oids);
>               if (!oids)
>                       continue;
> -             screen_object_list(rlw, oids, nr_oids);
> +             merge_object_list(rlw, oids, nr_oids);
>               free(oids);
>       }
>  
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 7b33f11..02f75ad 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -461,6 +461,12 @@ int gateway_to_peer_opcode(int opcode);
>  
>  extern uint32_t last_gathered_epoch;
>  
> +static inline bool vnode_is_peer(const struct sd_vnode *v,
> +                     const struct node_id *peer_nid)
> +{
> +     return node_id_cmp(&v->node->nid, peer_nid) == 0;
> +}
> +
>  static inline bool vnode_is_local(const struct sd_vnode *v)
>  {
>       return node_id_cmp(&v->node->nid, &sys->this_node.nid) == 0;
> -- 
> 1.8.3.2
> 

Applied, thanks.

Yuan