At Thu, 25 Dec 2014 17:17:00 +0900, FUKUDA Yasuhito wrote: > > On Thu, 18 Dec 2014 14:00:07 +0900 > Hitoshi Mitake <[email protected]> wrote: > > > At Wed, 17 Dec 2014 17:34:04 +0900, > > FUKUDA Yasuhito wrote: > > > > > > In the current sheepdog, I/O requests are processed by way of the Priority Lists > > > during auto-recovery. > > > But this may waste performance. > > > > > > This patch omits the unnecessary procedure. > > > > Could you provide brief data which can show the performance improvement? > > Mitake-san, > > I measured the effect using the vdi write command during recovery. > The results are as follows.
Thanks for sharing. Applied, thanks. Hitoshi > > [results (response time of vdi write command)] > /* sheepdog-master */ > 1st: 2m38.765s > 2nd: 2m25.439s > 3rd: 2m32.993s > 4th: 2m30.355s > 5th: 2m27.301s > > average: 2m30.971s > > /* this-patch */ > 1st: 2m15.284s > 2nd: 2m13.842s > 3rd: 2m15.636s > 4th: 2m14.967s > 5th: 2m15.100s > > average: 2m14.966s > > [Environment] > sheepdog node: 3 > zookeeper node: 3 (same machine with sheepdog node) > copies: 3 > > [operations] > /* all sheepdog start */ > # sheep /var/lib/sheepdog > > /* create VDI for test */ > # dog cluster format -c 3 > # dog vdi create test1G 1G -P > # dog vdi create test10G 10G -P > > /* random data insert in test10G VDI */ > # cat CentOS-6.5-x86_64-bin-DVD1.iso | dog vdi write test10G 0 11000000000 > > /* delete object in one node */ > # pkill -9 sheep > # rm -rf /var/lib/sheepdog/obj/* > > /* cache clear */ > # echo 1 > /proc/sys/vm/drop_caches > > /* sheepdog start in one node */ > # sheep /var/lib/sheepdog > > ----->auto-recovery starts in the backend > > /* vdi write start */ > # time cat CentOS-6.5-x86_64-bin-DVD1.iso | dog vdi write test1G 0 1073741824 > > > Regards > > > > > Thanks, > > Hitoshi > > > > > > > > Signed-off-by: Yasuhito Fukuda <[email protected]> > > > --- > > > sheep/recovery.c | 113 > > > ++++++++++++++--------------------------------------- > > > 1 files changed, 30 insertions(+), 83 deletions(-) > > > > > > diff --git a/sheep/recovery.c b/sheep/recovery.c > > > index 85dad21..319fde3 100644 > > > --- a/sheep/recovery.c > > > +++ b/sheep/recovery.c > > > @@ -66,9 +66,6 @@ struct recovery_info { > > > > > > uint64_t count; > > > uint64_t *oids; > > > - uint64_t *prio_oids; > > > - uint64_t nr_prio_oids; > > > - uint64_t nr_scheduled_prio_oids; > > > > > > struct vnode_info *old_vinfo; > > > struct vnode_info *cur_vinfo; > > > @@ -84,6 +81,7 @@ static struct recovery_info *next_rinfo; > > > static main_thread(struct recovery_info *) current_rinfo; > > > > > > static void 
queue_recovery_work(struct recovery_info *rinfo); > > > +static void free_recovery_obj_work(struct recovery_obj_work *row); > > > > > > /* Dynamically grown list buffer default as 4M (2T storage) */ > > > #define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22) > > > @@ -606,22 +604,38 @@ bool node_in_recovery(void) > > > return main_thread_get(current_rinfo) != NULL; > > > } > > > > > > -static inline void prepare_schedule_oid(uint64_t oid) > > > +static void direct_recover_object_main(struct work *work) > > > +{ > > > + struct recovery_work *rw = container_of(work, struct recovery_work, > > > + work); > > > + struct recovery_obj_work *row = container_of(rw, > > > + struct recovery_obj_work, > > > + base); > > > + > > > + wakeup_requests_on_oid(row->oid); > > > + free_recovery_obj_work(row); > > > +} > > > + > > > +static inline void direct_queue_recovery_work(uint64_t oid) > > > { > > > struct recovery_info *rinfo = main_thread_get(current_rinfo); > > > > > > - if (xlfind(&oid, rinfo->prio_oids, rinfo->nr_prio_oids, oid_cmp)) { > > > - sd_debug("%" PRIx64 " has been already in prio_oids", oid); > > > - return; > > > - } > > > + struct recovery_work *rw; > > > + struct recovery_obj_work *row; > > > + row = xzalloc(sizeof(*row)); > > > + row->oid = oid; > > > > > > - rinfo->nr_prio_oids++; > > > - rinfo->prio_oids = xrealloc(rinfo->prio_oids, > > > - rinfo->nr_prio_oids * sizeof(uint64_t)); > > > - rinfo->prio_oids[rinfo->nr_prio_oids - 1] = oid; > > > - sd_debug("%"PRIx64" nr_prio_oids %"PRIu64, oid, rinfo->nr_prio_oids); > > > + rw = &row->base; > > > + rw->work.fn = recover_object_work; > > > + rw->work.done = direct_recover_object_main; > > > > > > - resume_suspended_recovery(); > > > + rw->epoch = rinfo->epoch; > > > + rw->tgt_epoch = rinfo->tgt_epoch; > > > + rw->rinfo = rinfo; > > > + rw->cur_vinfo = grab_vnode_info(rinfo->cur_vinfo); > > > + rw->old_vinfo = grab_vnode_info(rinfo->old_vinfo); > > > + > > > + queue_work(sys->recovery_wqueue, &rw->work); > > > } > 
> > > > > main_fn bool oid_in_recovery(uint64_t oid) > > > @@ -688,7 +702,7 @@ main_fn bool oid_in_recovery(uint64_t oid) > > > return false; > > > } > > > > > > - prepare_schedule_oid(oid); > > > + direct_queue_recovery_work(oid); > > > return true; > > > } > > > > > > @@ -719,7 +733,6 @@ static void free_recovery_info(struct recovery_info > > > *rinfo) > > > put_vnode_info(rinfo->cur_vinfo); > > > put_vnode_info(rinfo->old_vinfo); > > > free(rinfo->oids); > > > - free(rinfo->prio_oids); > > > for (int i = 0; i < rinfo->max_epoch; i++) > > > put_vnode_info(rinfo->vinfo_array[i]); > > > free(rinfo->vinfo_array); > > > @@ -803,78 +816,12 @@ static inline void finish_recovery(struct > > > recovery_info *rinfo) > > > sd_debug("recovery complete: new epoch %"PRIu32, recovered_epoch); > > > } > > > > > > -static inline bool oid_in_prio_oids(struct recovery_info *rinfo, > > > uint64_t oid) > > > -{ > > > - for (uint64_t i = 0; i < rinfo->nr_prio_oids; i++) > > > - if (rinfo->prio_oids[i] == oid) > > > - return true; > > > - return false; > > > -} > > > - > > > -/* > > > - * Schedule prio_oids to be recovered first in FIFO order > > > - * > > > - * rw->next is index of the original next object to be recovered and > > > also the > > > - * number of objects already recovered and being recovered. 
> > > - * we just move rw->prio_oids in between: > > > - * new_oids = [0..rw->next - 1] + [rw->prio_oids] + [rw->next] > > > - */ > > > -static inline void finish_schedule_oids(struct recovery_info *rinfo) > > > -{ > > > - uint64_t i, nr_recovered = rinfo->next, new_idx; > > > - uint64_t *new_oids; > > > - > > > - /* If I am the last oid, done */ > > > - if (nr_recovered == rinfo->count - 1) > > > - goto done; > > > - > > > - new_oids = xmalloc(list_buffer_size); > > > - memcpy(new_oids, rinfo->oids, nr_recovered * sizeof(uint64_t)); > > > - memcpy(new_oids + nr_recovered, rinfo->prio_oids, > > > - rinfo->nr_prio_oids * sizeof(uint64_t)); > > > - new_idx = nr_recovered + rinfo->nr_prio_oids; > > > - > > > - for (i = rinfo->next; i < rinfo->count; i++) { > > > - if (oid_in_prio_oids(rinfo, rinfo->oids[i])) > > > - continue; > > > - new_oids[new_idx++] = rinfo->oids[i]; > > > - } > > > - /* rw->count should eq new_idx, otherwise something is wrong */ > > > - sd_debug("%snr_recovered %" PRIu64 ", nr_prio_oids %" PRIu64 ", count %" > > > - PRIu64 " = new %" PRIu64, > > > - rinfo->count == new_idx ? "" : "WARN: ", nr_recovered, > > > - rinfo->nr_prio_oids, rinfo->count, new_idx); > > > - > > > - free(rinfo->oids); > > > - rinfo->oids = new_oids; > > > -done: > > > - free(rinfo->prio_oids); > > > - rinfo->prio_oids = NULL; > > > - rinfo->nr_scheduled_prio_oids += rinfo->nr_prio_oids; > > > - rinfo->nr_prio_oids = 0; > > > -} > > > - > > > -/* > > > - * When automatic object recovery is disabled, the behavior of the > > > - * recovery process is like 'lazy recovery'. This function returns > > > - * true if the recovery queue contains objects being accessed by > > > - * clients. Sheep recovers such objects for availability even when > > > - * automatic object recovery is not enabled. 
> > > - */ > > > -static bool has_scheduled_objects(struct recovery_info *rinfo) > > > -{ > > > - return rinfo->done < rinfo->nr_scheduled_prio_oids; > > > -} > > > - > > > static void recover_next_object(struct recovery_info *rinfo) > > > { > > > if (run_next_rw()) > > > return; > > > > > > - if (rinfo->nr_prio_oids) > > > - finish_schedule_oids(rinfo); > > > - > > > - if (sys->cinfo.disable_recovery && !has_scheduled_objects(rinfo)) { > > > + if (sys->cinfo.disable_recovery) { > > > sd_debug("suspended"); > > > rinfo->suspended = true; > > > /* suspend until resume_suspended_recovery() is called */ > > > -- > > > 1.7.1 > > > > > > > > > > > > -- > > > NTTソフトウェア株式会社 > > > クラウド事業部 第一事業ユニット(C一BU) > > > 福田康人(FUKUDA Yasuhito) > > > E-mail:[email protected] > > > 〒220-0012 横浜市西区みなとみらい4-4-5 > > > 横浜アイマークプレイス13階 > > > TEL:045-212-7393/FAX:045-662-7856 > > > > > > > > > -- > > > sheepdog mailing list > > > [email protected] > > > http://lists.wpkg.org/mailman/listinfo/sheepdog > > > -- > NTTソフトウェア株式会社 > クラウド事業部 第一事業ユニット(C一BU) > 福田康人(FUKUDA Yasuhito) > E-mail:[email protected] > 〒220-0012 横浜市西区みなとみらい4-4-5 > 横浜アイマークプレイス13階 > TEL:045-212-7393/FAX:045-662-7856 > > > -- > sheepdog mailing list > [email protected] > https://lists.wpkg.org/mailman/listinfo/sheepdog -- sheepdog mailing list [email protected] https://lists.wpkg.org/mailman/listinfo/sheepdog
