At Mon, 20 Aug 2012 23:34:10 +0800,
Yunkai Zhang wrote:
> 
> In fact, I have thought about this method, but we would face nearly the
> same problem:
> 
> After a sheep joins back, it has to know which objects are dirty and
> do the cleanup work (because old versions of those objects remain in
> its working directory). This method does not seem to save any steps;
> instead, it would require extra recovery work.

Can you give me a concrete example?

I created a really naive patch to disable object recovery with my
idea:

==
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 5164aa7..8bf032f 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -35,6 +35,7 @@ struct recovery_work {
        uint64_t *oids;
        uint64_t *prio_oids;
        int nr_prio_oids;
+       int nr_scheduled_oids;
 
        struct vnode_info *old_vinfo;
        struct vnode_info *cur_vinfo;
@@ -269,9 +270,6 @@ static inline void prepare_schedule_oid(uint64_t oid)
                                oid);
                        return;
                }
-       /* The oid is currently being recovered */
-       if (rw->oids[rw->done] == oid)
-               return;
        rw->nr_prio_oids++;
        rw->prio_oids = xrealloc(rw->prio_oids,
                                 rw->nr_prio_oids * sizeof(uint64_t));
@@ -399,9 +397,31 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
 done:
        free(rw->prio_oids);
        rw->prio_oids = NULL;
+       rw->nr_scheduled_oids += rw->nr_prio_oids;
        rw->nr_prio_oids = 0;
 }
 
+static struct timer recovery_timer;
+
+static void recover_next_object(void *arg)
+{
+       struct recovery_work *rw = arg;
+
+       if (rw->nr_prio_oids)
+               finish_schedule_oids(rw);
+
+       if (rw->done < rw->nr_scheduled_oids) {
+               /* Try recover next object */
+               queue_work(sys->recovery_wqueue, &rw->work);
+               return;
+       }
+
+       /* There is no objects to be recovered.  Try again later */
+       recovery_timer.callback = recover_next_object;
+       recovery_timer.data = rw;
+       add_timer(&recovery_timer, 1); /* FIXME */
+}
+
 static void recover_object_main(struct work *work)
 {
        struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -425,11 +445,7 @@ static void recover_object_main(struct work *work)
        resume_wait_obj_requests(rw->oids[rw->done++]);
 
        if (rw->done < rw->count) {
-               if (rw->nr_prio_oids)
-                       finish_schedule_oids(rw);
-
-               /* Try recover next object */
-               queue_work(sys->recovery_wqueue, &rw->work);
+               recover_next_object(rw);
                return;
        }
 
@@ -458,7 +474,7 @@ static void finish_object_list(struct work *work)
        resume_wait_recovery_requests();
        rw->work.fn = recover_object_work;
        rw->work.done = recover_object_main;
-       queue_work(sys->recovery_wqueue, &rw->work);
+       recover_next_object(rw);
        return;
 }
 
==

I ran the following test, and object recovery was disabled correctly
for both the join and leave cases.

==
#!/bin/bash

for i in 0 1 2 3; do
    ./sheep/sheep /store/$i -z $i -p 700$i -c local
done

sleep 1
./collie/collie cluster format

./collie/collie vdi create test 4G

echo " * objects will be created on node[0-2] *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000

pkill -f "./sheep/sheep /store/1"
sleep 3

echo " * recovery doesn't start until the object is touched *"
md5sum /store/[0,2,3]/obj/807c2b2500000000

./collie/collie vdi snapshot test  # invoke recovery of the vdi object
echo " * the object is recovered *"
md5sum /store/[0,2,3]/obj/807c2b2500000000

./sheep/sheep /store/1 -z 1 -p 7001 -c local
sleep 3

echo " * recovery doesn't start until the object is touched *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000

./collie/collie vdi list -p 7001  # invoke recovery of the vdi object
echo " * the object is recovered *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000
==

[Output]

using backend farm store
 * objects will be created on node[0-2] *
701e77eab6002c9a48f7ba72c8d9bfe9  /store/0/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/1/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/2/obj/807c2b2500000000
 * recovery doesn't start until the object is touched *
701e77eab6002c9a48f7ba72c8d9bfe9  /store/0/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/2/obj/807c2b2500000000
 * the object is recovered *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/3/obj/807c2b2500000000
 * recovery doesn't start until the object is touched *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
s test         1  4.0 GB  0.0 MB  0.0 MB 2012-08-21 02:49   7c2b25  
  test         2  4.0 GB  0.0 MB  0.0 MB 2012-08-21 02:49   7c2b26  
 * the object is recovered *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/1/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000


I was never able to read an old version of the object.

Thanks,

Kazutaka
-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to