On Mon, Sep 01, 2014 at 11:37:20AM +0200, Benoît Canet wrote: > The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote : > > For some configuration, quorum allow VMs to continue while some child > > devices > > are broken and when the child devices are repaired and return back, we need > > to > > sync dirty bits during downtime to keep data consistency. > > > > The recovery logic is based on the driver state bitmap and will sync the > > dirty > > bits with a timeslice window in a coroutine in this prtimive implementation. > > > > Simple graph about 2 children with threshold=1 and read-pattern=fifo: > > > > + denote device sync iteration > > - IO on a single device > > = IO on two devices > > > > sync complete, release dirty bitmap > > ^ > > | > > ====-----------------++++----++++----++========== > > | | > > | v > > | device repaired and begin to sync > > v > > device broken, create a dirty bitmap > > > > This sync logic can take care of nested broken problem, that devices are > > broken while in sync. We just start a sync process after the devices are > > repaired again and switch the devices from broken to sound only when the > > sync > > completes. > > > > For read-pattern=quorum mode, it enjoys the recovery logic without any > > problem. > > > > Cc: Eric Blake <ebl...@redhat.com> > > Cc: Benoit Canet <ben...@irqsave.net> > > Cc: Kevin Wolf <kw...@redhat.com> > > Cc: Stefan Hajnoczi <stefa...@redhat.com> > > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > > --- > > block/quorum.c | 189 > > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > > trace-events | 5 ++ > > 2 files changed, 191 insertions(+), 3 deletions(-) > > > > diff --git a/block/quorum.c b/block/quorum.c > > index 7b07e35..ffd7c2d 100644 > > --- a/block/quorum.c > > +++ b/block/quorum.c > > @@ -23,6 +23,7 @@ > > #include "qapi/qmp/qlist.h" > > #include "qapi/qmp/qstring.h" > > #include "qapi-event.h" > > +#include "trace.h" > > > > #define HASH_LENGTH 32 > > > > @@ -31,6 +32,10 @@ > > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > > #define QUORUM_OPT_READ_PATTERN "read-pattern" > > > > +#define SLICE_TIME 100000000ULL /* 100 ms */ > > +#define CHUNK_SIZE (1 << 20) /* 1M */ > > +#define SECTORS_PER_CHUNK (CHUNK_SIZE >> BDRV_SECTOR_BITS) > > + > > /* This union holds a vote hash value */ > > typedef union QuorumVoteValue { > > char h[HASH_LENGTH]; /* SHA-256 hash */ > > @@ -64,6 +69,7 @@ typedef struct QuorumVotes { > > > > /* the following structure holds the state of one quorum instance */ > > typedef struct BDRVQuorumState { > > + BlockDriverState *mybs;/* Quorum block driver base state */ > > BlockDriverState **bs; /* children BlockDriverStates */ > > int num_children; /* children count */ > > int threshold; /* if less than threshold children reads gave > > the > > @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState { > > */ > > > > QuorumReadPattern read_pattern; > > + BdrvDirtyBitmap *dirty_bitmap; > > + uint8_t *sync_buf; > > + HBitmapIter hbi; > > + int64_t sector_num; > > } BDRVQuorumState; > > > > typedef struct QuorumAIOCB QuorumAIOCB; > > @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, > > QEMUIOVector *source) > > } > > } > > > > -static int next_fifo_child(QuorumAIOCB *acb) > > +static int get_good_child(BDRVQuorumState *s, int iter) > > { > > - BDRVQuorumState *s = acb->common.bs->opaque; > > int i; > > > > - for (i = acb->child_iter; i < s->num_children; i++) { > > + for (i = iter; i < s->num_children; i++) { > > if (!s->bs[i]->broken) { > > break; > > } > > @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb) > > return i; > > } > > > > +static int next_fifo_child(QuorumAIOCB *acb) > > +{ > > + BDRVQuorumState *s = acb->common.bs->opaque; > > + > > + return get_good_child(s, acb->child_iter); > > +} > > + > > static void quorum_aio_cb(void *opaque, int ret) > > { > > QuorumChildRequest *sacb = opaque; > > @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt) > > return -EINVAL; > > } > > > > +static void sync_prepare(BDRVQuorumState *qs, int64_t *num) > > +{ > > + int64_t nb, total = bdrv_nb_sectors(qs->mybs); > > + > > + qs->sector_num = hbitmap_iter_next(&qs->hbi); > > + /* Wrap around if previous bits get dirty while syncing */ > > + if (qs->sector_num < 0) { > > + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi); > > + qs->sector_num = hbitmap_iter_next(&qs->hbi); > > + assert(qs->sector_num >= 0); > > + } > > + > > + for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total; > > + nb++) { > > + if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + > > nb)) { > > + break; > > + } > > + } > > + *num = nb; > > +} > > + > > +static void sync_finish(BDRVQuorumState *qs, int64_t num) > > +{ > > + int64_t i; > > + > > + for (i = 0; i < num; i++) { > > + /* We need to advance the iterator manually */ > > + hbitmap_iter_next(&qs->hbi); > > + } > > + bdrv_reset_dirty(qs->mybs, qs->sector_num, num); > > +} > > + > > +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState > > *target) > > +{ > > + BlockDriverState *source; > > + QEMUIOVector qiov; > > + int ret, good; > > + int64_t nb_sectors; > > + struct iovec iov; > > + const char *sname, *tname = bdrv_get_filename(target); > > + > > + good = get_good_child(qs, 0); > > + if (good < 0) { > > + error_report("No good device available."); > > + return -1; > > + } > > + source = qs->bs[good]; > > + sname = bdrv_get_filename(source); > > + sync_prepare(qs, &nb_sectors); > > + iov.iov_base = qs->sync_buf; > > + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; > > + qemu_iovec_init_external(&qiov, &iov, 1); > > + > > + trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors); > > + ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov); > > + if (ret < 0) { > > + error_report("Read source %s failed.", sname); > > I didn't read this patch throughfully but in quorum if you need to name a > child BDS > you must use bs->node_name. > > bs->node_name was introduced to be able to merge quorum and uniquely identify > a given > node of the BDS graph.
Ah I see, thanks for reminding. Will do in next version. Stefan and Kevin, a minor question I need to make sure if it is conventional to add a helper to access any member of bs outside block.c? In this case, I need to add bdrv_get_node_name(struct *bs)? Thanks Yuan