The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote : > For some configuration, quorum allows VMs to continue while some child devices > are broken and when the child devices are repaired and return back, we need to > sync dirty bits during downtime to keep data consistency. > > The recovery logic is based on the driver state bitmap and will sync the dirty > bits with a timeslice window in a coroutine in this primitive implementation. > > Simple graph about 2 children with threshold=1 and read-pattern=fifo: > > + denote device sync iteration > - IO on a single device > = IO on two devices > > sync complete, release dirty bitmap > ^ > | > ====-----------------++++----++++----++========== > | | > | v > | device repaired and begin to sync > v > device broken, create a dirty bitmap > > This sync logic can take care of nested broken problem, that devices are > broken while in sync. We just start a sync process after the devices are > repaired again and switch the devices from broken to sound only when the > sync > completes. > > For read-pattern=quorum mode, it enjoys the recovery logic without any > problem. 
> > Cc: Eric Blake <ebl...@redhat.com> > Cc: Benoit Canet <ben...@irqsave.net> > Cc: Kevin Wolf <kw...@redhat.com> > Cc: Stefan Hajnoczi <stefa...@redhat.com> > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > --- > block/quorum.c | 189 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > trace-events | 5 ++ > 2 files changed, 191 insertions(+), 3 deletions(-) > > diff --git a/block/quorum.c b/block/quorum.c > index 7b07e35..ffd7c2d 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -23,6 +23,7 @@ > #include "qapi/qmp/qlist.h" > #include "qapi/qmp/qstring.h" > #include "qapi-event.h" > +#include "trace.h" > > #define HASH_LENGTH 32 > > @@ -31,6 +32,10 @@ > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > #define QUORUM_OPT_READ_PATTERN "read-pattern" > > +#define SLICE_TIME 100000000ULL /* 100 ms */ > +#define CHUNK_SIZE (1 << 20) /* 1M */ > +#define SECTORS_PER_CHUNK (CHUNK_SIZE >> BDRV_SECTOR_BITS) > + > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > char h[HASH_LENGTH]; /* SHA-256 hash */ > @@ -64,6 +69,7 @@ typedef struct QuorumVotes { > > /* the following structure holds the state of one quorum instance */ > typedef struct BDRVQuorumState { > + BlockDriverState *mybs;/* Quorum block driver base state */ > BlockDriverState **bs; /* children BlockDriverStates */ > int num_children; /* children count */ > int threshold; /* if less than threshold children reads gave the > @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState { > */ > > QuorumReadPattern read_pattern; > + BdrvDirtyBitmap *dirty_bitmap; > + uint8_t *sync_buf; > + HBitmapIter hbi; > + int64_t sector_num; > } BDRVQuorumState; > > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, > QEMUIOVector *source) > } > } > > -static int next_fifo_child(QuorumAIOCB *acb) > +static int get_good_child(BDRVQuorumState *s, int iter) > { > - BDRVQuorumState *s = acb->common.bs->opaque; > int i; > > - for (i = 
acb->child_iter; i < s->num_children; i++) { > + for (i = iter; i < s->num_children; i++) { > if (!s->bs[i]->broken) { > break; > } > @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb) > return i; > } > > +static int next_fifo_child(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s = acb->common.bs->opaque; > + > + return get_good_child(s, acb->child_iter); > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb = opaque; > @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt) > return -EINVAL; > } > > +static void sync_prepare(BDRVQuorumState *qs, int64_t *num) > +{ > + int64_t nb, total = bdrv_nb_sectors(qs->mybs); > + > + qs->sector_num = hbitmap_iter_next(&qs->hbi); > + /* Wrap around if previous bits get dirty while syncing */ > + if (qs->sector_num < 0) { > + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi); > + qs->sector_num = hbitmap_iter_next(&qs->hbi); > + assert(qs->sector_num >= 0); > + } > + > + for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total; > + nb++) { > + if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + > nb)) { > + break; > + } > + } > + *num = nb; > +} > + > +static void sync_finish(BDRVQuorumState *qs, int64_t num) > +{ > + int64_t i; > + > + for (i = 0; i < num; i++) { > + /* We need to advance the iterator manually */ > + hbitmap_iter_next(&qs->hbi); > + } > + bdrv_reset_dirty(qs->mybs, qs->sector_num, num); > +} > + > +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState > *target) > +{ > + BlockDriverState *source; > + QEMUIOVector qiov; > + int ret, good; > + int64_t nb_sectors; > + struct iovec iov; > + const char *sname, *tname = bdrv_get_filename(target); > + > + good = get_good_child(qs, 0); > + if (good < 0) { > + error_report("No good device available."); > + return -1; > + } > + source = qs->bs[good]; > + sname = bdrv_get_filename(source); > + sync_prepare(qs, &nb_sectors); > + iov.iov_base = qs->sync_buf; > + 
iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; > + qemu_iovec_init_external(&qiov, &iov, 1); > + > + trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors); > + ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + error_report("Read source %s failed.", sname);
I didn't read this patch thoroughly, but in quorum, if you need to name a child BDS, you must use bs->node_name. bs->node_name was introduced so that quorum could be merged and so that a given node of the BDS graph can be uniquely identified. Best regards Benoît > + return ret; > + } > + ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + error_report("Write target %s failed.", tname); > + return ret; > + } > + sync_finish(qs, nb_sectors); > + > + return 0; > +} > + > +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target) > +{ > + uint64_t last_pause_ns; > + > + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi); > + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + for (;;) { > + int64_t cnt; > + > + cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap); > + if (cnt == 0) { > + break; > + } > + error_report("count %ld", cnt); > + if (quorum_sync_iteration(qs, target) < 0) { > + return -1; > + } > + cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap); > + if (cnt == 0) { > + break; > + } > + > + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >= > + SLICE_TIME) { > + co_aio_sleep_ns(bdrv_get_aio_context(target), > QEMU_CLOCK_REALTIME, > + SLICE_TIME); > + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + } > + } > + > + return 0; > +} > + > +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState > *file) > +{ > + int i; > + > + for (i = 0; i < qs->num_children; i++) { > + BlockDriverState *f = bdrv_get_file(qs->bs[i]); > + > + if (f == file) { > + return qs->bs[i]; > + } > + } > + > + error_report("Can't find driver state for %s", bdrv_get_filename(file)); > + abort(); > +} > + > +static void quorum_driver_reconnect(BlockDriverState *file) > +{ > + BDRVQuorumState *qs = file->drv_opaque; > + BlockDriverState *bs = file_to_bs(qs, file); > + const char *name = bdrv_get_filename(bs); > + > + trace_quorum_driver_reconnect(name); > + assert(bs->broken == true); > + if 
(quorum_sync_device(qs, bs) < 0) { > + error_report("Failed to sync device %s", name); > + return; > + } > + > + bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap); > + qemu_vfree(qs->sync_buf); > + bs->broken = false; > +} > + > +static void quorum_driver_disconnect(BlockDriverState *file) > +{ > + BDRVQuorumState *qs = file->drv_opaque; > + BlockDriverState *bs = file_to_bs(qs, file); > + const char *name = bdrv_get_filename(bs); > + > + trace_quorum_driver_disconnect(name); > + /* > + * If we are disconnected while being syncing, we expect to reconnect to > the > + * target again and resume the data sync from the last synced point. > + */ > + if (bs->broken) { > + return; > + } > + > + bs->broken = true; > + qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE, > + NULL); > + if (!qs->dirty_bitmap) { > + abort(); > + } > + qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE); > +} > + > +static const BlockDrvOps quorum_block_drv_ops = { > + .driver_reconnect = quorum_driver_reconnect, > + .driver_disconnect = quorum_driver_disconnect, > +}; > + > static int quorum_open(BlockDriverState *bs, QDict *options, int flags, > Error **errp) > { > @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict > *options, int flags, > goto exit; > } > > + s->mybs = bs; > /* count how many different children are present */ > s->num_children = qlist_size(list); > if (s->num_children < 2) { > @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDict > *options, int flags, > goto close_exit; > } > opened[i] = true; > + bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s); > } > > g_free(opened); > diff --git a/trace-events b/trace-events > index 81bc915..8da0a13 100644 > --- a/trace-events > +++ b/trace-events > @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t > start, size_t len, uint64_t o > qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, > uint64_t offset) "s %p acb %p start 
%"PRIu64" len %zu offset %"PRIu64 > qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) > "s %p acb %p ret %d offset %"PRIu64" len %zu" > > +# block/quorum.c > +quorum_sync_iteration(const char *source, const char *target, int64_t > sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d" > +quorum_driver_reconnect(const char *target) "%s" > +quorum_driver_disconnect(const char *target) "%s" > + > # hw/display/g364fb.c > g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x" > g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x" > -- > 1.9.1 >