On 15/10/2012 18:57, Kevin Wolf wrote:
> On 26.09.2012 17:56, Paolo Bonzini wrote:
>> This patch adds the implementation of a new job that mirrors a disk to
>> a new image while letting the guest continue using the old image.  The
>> target is treated as a "black box" and data is copied from the source
>> to the target in the background.  This can be used for several
>> purposes, including storage migration, continuous replication, and
>> observation of the guest I/O in an external program.  It is also a
>> first step in replacing the inefficient block migration code that is
>> part of QEMU.
>>
>> The job is possibly never-ending, but it is logically structured into
>> two phases: 1) copy all data as fast as possible until the target
>> first gets in sync with the source; 2) keep the target in sync and
>> ensure that reopening to the target gets a correct (full) copy of the
>> source data.
>>
>> The second phase is indicated by the progress in "info block-jobs"
>> reporting the current offset to be equal to the length of the file.
>> When the job is cancelled in the second phase, QEMU will run the job
>> until the source is clean and quiescent, then it will report
>> successful completion of the job.
>>
>> In other words, the BLOCK_JOB_CANCELLED event means that the target
>> may _not_ be consistent with a past state of the source; the
>> BLOCK_JOB_COMPLETED event means that the target is consistent with a
>> past state of the source.  (Note that it can already happen that
>> management loses the race against QEMU and gets a completion event
>> instead of a cancellation.)
>>
>> It is not yet possible to complete the job and switch over to the
>> target disk.  The next patches will fix this and add many refinements
>> to the basic idea introduced here.  These include improved error
>> management, some tunable knobs and performance optimizations.
>>
>> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
>> ---
>> v1->v2: Always "goto immediate_exit" and similar code cleanups.
>>         Error checking for bdrv_flush.  Call bdrv_set_enable_write_cache
>>         on the target to make it always writeback.
>>
>>  block/Makefile.objs |   2 +-
>>  block/mirror.c      | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  block_int.h         |  20 +++++
>>  qapi-schema.json    |  17 ++++
>>  trace-events        |   7 ++
>>  5 files changed, 279 insertions(+), 1 deletion(-)
>>  create mode 100644 block/mirror.c
>>
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index c45affc..f1a394a 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -9,4 +9,4 @@ block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>>  block-obj-$(CONFIG_CURL) += curl.o
>>  block-obj-$(CONFIG_RBD) += rbd.o
>>
>> -common-obj-y += stream.o
>> +common-obj-y += stream.o mirror.o
>> diff --git a/block/mirror.c b/block/mirror.c
>> new file mode 100644
>> index 0000000..09ea020
>> --- /dev/null
>> +++ b/block/mirror.c
>> @@ -0,0 +1,234 @@
>> +/*
>> + * Image mirroring
>> + *
>> + * Copyright Red Hat, Inc. 2012
>> + *
>> + * Authors:
>> + *  Paolo Bonzini  <pbonz...@redhat.com>
>> + *
>> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
>> + * See the COPYING.LIB file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "trace.h"
>> +#include "blockjob.h"
>> +#include "block_int.h"
>> +#include "qemu/ratelimit.h"
>> +
>> +enum {
>> +    /*
>> +     * Size of data buffer for populating the image file.  This should
>> +     * be large enough to process multiple clusters in a single call, so
>> +     * that populating contiguous regions of the image is efficient.
>> +     */
>> +    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
>> +};
>> +
>> +#define SLICE_TIME 100000000ULL /* ns */
>> +
>> +typedef struct MirrorBlockJob {
>> +    BlockJob common;
>> +    RateLimit limit;
>> +    BlockDriverState *target;
>> +    MirrorSyncMode mode;
>> +    int64_t sector_num;
>> +    uint8_t *buf;
>> +} MirrorBlockJob;
>> +
>> +static int coroutine_fn mirror_iteration(MirrorBlockJob *s)
>> +{
>> +    BlockDriverState *source = s->common.bs;
>> +    BlockDriverState *target = s->target;
>> +    QEMUIOVector qiov;
>> +    int ret, nb_sectors;
>> +    int64_t end;
>> +    struct iovec iov;
>> +
>> +    end = s->common.len >> BDRV_SECTOR_BITS;
>> +    s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
>> +    nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
>> +    bdrv_reset_dirty(source, s->sector_num, nb_sectors);
>> +
>> +    /* Copy the dirty cluster.  */
>> +    iov.iov_base = s->buf;
>> +    iov.iov_len = nb_sectors * 512;
>> +    qemu_iovec_init_external(&qiov, &iov, 1);
>> +
>> +    trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
>> +    ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    return bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
>> +}
>> +
>> +static void coroutine_fn mirror_run(void *opaque)
>> +{
>> +    MirrorBlockJob *s = opaque;
>> +    BlockDriverState *bs = s->common.bs;
>> +    int64_t sector_num, end;
>> +    int ret = 0;
>> +    int n;
>> +    bool synced = false;
>> +
>> +    if (block_job_is_cancelled(&s->common)) {
>> +        goto immediate_exit;
>> +    }
>> +
>> +    s->common.len = bdrv_getlength(bs);
>> +    if (s->common.len < 0) {
>> +        block_job_completed(&s->common, s->common.len);
>> +        return;
>> +    }
>> +
>> +    end = s->common.len >> BDRV_SECTOR_BITS;
>> +    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
>> +
>> +    if (s->mode != MIRROR_SYNC_MODE_NONE) {
>> +        /* First part, loop on the sectors and initialize the dirty bitmap.  */
>> +        BlockDriverState *base;
>> +        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
>> +        for (sector_num = 0; sector_num < end; ) {
>> +            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
>> +            ret = bdrv_co_is_allocated_above(bs, base,
>> +                                             sector_num, next - sector_num, &n);
>> +
>> +            if (ret < 0) {
>> +                goto immediate_exit;
>> +            }
>> +
>> +            assert(n > 0);
>> +            if (ret == 1) {
>> +                bdrv_set_dirty(bs, sector_num, n);
>> +                sector_num = next;
>> +            } else {
>> +                sector_num += n;
>> +            }
>> +        }
>> +    }
>> +
>> +    s->sector_num = -1;
>> +    for (;;) {
>> +        uint64_t delay_ns;
>> +        int64_t cnt;
>> +        bool should_complete;
>> +
>> +        cnt = bdrv_get_dirty_count(bs);
>> +        if (cnt != 0) {
>> +            ret = mirror_iteration(s);
>> +            if (ret < 0) {
>> +                goto immediate_exit;
>> +            }
>> +            cnt = bdrv_get_dirty_count(bs);
>> +        }
>> +
>> +        should_complete = false;
>> +        if (cnt == 0) {
>> +            trace_mirror_before_flush(s);
>> +            if (bdrv_flush(s->target) < 0) {
>> +                goto immediate_exit;
>> +            }
>
> Are you sure that we should signal successful completion when
> bdrv_flush() fails?

Hmm, of course not.
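Something along these lines should work instead (an untested sketch; it
just reuses the existing ret variable and immediate_exit label so that
block_job_completed() sees the error):

    trace_mirror_before_flush(s);
    ret = bdrv_flush(s->target);
    if (ret < 0) {
        /* Propagate the flush error instead of reporting success.  */
        goto immediate_exit;
    }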
>> +
>> +            /* We're out of the streaming phase.  From now on, if the job
>> +             * is cancelled we will actually complete all pending I/O and
>> +             * report completion.  This way, block-job-cancel will leave
>> +             * the target in a consistent state.
>> +             */
>
> Don't we have block_job_complete() for that now? Then I think the job
> can be cancelled immediately, even in an inconsistent state.

The idea was that block-job-cancel will still leave the target in a
consistent state if executed during the second phase.  Otherwise it is
impossible to take a consistent snapshot and keep running on the first
image.

>> +            synced = true;
>> +            s->common.offset = end * BDRV_SECTOR_SIZE;
>> +            should_complete = block_job_is_cancelled(&s->common);
>> +            cnt = bdrv_get_dirty_count(bs);
>> +        }
>> +
>> +        if (cnt == 0 && should_complete) {
>> +            /* The dirty bitmap is not updated while operations are pending.
>> +             * If we're about to exit, wait for pending operations before
>> +             * calling bdrv_get_dirty_count(bs), or we may exit while the
>> +             * source has dirty data to copy!
>> +             *
>> +             * Note that I/O can be submitted by the guest while
>> +             * mirror_populate runs.
>> +             */
>> +            trace_mirror_before_drain(s, cnt);
>> +            bdrv_drain_all();
>> +            cnt = bdrv_get_dirty_count(bs);
>> +        }
>> +
>> +        ret = 0;
>> +        trace_mirror_before_sleep(s, cnt, synced);
>> +        if (!synced) {
>> +            /* Publish progress */
>> +            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
>> +
>> +            if (s->common.speed) {
>> +                delay_ns = ratelimit_calculate_delay(&s->limit,
>> +                                                     BDRV_SECTORS_PER_DIRTY_CHUNK);
>> +            } else {
>> +                delay_ns = 0;
>> +            }
>> +
>> +            /* Note that even when no rate limit is applied we need to yield
>> +             * with no pending I/O here so that qemu_aio_flush() returns.
>> +             */
>> +            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
>> +            if (block_job_is_cancelled(&s->common)) {
>> +                break;
>> +            }
>> +        } else if (!should_complete) {
>> +            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
>> +            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
>
> Why don't we check block_job_is_cancelled() here? I can't see how
> cancellation works in the second phase, except when cnt becomes 0.

Indeed cancellation requires consistency (and hence cnt == 0) in the
second phase.

> But this isn't guaranteed, is it?

Not guaranteed, but in practice it works, and you can always throttle
writes on the source to guarantee that it does.

>> +        } else if (cnt == 0) {
>> +            /* The two disks are in sync.  Exit and report successful
>> +             * completion.
>> +             */
>> +            assert(QLIST_EMPTY(&bs->tracked_requests));
>> +            s->common.cancelled = false;
>> +            break;
>> +        }
>> +    }
>> +
>> +immediate_exit:
>> +    g_free(s->buf);
>> +    bdrv_set_dirty_tracking(bs, false);
>> +    bdrv_close(s->target);
>> +    bdrv_delete(s->target);
>> +    block_job_completed(&s->common, ret);
>> +}
>
> Kevin
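Since the cancellation semantics caused some confusion, here is the
intended management flow during the second phase, spelled out as
pseudo-code (qmp_command() and wait_for_event() are hypothetical
helpers, not QEMU APIs):

    /* Hypothetical management-side sketch, not QEMU code.  */
    qmp_command("block-job-cancel", device);
    for (;;) {
        event = wait_for_event(device);
        if (event == BLOCK_JOB_COMPLETED) {
            break;   /* target is consistent with a past source state */
        }
        if (event == BLOCK_JOB_CANCELLED) {
            break;   /* target may be inconsistent */
        }
    }

Paolo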