On 28.09.2015 05:29, Jeff Cody wrote: > During mirror, if the target device does not have support zero > initialization, a mirror may result in a corrupt image. > > For instance, on mirror to a host device with format = raw, whatever > random data is on the target device will still be there for unallocated > sectors. > > This is because during the mirror, we set the dirty bitmap to copy only > sectors allocated above 'base'. In the case of target devices where we > cannot assume unallocated sectors will be read as zeroes, we need to > explicitely zero out this data. > > In order to avoid zeroing out all sectors of the target device prior to > mirroring, we do zeroing as part of the block job. A second dirty > bitmap cache is created, to track sectors that are unallocated above > 'base'. These sectors are then checked for status of BDRV_BLOCK_ZERO > on the target - if they are not, then zeroes are explicitly written. > > This only occurs under two conditions: > > 1. 'mode' != "existing" > 2. bdrv_has_zero_init(target) == NULL > > We perform the mirroring through mirror_iteration() as before, except > in two passes. If the above two conditions are met, the first pass > is using the bitmap tracking unallocated sectors, to write the needed > zeroes. Then, the second pass is performed, to mirror the actual data > as before. > > If the above two conditions are not met, then the first pass is skipped, > and only the second pass (the one with the actual data) is performed. > > Signed-off-by: Jeff Cody <[email protected]> > --- > block/mirror.c | 109 > ++++++++++++++++++++++++++++++++++------------ > blockdev.c | 2 +- > include/block/block_int.h | 3 +- > qapi/block-core.json | 6 ++- > 4 files changed, 87 insertions(+), 33 deletions(-) > > diff --git a/block/mirror.c b/block/mirror.c > index 405e5c4..b599176 100644 > --- a/block/mirror.c > +++ b/block/mirror.c > @@ -50,7 +50,9 @@ typedef struct MirrorBlockJob { > int64_t bdev_length; > unsigned long *cow_bitmap; > BdrvDirtyBitmap *dirty_bitmap; > - HBitmapIter hbi; > + HBitmapIter zero_hbi; > + HBitmapIter allocated_hbi; > + HBitmapIter *hbi; > uint8_t *buf; > QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; > int buf_free_count; > @@ -60,6 +62,8 @@ typedef struct MirrorBlockJob { > int sectors_in_flight; > int ret; > bool unmap; > + bool zero_unallocated; > + bool zero_cycle; > bool waiting_for_io; > } MirrorBlockJob; > > @@ -166,10 +170,10 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > int pnum; > int64_t ret; > > - s->sector_num = hbitmap_iter_next(&s->hbi); > + s->sector_num = hbitmap_iter_next(s->hbi); > if (s->sector_num < 0) { > - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > - s->sector_num = hbitmap_iter_next(&s->hbi); > + bdrv_dirty_iter_init(s->dirty_bitmap, s->hbi); > + s->sector_num = hbitmap_iter_next(s->hbi); > trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); > assert(s->sector_num >= 0); > } > @@ -287,7 +291,7 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > */ > if (next_sector > hbitmap_next_sector > && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { > - hbitmap_next_sector = hbitmap_iter_next(&s->hbi); > + hbitmap_next_sector = hbitmap_iter_next(s->hbi); > } > > next_sector += sectors_per_chunk; > @@ -300,25 +304,34 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > s->sectors_in_flight += nb_sectors; > trace_mirror_one_iteration(s, sector_num, nb_sectors); > > - ret = bdrv_get_block_status_above(source, NULL, sector_num, > - nb_sectors, &pnum); > - if (ret < 0 || pnum < nb_sectors || > - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > - mirror_read_complete, op); > - } else if (ret & BDRV_BLOCK_ZERO) { > - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > - s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > - mirror_write_complete, op); > + if (s->zero_cycle) { > + ret = bdrv_get_block_status(s->target, sector_num, nb_sectors, > &pnum); > + if (!(ret & BDRV_BLOCK_ZERO)) { > + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > + s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > + mirror_write_complete, op); > + } > } else { > - assert(!(ret & BDRV_BLOCK_DATA)); > - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > - mirror_write_complete, op); > + ret = bdrv_get_block_status_above(source, NULL, sector_num, > + nb_sectors, &pnum); > + if (ret < 0 || pnum < nb_sectors || > + (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > + mirror_read_complete, op); > + } else if (ret & BDRV_BLOCK_ZERO) { > + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > + s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > + mirror_write_complete, op); > + } else { > + assert(!(ret & BDRV_BLOCK_DATA)); > + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > + mirror_write_complete, op); > + } > } > return delay_ns; > } > > -static int mirror_do_iteration(MirrorBlockJob *s, uint64_t last_pause_ns) > +static int mirror_do_iteration(MirrorBlockJob *s, uint64_t *last_pause_ns) > { > int ret; > > @@ -347,7 +360,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > * We do so every SLICE_TIME nanoseconds, or when there is an error, > * or when the source is clean, whichever comes first. > */ > - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < > SLICE_TIME > + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - *last_pause_ns < > SLICE_TIME > && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { > if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || > (cnt == 0 && s->in_flight > 0)) { > @@ -371,6 +384,14 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > goto immediate_exit; > } > } else { > + > + if (s->zero_cycle) { > + /* this is not the end of the streaming cycle, > + * if we are just filling in zeroes for unallocated > + * sectors prior to streaming the real data */ > + goto immediate_exit; > + } > + > /* We're out of the streaming phase. From now on, if the job > * is cancelled we will actually complete all pending I/O and > * report completion. This way, block-job-cancel will leave > @@ -419,7 +440,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > s->common.cancelled = false; > break; > } > - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + *last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > } > > immediate_exit: > @@ -511,6 +532,15 @@ static void coroutine_fn mirror_run(void *opaque) > checking for a NULL string */ > int ret = 0; > int n; > + BdrvDirtyBitmap *zero_dirty_bitmap; > + BdrvDirtyBitmap *allocated_dirty_bitmap = s->dirty_bitmap; > + > + zero_dirty_bitmap = bdrv_create_dirty_bitmap(s->target, > + s->granularity, NULL, true, > + NULL); > + if (zero_dirty_bitmap == NULL) { > + goto immediate_exit; > + }
I think I'd like the error to be reported to the user; but in any case,
you have to set ret to a negative value.
>
> if (block_job_is_cancelled(&s->common)) {
> goto immediate_exit;
> @@ -588,14 +618,33 @@ static void coroutine_fn mirror_run(void *opaque)
> assert(n > 0);
> if (ret == 1) {
> bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
> + } else if (s->zero_unallocated) {
> + bdrv_set_dirty_bitmap(zero_dirty_bitmap, sector_num, n);
> }
> sector_num += n;
> }
> }
>
> - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
> + bdrv_dirty_iter_init(s->dirty_bitmap, &s->allocated_hbi);
>
> - ret = mirror_do_iteration(s, last_pause_ns);
> + if (s->zero_unallocated) {
> + bdrv_dirty_iter_init(zero_dirty_bitmap, &s->zero_hbi);
> + s->dirty_bitmap = zero_dirty_bitmap;
> + s->hbi = &s->zero_hbi;
> +
> + s->zero_cycle = true;
> + ret = mirror_do_iteration(s, &last_pause_ns);
> + if (ret < 0) {
> + goto immediate_exit;
> + }
> +
> + mirror_drain(s);
> + s->zero_cycle = false;
> + }
> +
> + s->dirty_bitmap = allocated_dirty_bitmap;
> + s->hbi = &s->allocated_hbi;
> + ret = mirror_do_iteration(s, &last_pause_ns);
>
> immediate_exit:
> if (s->in_flight > 0) {
> @@ -611,7 +660,8 @@ immediate_exit:
> qemu_vfree(s->buf);
> g_free(s->cow_bitmap);
> g_free(s->in_flight_bitmap);
> - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
> + bdrv_release_dirty_bitmap(bs, allocated_dirty_bitmap);
> + bdrv_release_dirty_bitmap(NULL, zero_dirty_bitmap);
> bdrv_iostatus_disable(s->target);
>
> data = g_malloc(sizeof(*data));
> @@ -702,7 +752,7 @@ static void mirror_start_job(BlockDriverState *bs,
> BlockDriverState *target,
> int64_t buf_size,
> BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp,
> const BlockJobDriver *driver,
> @@ -737,6 +787,7 @@ static void mirror_start_job(BlockDriverState *bs,
> BlockDriverState *target,
> return;
> }
>
> + s->zero_unallocated = !existing && !bdrv_has_zero_init(target);
I think this should be set only if we're doing a full mirror operation.
For instance, I could do a none, top or incremental mirror to a new
qcow2 file, which would give it a backing file, obviously. You're lucky
in that qcow2 claims to always have zero initialization, when this is in
fact not true (someone's ought to fix that...): With a backing file, an
overlay file just cannot have zero initialization, it's impossible
(well, unless the backing file is completely zero).
So if qcow2 were to answer correctly, i.e. "No, with a backing file I do
not have zero init", then this would overwrite all sectors which are
supposed to be unallocated because they are present in the backing file.
> s->replaces = g_strdup(replaces);
> s->on_source_error = on_source_error;
> s->on_target_error = on_target_error;
> @@ -767,7 +818,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> int64_t speed, uint32_t granularity, int64_t buf_size,
> MirrorSyncMode mode, BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp)
> {
> @@ -782,8 +833,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
> mirror_start_job(bs, target, replaces,
> speed, granularity, buf_size,
> - on_source_error, on_target_error, unmap, cb, opaque,
> errp,
> - &mirror_job_driver, is_none_mode, base);
> + on_source_error, on_target_error, unmap, existing, cb,
> + opaque, errp, &mirror_job_driver, is_none_mode, base);
> }
>
> void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
> @@ -830,7 +881,7 @@ void commit_active_start(BlockDriverState *bs,
> BlockDriverState *base,
>
> bdrv_ref(base);
> mirror_start_job(bs, base, NULL, speed, 0, 0,
> - on_error, on_error, false, cb, opaque, &local_err,
> + on_error, on_error, false, false, cb, opaque,
> &local_err,
This should probably be true; the commit target is already existing,
after all. Also, without it being true, iotest 097 fails.
> &commit_active_job_driver, false, base);
> if (local_err) {
> error_propagate(errp, local_err);
> diff --git a/blockdev.c b/blockdev.c
> index cb9f78d..c06ac60 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2816,7 +2816,7 @@ void qmp_drive_mirror(const char *device, const char
> *target,
> has_replaces ? replaces : NULL,
> speed, granularity, buf_size, sync,
> on_source_error, on_target_error,
> - unmap,
> + unmap, mode == NEW_IMAGE_MODE_EXISTING,
> block_job_cb, bs, &local_err);
> if (local_err != NULL) {
> bdrv_unref(target_bs);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 14ad4c3..21a8988 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -614,6 +614,7 @@ void commit_active_start(BlockDriverState *bs,
> BlockDriverState *base,
> * @on_source_error: The action to take upon error reading from the source.
> * @on_target_error: The action to take upon error writing to the target.
> * @unmap: Whether to unmap target where source sectors only contain zeroes.
> + * @existing: Whether target image is an existing image prior to the QMP cmd.
> * @cb: Completion function for the job.
> * @opaque: Opaque pointer value passed to @cb.
> * @errp: Error object.
> @@ -628,7 +629,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState
> *target,
> int64_t speed, uint32_t granularity, int64_t buf_size,
> MirrorSyncMode mode, BlockdevOnError on_source_error,
> BlockdevOnError on_target_error,
> - bool unmap,
> + bool unmap, bool existing,
> BlockCompletionFunc *cb,
> void *opaque, Error **errp);
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index bb2189e..033afb4 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -952,8 +952,10 @@
> # broken Quorum files. (Since 2.1)
> #
> # @mode: #optional whether and how QEMU should create a new image, default is
> -# 'absolute-paths'.
> -#
This empty line should stay.
> +# 'absolute-paths'. If mode != 'existing', and the target does not
> +# have zero init (sparseness), then the target image will have
> sectors
> +# zeroed out that correspond to sectors in an unallocated state in
> the
> +# source image.
As I said above, this should only happen if @sync == 'full'.
Max
> # @speed: #optional the maximum speed, in bytes per second
> #
> # @sync: what parts of the disk image should be copied to the destination
>
signature.asc
Description: OpenPGP digital signature
