On 28.09.2015 05:29, Jeff Cody wrote: > During mirror, if the target device does not have support zero > initialization, a mirror may result in a corrupt image. > > For instance, on mirror to a host device with format = raw, whatever > random data is on the target device will still be there for unallocated > sectors. > > This is because during the mirror, we set the dirty bitmap to copy only > sectors allocated above 'base'. In the case of target devices where we > cannot assume unallocated sectors will be read as zeroes, we need to > explicitely zero out this data. > > In order to avoid zeroing out all sectors of the target device prior to > mirroring, we do zeroing as part of the block job. A second dirty > bitmap cache is created, to track sectors that are unallocated above > 'base'. These sectors are then checked for status of BDRV_BLOCK_ZERO > on the target - if they are not, then zeroes are explicitly written. > > This only occurs under two conditions: > > 1. 'mode' != "existing" > 2. bdrv_has_zero_init(target) == NULL > > We perform the mirroring through mirror_iteration() as before, except > in two passes. If the above two conditions are met, the first pass > is using the bitmap tracking unallocated sectors, to write the needed > zeroes. Then, the second pass is performed, to mirror the actual data > as before. > > If the above two conditions are not met, then the first pass is skipped, > and only the second pass (the one with the actual data) is performed. > > Signed-off-by: Jeff Cody <jc...@redhat.com> > --- > block/mirror.c | 109 > ++++++++++++++++++++++++++++++++++------------ > blockdev.c | 2 +- > include/block/block_int.h | 3 +- > qapi/block-core.json | 6 ++- > 4 files changed, 87 insertions(+), 33 deletions(-) > > diff --git a/block/mirror.c b/block/mirror.c > index 405e5c4..b599176 100644 > --- a/block/mirror.c > +++ b/block/mirror.c > @@ -50,7 +50,9 @@ typedef struct MirrorBlockJob { > int64_t bdev_length; > unsigned long *cow_bitmap; > BdrvDirtyBitmap *dirty_bitmap; > - HBitmapIter hbi; > + HBitmapIter zero_hbi; > + HBitmapIter allocated_hbi; > + HBitmapIter *hbi; > uint8_t *buf; > QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; > int buf_free_count; > @@ -60,6 +62,8 @@ typedef struct MirrorBlockJob { > int sectors_in_flight; > int ret; > bool unmap; > + bool zero_unallocated; > + bool zero_cycle; > bool waiting_for_io; > } MirrorBlockJob; > > @@ -166,10 +170,10 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > int pnum; > int64_t ret; > > - s->sector_num = hbitmap_iter_next(&s->hbi); > + s->sector_num = hbitmap_iter_next(s->hbi); > if (s->sector_num < 0) { > - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > - s->sector_num = hbitmap_iter_next(&s->hbi); > + bdrv_dirty_iter_init(s->dirty_bitmap, s->hbi); > + s->sector_num = hbitmap_iter_next(s->hbi); > trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); > assert(s->sector_num >= 0); > } > @@ -287,7 +291,7 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > */ > if (next_sector > hbitmap_next_sector > && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { > - hbitmap_next_sector = hbitmap_iter_next(&s->hbi); > + hbitmap_next_sector = hbitmap_iter_next(s->hbi); > } > > next_sector += sectors_per_chunk; > @@ -300,25 +304,34 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > s->sectors_in_flight += nb_sectors; > trace_mirror_one_iteration(s, sector_num, nb_sectors); > > - ret = bdrv_get_block_status_above(source, NULL, sector_num, > - nb_sectors, &pnum); > - if (ret < 0 || pnum < nb_sectors || > - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > - mirror_read_complete, op); > - } else if (ret & BDRV_BLOCK_ZERO) { > - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > - s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > - mirror_write_complete, op); > + if (s->zero_cycle) { > + ret = bdrv_get_block_status(s->target, sector_num, nb_sectors, > &pnum); > + if (!(ret & BDRV_BLOCK_ZERO)) { > + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > + s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > + mirror_write_complete, op); > + } > } else { > - assert(!(ret & BDRV_BLOCK_DATA)); > - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > - mirror_write_complete, op); > + ret = bdrv_get_block_status_above(source, NULL, sector_num, > + nb_sectors, &pnum); > + if (ret < 0 || pnum < nb_sectors || > + (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > + mirror_read_complete, op); > + } else if (ret & BDRV_BLOCK_ZERO) { > + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > + s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > + mirror_write_complete, op); > + } else { > + assert(!(ret & BDRV_BLOCK_DATA)); > + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > + mirror_write_complete, op); > + } > } > return delay_ns; > } > > -static int mirror_do_iteration(MirrorBlockJob *s, uint64_t last_pause_ns) > +static int mirror_do_iteration(MirrorBlockJob *s, uint64_t *last_pause_ns) > { > int ret; > > @@ -347,7 +360,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > * We do so every SLICE_TIME nanoseconds, or when there is an error, > * or when the source is clean, whichever comes first. > */ > - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < > SLICE_TIME > + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - *last_pause_ns < > SLICE_TIME > && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { > if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || > (cnt == 0 && s->in_flight > 0)) { > @@ -371,6 +384,14 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > goto immediate_exit; > } > } else { > + > + if (s->zero_cycle) { > + /* this is not the end of the streaming cycle, > + * if we are just filling in zeroes for unallocated > + * sectors prior to streaming the real data */ > + goto immediate_exit; > + } > + > /* We're out of the streaming phase. From now on, if the job > * is cancelled we will actually complete all pending I/O and > * report completion. This way, block-job-cancel will leave > @@ -419,7 +440,7 @@ static int mirror_do_iteration(MirrorBlockJob *s, > uint64_t last_pause_ns) > s->common.cancelled = false; > break; > } > - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + *last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > } > > immediate_exit: > @@ -511,6 +532,15 @@ static void coroutine_fn mirror_run(void *opaque) > checking for a NULL string */ > int ret = 0; > int n; > + BdrvDirtyBitmap *zero_dirty_bitmap; > + BdrvDirtyBitmap *allocated_dirty_bitmap = s->dirty_bitmap; > + > + zero_dirty_bitmap = bdrv_create_dirty_bitmap(s->target, > + s->granularity, NULL, true, > + NULL); > + if (zero_dirty_bitmap == NULL) { > + goto immediate_exit; > + }
I think I'd like the error to be reported to the user; but in any case, you have to set ret to a negative value. > > if (block_job_is_cancelled(&s->common)) { > goto immediate_exit; > @@ -588,14 +618,33 @@ static void coroutine_fn mirror_run(void *opaque) > assert(n > 0); > if (ret == 1) { > bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); > + } else if (s->zero_unallocated) { > + bdrv_set_dirty_bitmap(zero_dirty_bitmap, sector_num, n); > } > sector_num += n; > } > } > > - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > + bdrv_dirty_iter_init(s->dirty_bitmap, &s->allocated_hbi); > > - ret = mirror_do_iteration(s, last_pause_ns); > + if (s->zero_unallocated) { > + bdrv_dirty_iter_init(zero_dirty_bitmap, &s->zero_hbi); > + s->dirty_bitmap = zero_dirty_bitmap; > + s->hbi = &s->zero_hbi; > + > + s->zero_cycle = true; > + ret = mirror_do_iteration(s, &last_pause_ns); > + if (ret < 0) { > + goto immediate_exit; > + } > + > + mirror_drain(s); > + s->zero_cycle = false; > + } > + > + s->dirty_bitmap = allocated_dirty_bitmap; > + s->hbi = &s->allocated_hbi; > + ret = mirror_do_iteration(s, &last_pause_ns); > > immediate_exit: > if (s->in_flight > 0) { > @@ -611,7 +660,8 @@ immediate_exit: > qemu_vfree(s->buf); > g_free(s->cow_bitmap); > g_free(s->in_flight_bitmap); > - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); > + bdrv_release_dirty_bitmap(bs, allocated_dirty_bitmap); > + bdrv_release_dirty_bitmap(NULL, zero_dirty_bitmap); > bdrv_iostatus_disable(s->target); > > data = g_malloc(sizeof(*data)); > @@ -702,7 +752,7 @@ static void mirror_start_job(BlockDriverState *bs, > BlockDriverState *target, > int64_t buf_size, > BlockdevOnError on_source_error, > BlockdevOnError on_target_error, > - bool unmap, > + bool unmap, bool existing, > BlockCompletionFunc *cb, > void *opaque, Error **errp, > const BlockJobDriver *driver, > @@ -737,6 +787,7 @@ static void mirror_start_job(BlockDriverState *bs, > BlockDriverState *target, > return; > } > > + s->zero_unallocated = !existing && !bdrv_has_zero_init(target); I think this should be set only if we're doing a full mirror operation. For instance, I could do a none, top or incremental mirror to a new qcow2 file, which would give it a backing file, obviously. You're lucky in that qcow2 claims to always have zero initialization, when this is in fact not true (someone's ought to fix that...): With a backing file, an overlay file just cannot have zero initialization, it's impossible (well, unless the backing file is completely zero). So if qcow2 were to answer correctly, i.e. "No, with a backing file I do not have zero init", then this would overwrite all sectors which are supposed to be unallocated because they are present in the backing file. > s->replaces = g_strdup(replaces); > s->on_source_error = on_source_error; > s->on_target_error = on_target_error; > @@ -767,7 +818,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState > *target, > int64_t speed, uint32_t granularity, int64_t buf_size, > MirrorSyncMode mode, BlockdevOnError on_source_error, > BlockdevOnError on_target_error, > - bool unmap, > + bool unmap, bool existing, > BlockCompletionFunc *cb, > void *opaque, Error **errp) > { > @@ -782,8 +833,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState > *target, > base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; > mirror_start_job(bs, target, replaces, > speed, granularity, buf_size, > - on_source_error, on_target_error, unmap, cb, opaque, > errp, > - &mirror_job_driver, is_none_mode, base); > + on_source_error, on_target_error, unmap, existing, cb, > + opaque, errp, &mirror_job_driver, is_none_mode, base); > } > > void commit_active_start(BlockDriverState *bs, BlockDriverState *base, > @@ -830,7 +881,7 @@ void commit_active_start(BlockDriverState *bs, > BlockDriverState *base, > > bdrv_ref(base); > mirror_start_job(bs, base, NULL, speed, 0, 0, > - on_error, on_error, false, cb, opaque, &local_err, > + on_error, on_error, false, false, cb, opaque, > &local_err, This should probably be true; the commit target is already existing, after all. Also, without it being true, iotest 097 fails. > &commit_active_job_driver, false, base); > if (local_err) { > error_propagate(errp, local_err); > diff --git a/blockdev.c b/blockdev.c > index cb9f78d..c06ac60 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -2816,7 +2816,7 @@ void qmp_drive_mirror(const char *device, const char > *target, > has_replaces ? replaces : NULL, > speed, granularity, buf_size, sync, > on_source_error, on_target_error, > - unmap, > + unmap, mode == NEW_IMAGE_MODE_EXISTING, > block_job_cb, bs, &local_err); > if (local_err != NULL) { > bdrv_unref(target_bs); > diff --git a/include/block/block_int.h b/include/block/block_int.h > index 14ad4c3..21a8988 100644 > --- a/include/block/block_int.h > +++ b/include/block/block_int.h > @@ -614,6 +614,7 @@ void commit_active_start(BlockDriverState *bs, > BlockDriverState *base, > * @on_source_error: The action to take upon error reading from the source. > * @on_target_error: The action to take upon error writing to the target. > * @unmap: Whether to unmap target where source sectors only contain zeroes. > + * @existing: Whether target image is an existing image prior to the QMP cmd. > * @cb: Completion function for the job. > * @opaque: Opaque pointer value passed to @cb. > * @errp: Error object. > @@ -628,7 +629,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState > *target, > int64_t speed, uint32_t granularity, int64_t buf_size, > MirrorSyncMode mode, BlockdevOnError on_source_error, > BlockdevOnError on_target_error, > - bool unmap, > + bool unmap, bool existing, > BlockCompletionFunc *cb, > void *opaque, Error **errp); > > diff --git a/qapi/block-core.json b/qapi/block-core.json > index bb2189e..033afb4 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -952,8 +952,10 @@ > # broken Quorum files. (Since 2.1) > # > # @mode: #optional whether and how QEMU should create a new image, default is > -# 'absolute-paths'. > -# This empty line should stay. > +# 'absolute-paths'. If mode != 'existing', and the target does not > +# have zero init (sparseness), then the target image will have > sectors > +# zeroed out that correspond to sectors in an unallocated state in > the > +# source image. As I said above, this should only happen if @sync == 'full'. Max > # @speed: #optional the maximum speed, in bytes per second > # > # @sync: what parts of the disk image should be copied to the destination >
signature.asc
Description: OpenPGP digital signature