[PATCH v13 10/10] block: apply COR-filter to block-stream jobs
This patch completes the series with the COR-filter applied to block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, which will reduce disk overuse. The COR-filter insertion incurs changes in the test case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the test #030 as well. The test case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. All the nodes involved in one job are being frozen, including the filter node. Operations over the mentioned nodes, including the filter one, are being blocked for other jobs. So, the filter node gets involved in two concurrent jobs with the adjacent data node. That is not allowed. It is what the test cases with overlapping jobs are about. The concept of parallel jobs with common nodes is no longer considered vital. Signed-off-by: Andrey Shinkevich --- block/stream.c | 97 ++ tests/qemu-iotests/030 | 51 +++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +++ 5 files changed, 86 insertions(+), 90 deletions(-) diff --git a/block/stream.c b/block/stream.c index 061268b..2f80fae 100644 --- a/block/stream.c +++ b/block/stream.c @@ -18,8 +18,10 @@ #include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -34,6 +36,8 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; @@ -45,8 +49,7 @@ static int coroutine_fn stream_populate(BlockBackend
*blk, { assert(bytes < SIZE_MAX); -return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); +return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH); } static void stream_abort(Job *job) @@ -54,24 +57,21 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = >common; -bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); BlockDriverState *base_unfiltered; BlockDriverState *backing_bs; Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_backing_chain(bs, s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); s->chain_frozen = false; if (bdrv_cow_child(unfiltered_bs)) { @@ -79,7 +79,7 @@ static int stream_prepare(Job *job) if (base) { base_id = s->backing_file_str; if (base_id) { -backing_bs = bdrv_find_backing_image(bs, base_id); +backing_bs = bdrv_find_backing_image(unfiltered_bs, base_id); if (backing_bs && backing_bs->drv) { base_fmt = backing_bs->drv->format_name; } else { @@ -111,15 +111,16 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); -bdrv_reopen_set_read_only(bs, true, NULL); +bdrv_reopen_set_read_only(s->target_bs, true, NULL); } 
+bdrv_cor_filter_drop(s->cor_filter_bs); + g_free(s->backing_file_str); } @@ -127,9 +128,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); -BlockDriverState *unfilt
[PATCH v13 03/10] copy-on-read: add filter drop function
Provide an API for the COR-filter removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. To insert the filter, the block generic layer function bdrv_insert_node() can be used. The new function bdrv_cor_filter_drop() may be considered as an intermediate solution until the QEMU permission update system has been overhauled. Then we will be able to implement the API function bdrv_remove_node() on the block generic layer. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 56 block/copy-on-read.h | 32 ++ 2 files changed, 88 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..618c4c4 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,20 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +51,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + +/* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent.
+ */ + return 0; } @@ -57,6 +73,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +162,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -154,6 +182,34 @@ static BlockDriver bdrv_copy_on_read = { .is_filter = true, }; + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filter_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. */ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, _abort); +bdrv_replace_node(cor_filter_bs, bs, _abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + static void bdrv_copy_on_read_init(void) { bdrv_register(_copy_on_read); diff --git a/block/copy-on-read.h b/block/copy-on-read.h new file mode 100644 index 000..7bf405d --- /dev/null +++ b/block/copy-on-read.h @@ -0,0 +1,32 @@ +/* + * Copy-on-read filter block driver + * + * The filter driver performs Copy-On-Read (COR) operations + * + * Copyright (c) 2018-2020 Virtuozzo International GmbH. 
+ * + * Author: + * Andrey Shinkevich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef BLOCK_COPY_ON_READ +#define BLOCK_COPY_ON_READ + +#include "block/block_int.h" + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs); + +#endif /* BLOCK_COPY_ON_READ */ -- 1.8.3.1
[PATCH v13 01/10] copy-on-read: support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2816e61..cb03e0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -137,8 +141,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v13 02/10] block: add API function to insert a node
Provide an API for inserting a node into a backing chain. Suggested-by: Max Reitz Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block.c | 25 + include/block/block.h | 2 ++ 2 files changed, 27 insertions(+) diff --git a/block.c b/block.c index f1cedac..b71c39f 100644 --- a/block.c +++ b/block.c @@ -4698,6 +4698,31 @@ static void bdrv_delete(BlockDriverState *bs) g_free(bs); } +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *new_node_bs; +Error *local_err = NULL; + +new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (new_node_bs == NULL) { +error_prepend(errp, "Could not create node: "); +return NULL; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, new_node_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(new_node_bs); +error_propagate(errp, local_err); +return NULL; +} + +return new_node_bs; +} + /* * Run consistency checks on an image * diff --git a/include/block/block.h b/include/block/block.h index c9d7c58..81a3894 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -350,6 +350,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, Error **errp); void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); -- 1.8.3.1
[PATCH v13 06/10] iotests: add #310 to test bottom node in COR driver
The test case #310 is similar to #216 by Max Reitz. The difference is that the test #310 involves a bottom node to the COR filter driver. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/310 | 114 + tests/qemu-iotests/310.out | 15 ++ tests/qemu-iotests/group | 1 + 3 files changed, 130 insertions(+) create mode 100755 tests/qemu-iotests/310 create mode 100644 tests/qemu-iotests/310.out diff --git a/tests/qemu-iotests/310 b/tests/qemu-iotests/310 new file mode 100755 index 000..c8b34cd --- /dev/null +++ b/tests/qemu-iotests/310 @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# +# Copy-on-read tests using a COR filter with a bottom node +# +# Copyright (C) 2018 Red Hat, Inc. +# Copyright (c) 2020 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import iotests +from iotests import log, qemu_img, qemu_io_silent + +# Need backing file support +iotests.script_initialize(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk'], + supported_platforms=['linux']) + +log('') +log('=== Copy-on-read across nodes ===') +log('') + +# This test is similar to the 216 one by Max Reitz +# The difference is that this test case involves a bottom node to the +# COR filter driver. 
+ +with iotests.FilePath('base.img') as base_img_path, \ + iotests.FilePath('mid.img') as mid_img_path, \ + iotests.FilePath('top.img') as top_img_path, \ + iotests.VM() as vm: + +log('--- Setting up images ---') +log('') + +assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0 +assert qemu_io_silent(base_img_path, '-c', 'write -P 1 0M 1M') == 0 +assert qemu_io_silent(base_img_path, '-c', 'write -P 1 3M 1M') == 0 +assert qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path, +'-F', iotests.imgfmt, mid_img_path) == 0 +assert qemu_io_silent(mid_img_path, '-c', 'write -P 3 2M 1M') == 0 +assert qemu_io_silent(mid_img_path, '-c', 'write -P 3 4M 1M') == 0 +assert qemu_img('create', '-f', iotests.imgfmt, '-b', mid_img_path, +'-F', iotests.imgfmt, top_img_path) == 0 +assert qemu_io_silent(top_img_path, '-c', 'write -P 2 1M 1M') == 0 + +# 0 1 2 3 4 +# top2 +# mid 3 3 +# base 1 1 + +log('Done') + +log('') +log('--- Doing COR ---') +log('') + +vm.launch() + +log(vm.qmp('blockdev-add', + node_name='node0', + driver='copy-on-read', + bottom='node2', + file={ + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': top_img_path + }, + 'backing': { + 'node-name': 'node2', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': mid_img_path + }, + 'backing': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': base_img_path + } + }, + } + })) + +# Trigger COR +log(vm.qmp('human-monitor-command', + command_line='qemu-io node0 "read 0 5M"')) + +vm.shutdown() + +log('') +log('--- Checking COR result ---') +log('') + +assert qemu_io_silent(base_img_path, '-c', 'discard 0 4M') == 0 +assert qemu_io_silent(mid_img_path, '-c', 'discard 0M 5M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 0 0 1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 2 1M 1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 3 2M 1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 0 3M 
1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 3 4M 1M') == 0 + +log('Done') diff --git a/tests/qemu-iotests/310.out b/tests/qemu-iotests/310.out new file mode 100644 index 000..a70aa5c --- /dev/null +++ b/tests/qemu-iotests/310.out @@ -0,0 +1,15 @@ + +=== Copy-on-r
[PATCH v13 09/10] stream: skip filters when writing backing file name to QCOW2 header
Avoid writing a filter JSON file name and a filter format name to QCOW2 image when the backing file is being changed after the block stream job. It can occur due to a concurrent commit job on the same backing chain. A user is still able to assign the 'backing-file' parameter for a block-stream job keeping in mind the possible issue mentioned above. If the user does not specify the 'backing-file' parameter, QEMU will assign it automatically. Signed-off-by: Andrey Shinkevich --- block/stream.c | 21 +++-- blockdev.c | 8 +--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/block/stream.c b/block/stream.c index 6e281c7..061268b 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,6 +17,7 @@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" @@ -65,6 +66,8 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); +BlockDriverState *base_unfiltered; +BlockDriverState *backing_bs; Error *local_err = NULL; int ret = 0; @@ -75,8 +78,22 @@ static int stream_prepare(Job *job) const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; +if (base_id) { +backing_bs = bdrv_find_backing_image(bs, base_id); +if (backing_bs && backing_bs->drv) { +base_fmt = backing_bs->drv->format_name; +} else { +error_report("Format not found for backing file %s", + s->backing_file_str); +} +} else { +base_unfiltered = bdrv_skip_filters(base); +if (base_unfiltered) { +base_id = base_unfiltered->filename; +if (base_unfiltered->drv) { +base_fmt = base_unfiltered->drv->format_name; +} +} } } bdrv_set_backing_hd(unfiltered_bs, base, _err); diff --git a/blockdev.c b/blockdev.c index c917625..70900f4 100644 --- a/blockdev.c +++ 
b/blockdev.c @@ -2508,7 +2508,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; -const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2536,7 +2535,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); -base_name = base; } if (has_base_node) { @@ -2551,7 +2549,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); -base_name = base_bs->filename; } /* Check for op blockers in the whole chain between bs and base */ @@ -2571,9 +2568,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } -/* backing_file string overrides base bs filename */ -base_name = has_backing_file ? backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2581,7 +2575,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } -stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, +stream_start(has_job_id ? job_id : NULL, bs, base_bs, backing_file, job_flags, has_speed ? speed : 0, on_error, filter_node_name, _err); if (local_err) { -- 1.8.3.1
[PATCH v13 07/10] block: include supported_read_flags into BDS structure
Add the new member supported_read_flags to the BlockDriverState structure. It will control the flags set for copy-on-read operations. Make the block generic layer evaluate supported read flags before they go to a block driver. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/io.c| 12 ++-- include/block/block_int.h | 4 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index ec5e152..e28b11c 100644 --- a/block/io.c +++ b/block/io.c @@ -1405,6 +1405,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, if (flags & BDRV_REQ_COPY_ON_READ) { int64_t pnum; +/* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ +flags &= ~BDRV_REQ_COPY_ON_READ; + ret = bdrv_is_allocated(bs, offset, bytes, ); if (ret < 0) { goto out; @@ -1426,9 +1429,13 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } +if (flags & ~bs->supported_read_flags) { +abort(); +} + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { -ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); +ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); goto out; } @@ -1441,7 +1448,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, num, qiov, - qiov_offset + bytes - bytes_remaining, 0); + qiov_offset + bytes - bytes_remaining, + flags); max_bytes -= num; } else { num = bytes_remaining; diff --git a/include/block/block_int.h b/include/block/block_int.h index c05fa1e..247e166 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -873,6 +873,10 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; +/* + * Flags honored during pread + */ +unsigned int supported_read_flags; /* Flags honored during pwrite (so far: BDRV_REQ_FUA, * BDRV_REQ_WRITE_UNCHANGED). 
* If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those -- 1.8.3.1
[PATCH v13 00/10] Apply COR-filter to the block-stream permanently
The previous version 12 was discussed in the email thread: Message-Id: <1603390423-980205-1-git-send-email-andrey.shinkev...@virtuozzo.com> v13: 02: The bdrv_remove_node() was dropped. 05: Three patches with fixes were merged into one. 06: Minor changes based on Vladimir's suggestions. 08: Three patches with fixes were merged into one. 09: The search for format_name of backing file was added. 10: The flag BLK_PERM_GRAPH_MOD was removed. Andrey Shinkevich (10): copy-on-read: support preadv/pwritev_part functions block: add API function to insert a node copy-on-read: add filter drop function qapi: add filter-node-name to block-stream qapi: create BlockdevOptionsCor structure for COR driver iotests: add #310 to test bottom node in COR driver block: include supported_read_flags into BDS structure copy-on-read: skip non-guest reads if no copy needed stream: skip filters when writing backing file name to QCOW2 header block: apply COR-filter to block-stream jobs block.c| 25 +++ block/copy-on-read.c | 143 + block/copy-on-read.h | 32 + block/io.c | 12 +++- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 120 +++--- blockdev.c | 12 ++-- include/block/block.h | 10 ++- include/block/block_int.h | 11 +++- qapi/block-core.json | 27 +++- tests/qemu-iotests/030 | 51 ++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +-- tests/qemu-iotests/310 | 114 tests/qemu-iotests/310.out | 15 + tests/qemu-iotests/group | 1 + 17 files changed, 484 insertions(+), 121 deletions(-) create mode 100644 block/copy-on-read.h create mode 100755 tests/qemu-iotests/310 create mode 100644 tests/qemu-iotests/310.out -- 1.8.3.1
[PATCH v13 05/10] qapi: create BlockdevOptionsCor structure for COR driver
Create the BlockdevOptionsCor structure for the COR driver specific options, splitting it off from the BlockdevOptionsGenericFormat. The only option 'bottom' node in the structure denotes an image file that limits the COR operations in the backing chain. We are going to use the COR-filter for a block-stream job and will pass a bottom node name to the COR driver. The bottom node is the first non-filter overlay of the base. It was introduced because the base node itself may change due to possible concurrent jobs. Suggested-by: Max Reitz Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 57 ++-- qapi/block-core.json | 21 ++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 618c4c4..2cddc96 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,18 +24,23 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; +BlockDriverState *bottom_bs; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BlockDriverState *bottom_bs = NULL; BDRVStateCOR *state = bs->opaque; +/* Find a bottom node name, if any */ +const char *bottom_node = qdict_get_try_str(options, "bottom"); bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -51,7 +56,17 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +if (bottom_node) { +bottom_bs = bdrv_lookup_bs(NULL, bottom_node, errp); +if (!bottom_bs) { +error_setg(errp, "Bottom node '%s' not found", bottom_node); +qdict_del(options, "bottom"); +return -EINVAL; +} +qdict_del(options, "bottom"); +} state->active = true; +state->bottom_bs =
bottom_bs; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions @@ -107,8 +122,46 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { -return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); +int64_t n; +int local_flags; +int ret; +BDRVStateCOR *state = bs->opaque; + +if (!state->bottom_bs) { +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); +} + +while (bytes) { +local_flags = flags; + +/* In case of failure, try to copy-on-read anyway */ +ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); +if (ret <= 0) { +ret = bdrv_is_allocated_above(bdrv_backing_chain_next(bs->file->bs), + state->bottom_bs, true, offset, + n, ); +if (ret == 1 || ret < 0) { +local_flags |= BDRV_REQ_COPY_ON_READ; +} +/* Finish earlier if the end of a backing file has been reached */ +if (n == 0) { +break; +} +} + +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} + +offset += n; +qiov_offset += n; +bytes -= n; +} + +return 0; } diff --git a/qapi/block-core.json b/qapi/block-core.json index 8ef3df6..04055ef 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3942,6 +3942,25 @@ 'data': { 'throttle-group': 'str', 'file' : 'BlockdevRef' } } + +## +# @BlockdevOptionsCor: +# +# Driver specific block device options for the copy-on-read driver. +# +# @bottom: the name of a non-filter node (allocation-bearing layer) that limits +# the COR operations in the backing chain (inclusive). +# For the block-stream job, it will be the first non-filter overlay of +# the base node. We do not involve the base node into the COR +# operations because the base may change due to a concurrent +# block-commit job on the same backing chain. +# +# Since: 5.2 +## +{ 'struct': 'BlockdevOptionsCor', + 'base': 'Blo
[PATCH v13 08/10] copy-on-read: skip non-guest reads if no copy needed
If the flag BDRV_REQ_PREFETCH was set, skip idling read/write operations in the COR driver. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job at the moment. Add the BDRV_REQ_PREFETCH flag to the supported_read_flags of the COR-filter. block: Modify the comment for the flag BDRV_REQ_PREFETCH as we are going to use it alone and pass it to the COR-filter driver for further processing. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ include/block/block.h | 8 +--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2cddc96..123d197 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -49,6 +49,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } +bs->supported_read_flags = BDRV_REQ_PREFETCH; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); @@ -150,10 +152,14 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } -ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); -if (ret < 0) { -return ret; +/* Skip if neither read nor write are needed */ +if ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) != +BDRV_REQ_PREFETCH) { +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} } offset += n; diff --git a/include/block/block.h b/include/block/block.h index 81a3894..3499554 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -81,9 +81,11 @@ typedef enum { BDRV_REQ_NO_FALLBACK= 0x100, /* - * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ - * on read request and means that caller doesn't really need data to be - * written to qiov parameter which may be NULL.
+ * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read + * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR + * filter is involved), in which case it signals that the COR operation + * need not read the data into memory (qiov) but only ensure they are + * copied to the top layer (i.e., that COR operation is done). */ BDRV_REQ_PREFETCH = 0x200, /* Mask of valid flags */ -- 1.8.3.1
[PATCH v13 04/10] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 4 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index d15a2be..e8a58f3 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -508,8 +508,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - ); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, ); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 236384f..6e281c7 100644 --- a/block/stream.c +++ b/block/stream.c @@ -221,7 +221,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index fe6fb5d..c917625 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2499,6 +2499,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2581,7 +2582,8 @@ void qmp_block_stream(bool 
has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, _err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, _err); if (local_err) { error_propagate(errp, local_err); goto out; diff --git a/include/block/block_int.h b/include/block/block_int.h index 95d9333..c05fa1e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1134,6 +1134,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1146,7 +1149,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index 04ad80b..8ef3df6 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2543,6 +2543,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. # +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.2) +# # @auto-finalize: When false, this job will wait in a PENDING state after it has # finished its work, waiting for @block-job-finalize before # making any block graph changes. 
@@ -2573,6 +2578,7 @@ 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', '*base-node': 'str', '*backing-file': 'str', '*speed': 'int', '*on-error': 'Block
Re: [PATCH v12 14/14] block: apply COR-filter to block-stream jobs
On 27.10.2020 21:24, Andrey Shinkevich wrote: On 27.10.2020 20:57, Vladimir Sementsov-Ogievskiy wrote: 27.10.2020 20:48, Andrey Shinkevich wrote: On 27.10.2020 19:13, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 98 ++ tests/qemu-iotests/030 | 51 +++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +++ 5 files changed, 87 insertions(+), 90 deletions(-) diff --git a/block/stream.c b/block/stream.c [...] + s = block_job_create(job_id, _job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, + basic_flags | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, I think that BLK_PERM_GRAPH_MOD is something outdated. We have chain-feeze, what BLK_PERM_GRAPH_MOD adds to it? I don't know, and doubt that somebody knows. That is true for the commit/mirror jobs also. If we agree to remove the flag BLK_PERM_GRAPH_MOD from all these jobs, it will be made in a separate series, won't it? Hmm. 
At least, let's not implement new logic based on BLK_PERM_GRAPH_MOD. In original code it's only block_job_create's perm, not in shared_perm, not somewhere else.. So, if we keep it, let's keep it as is: only in perm in block_job_create, not implementing additional perm/shared_perm logic. With @perm=0 in the block_job_add_bdrv(>common, "active node"...), it won't. speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; } + /* + * Prevent concurrent jobs trying to modify the graph structure here, we + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ + if (block_job_add_bdrv(>common, "active node", bs, + basic_flags | BLK_PERM_GRAPH_MOD, why not 0, like for other nodes? We don't use this BdrvChild at all, why to requre permissions? Yes, '0' s right. + basic_flags | BLK_PERM_WRITE, _abort)) { + goto fail; + } + /* Block all intermediate nodes between bs and base, because [...] diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 index dcb4b5d..0064590 100755 --- a/tests/qemu-iotests/030 +++ b/tests/qemu-iotests/030 @@ -227,61 +227,20 @@ class TestParallelOps(iotests.QMPTestCase): for img in self.imgs: os.remove(img) - # Test that it's possible to run several block-stream operations - # in parallel in the same snapshot chain - @unittest.skipIf(os.environ.get('QEMU_CHECK_BLOCK_AUTO'), 'disabled in CI') - def test_stream_parallel(self): Didn't we agree to add "bottom" paramter to qmp? Than this test-case can be rewritten using node-names and new "bottom" stream argument. The QMP new "bottom" option is passed to the COR-driver. It is done withing the stream-job code. So, it works. I guess it will not help for the whole test. Particularly, there is an issue with freezing the child link to COR-filter of the cuncurrent job, then it fails to finish first. We should not have such frozen link, as our bottom node should be above COR-filter of concurrent job. 
The bdrv_freeze_backing_chain(bs, above_base, errp) does that job. Max insisted on keeping it. Andrey I have kept the test_stream_parallel() deleted in the coming v13 because it was agreed to make the above_base node frozen. With this, the test case cannot pass. This is also the case because operations over the COR-filter node are blocked for the parallel jobs. Andrey
[PATCH v3 3/5] monitor: let QMP monitor track JSON message content
We are going to allow the QMP monitor reading data from input channel more than one byte at once to increase the performance. With the OOB compatibility disabled, the monitor queues one QMP command at most. It was done for the backward compatibility as stated in the comment before pushing a command into the queue. To keep that concept functional, the monitor should track the end of a single QMP command. It allows the dispatcher handling the command and send a response to client in time. Signed-off-by: Andrey Shinkevich --- include/qapi/qmp/json-parser.h | 5 +++-- monitor/qmp.c | 18 -- qga/main.c | 2 +- qobject/json-lexer.c | 30 +- qobject/json-parser-int.h | 8 +--- qobject/json-streamer.c| 15 --- qobject/qjson.c| 2 +- tests/qtest/libqtest.c | 2 +- 8 files changed, 56 insertions(+), 26 deletions(-) diff --git a/include/qapi/qmp/json-parser.h b/include/qapi/qmp/json-parser.h index 7345a9b..039addb 100644 --- a/include/qapi/qmp/json-parser.h +++ b/include/qapi/qmp/json-parser.h @@ -36,8 +36,9 @@ void json_message_parser_init(JSONMessageParser *parser, Error *err), void *opaque, va_list *ap); -void json_message_parser_feed(JSONMessageParser *parser, - const char *buffer, size_t size); +size_t json_message_parser_feed(JSONMessageParser *parser, + const char *buffer, size_t size, + bool track_qmp); void json_message_parser_flush(JSONMessageParser *parser); diff --git a/monitor/qmp.c b/monitor/qmp.c index a86ed35..0b39c62 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -367,8 +367,22 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) static void monitor_qmp_read(void *opaque, const uint8_t *buf, int size) { MonitorQMP *mon = opaque; - -json_message_parser_feed(>parser, (const char *) buf, size); +char *cursor = (char *) buf; +size_t len; + +while (size > 0) { +len = json_message_parser_feed(>parser, (const char *) cursor, + size, true); +cursor += len; +size -= len; + +if (size > 0) { +/* Let the dispatcher process the QMP command */ +while 
(qatomic_mb_read(>common.suspend_cnt)) { +g_usleep(20); +} +} +} } static QDict *qmp_greeting(MonitorQMP *mon) diff --git a/qga/main.c b/qga/main.c index dea6a3a..16de642 100644 --- a/qga/main.c +++ b/qga/main.c @@ -605,7 +605,7 @@ static gboolean channel_event_cb(GIOCondition condition, gpointer data) case G_IO_STATUS_NORMAL: buf[count] = 0; g_debug("read data, count: %d, data: %s", (int)count, buf); -json_message_parser_feed(>parser, (char *)buf, (int)count); +json_message_parser_feed(>parser, (char *)buf, (int)count, false); break; case G_IO_STATUS_EOF: g_debug("received EOF"); diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c index 632320d..1fefbae 100644 --- a/qobject/json-lexer.c +++ b/qobject/json-lexer.c @@ -280,10 +280,11 @@ void json_lexer_init(JSONLexer *lexer, bool enable_interpolation) lexer->x = lexer->y = 0; } -static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) +static JSONTokenType json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) { int new_state; bool char_consumed = false; +JSONTokenType ret; lexer->x++; if (ch == '\n') { @@ -310,16 +311,16 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) case JSON_FLOAT: case JSON_KEYWORD: case JSON_STRING: -json_message_process_token(lexer, lexer->token, new_state, - lexer->x, lexer->y); +ret = json_message_process_token(lexer, lexer->token, new_state, + lexer->x, lexer->y); /* fall through */ case IN_START: g_string_truncate(lexer->token, 0); new_state = lexer->start_state; break; case JSON_ERROR: -json_message_process_token(lexer, lexer->token, JSON_ERROR, - lexer->x, lexer->y); +ret = json_message_process_token(lexer, lexer->token, JSON_ERROR, + lexer->x, lexer->y); new_state = IN_RECOVERY; /* fall through */ case IN_RECOVERY: @@ -335,20 +336,31 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) * this is a security consideration. */ if (lexer->token
[PATCH v3 5/5] monitor: increase amount of data for monitor to read
QMP and HMP monitors read one byte at a time from the socket or stdin, which is very inefficient. With 100+ VMs on the host, this results in multiple extra system calls and CPU overuse. This patch increases the amount of read data up to 4096 bytes that fits the buffer size on the channel level. Suggested-by: Denis V. Lunev Signed-off-by: Andrey Shinkevich --- monitor/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitor/monitor.c b/monitor/monitor.c index 84222cd..43d2d3b 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -566,7 +566,7 @@ int monitor_can_read(void *opaque) { Monitor *mon = opaque; -return !qatomic_mb_read(>suspend_cnt); +return !qatomic_mb_read(>suspend_cnt) ? CHR_READ_BUF_LEN : 0; } void monitor_list_append(Monitor *mon) -- 1.8.3.1
[PATCH v3 2/5] monitor: drain requests queue with 'channel closed' event
When CHR_EVENT_CLOSED comes, the QMP requests queue may still contain unprocessed commands. It can happen with QMP capability OOB enabled. Let the dispatcher complete handling requests rest in the monitor queue. Signed-off-by: Andrey Shinkevich --- monitor/qmp.c | 46 +- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/monitor/qmp.c b/monitor/qmp.c index 7169366..a86ed35 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -75,36 +75,32 @@ static void monitor_qmp_cleanup_req_queue_locked(MonitorQMP *mon) } } -static void monitor_qmp_cleanup_queue_and_resume(MonitorQMP *mon) +/* + * Let unprocessed QMP commands be handled. + */ +static void monitor_qmp_drain_queue(MonitorQMP *mon) { -qemu_mutex_lock(>qmp_queue_lock); +bool q_is_empty = false; -/* - * Same condition as in monitor_qmp_dispatcher_co(), but before - * removing an element from the queue (hence no `- 1`). - * Also, the queue should not be empty either, otherwise the - * monitor hasn't been suspended yet (or was already resumed). - */ -bool need_resume = (!qmp_oob_enabled(mon) || -mon->qmp_requests->length == QMP_REQ_QUEUE_LEN_MAX) -&& !g_queue_is_empty(mon->qmp_requests); +while (!q_is_empty) { +qemu_mutex_lock(>qmp_queue_lock); +q_is_empty = g_queue_is_empty(mon->qmp_requests); +qemu_mutex_unlock(>qmp_queue_lock); -monitor_qmp_cleanup_req_queue_locked(mon); +if (!q_is_empty) { +if (!qatomic_xchg(_dispatcher_co_busy, true)) { +/* Kick the dispatcher coroutine */ +aio_co_wake(qmp_dispatcher_co); +} else { +/* Let the dispatcher do its job for a while */ +g_usleep(40); +} +} +} -if (need_resume) { -/* - * handle_qmp_command() suspended the monitor because the - * request queue filled up, to be resumed when the queue has - * space again. We just emptied it; resume the monitor. - * - * Without this, the monitor would remain suspended forever - * when we get here while the monitor is suspended. An - * unfortunately timed CHR_EVENT_CLOSED can do the trick. 
- */ +if (qatomic_mb_read(>common.suspend_cnt)) { monitor_resume(>common); } - -qemu_mutex_unlock(>qmp_queue_lock); } void qmp_send_response(MonitorQMP *mon, const QDict *rsp) @@ -418,7 +414,7 @@ static void monitor_qmp_event(void *opaque, QEMUChrEvent event) * stdio, it's possible that stdout is still open when stdin * is closed. */ -monitor_qmp_cleanup_queue_and_resume(mon); +monitor_qmp_drain_queue(mon); json_message_parser_destroy(>parser); json_message_parser_init(>parser, handle_qmp_command, mon, NULL); -- 1.8.3.1
Re: [PATCH v2 2/2] monitor: increase amount of data for monitor to read
On 24.11.2020 14:03, Vladimir Sementsov-Ogievskiy wrote: 23.11.2020 18:44, Andrey Shinkevich wrote: QMP and HMP monitors read one byte at a time from the socket or stdin, which is very inefficient. With 100+ VMs on the host, this results in multiple extra system calls and CPU overuse. This patch increases the amount of read data up to 4096 bytes that fits the buffer size on the channel level. A JSON little parser is introduced to throttle QMP commands read from the buffer so that incoming requests do not overflow the monitor input queue. Suggested-by: Denis V. Lunev Signed-off-by: Andrey Shinkevich Can't we just increase qmp queue instead? It seems a lot simpler: With the OOB compatibility disabled, the monitor queues one QMP command at most. This was done for backward compatibility, as stated in the comment before pushing a command into the queue. To keep that concept functional, the monitor should track the end of a single QMP command. This allows the dispatcher to handle the command and send a response to the client in time. With the patch below, the monitor queue will be filled with as many QMP commands as are found in the input buffer. The first command execution {"execute":"qmp_capabilities"} takes more time and the queue will be filled completely. Then the dispatcher starts execution of other commands in the monitor queue. The process becomes synchronous. In this case, we need neither the thread nor the queue. 
Andrey diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 348bfad3d5..7e721eee3f 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -8,7 +8,7 @@ typedef struct MonitorHMP MonitorHMP; typedef struct MonitorOptions MonitorOptions; -#define QMP_REQ_QUEUE_LEN_MAX 8 +#define QMP_REQ_QUEUE_LEN_MAX 4096 extern QemuOptsList qemu_mon_opts; diff --git a/monitor/monitor.c b/monitor/monitor.c index 84222cd130..1588f00306 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -566,7 +566,7 @@ int monitor_can_read(void *opaque) { Monitor *mon = opaque; - return !qatomic_mb_read(>suspend_cnt); + return !qatomic_mb_read(>suspend_cnt) ? 4096 : 0; } - with this patch tests pass and performance is even better.
[PATCH v3 0/5] Increase amount of data for monitor to read
The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html https://patchew.org/QEMU/20190610105906.28524-1-dplotni...@virtuozzo.com/# Message-ID: <31dd78ba-bd64-2ed6-3c8f-eed4e904d...@virtuozzo.com> and v2: Message-Id: <1606146274-246154-1-git-send-email-andrey.shinkev...@virtuozzo.com> This series is a solution for the issue with overflow of the monitor queue with QMP requests if we keep the maximum queue length unchanged (=8). v3: 01: New 02: New 03: The additional little JSON parser removed and the resources of the existing JSON parser were used to track the end of a QMP command. 04: The amount of read input data increases only. Andrey Shinkevich (4): monitor: change function obsolete name in comments monitor: drain requests queue with 'channel closed' event monitor: let QMP monitor track JSON message content monitor: increase amount of data for monitor to read Vladimir Sementsov-Ogievskiy (1): iotests: 129 don't check backup "busy" include/qapi/qmp/json-parser.h | 5 ++-- monitor/monitor.c | 2 +- monitor/qmp.c | 66 -- qga/main.c | 2 +- qobject/json-lexer.c | 30 +-- qobject/json-parser-int.h | 8 +++-- qobject/json-streamer.c| 15 +- qobject/qjson.c| 2 +- tests/qemu-iotests/129 | 1 - tests/qtest/libqtest.c | 2 +- 10 files changed, 79 insertions(+), 54 deletions(-) -- 1.8.3.1
[PATCH v3 4/5] iotests: 129 don't check backup "busy"
From: Vladimir Sementsov-Ogievskiy Busy is racy: a job has its "pause-points" when it's not busy. Drop this check. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz --- tests/qemu-iotests/129 | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129 index 0e13244..3c22f64 100755 --- a/tests/qemu-iotests/129 +++ b/tests/qemu-iotests/129 @@ -67,7 +67,6 @@ class TestStopWithBlockJob(iotests.QMPTestCase): result = self.vm.qmp("stop") self.assert_qmp(result, 'return', {}) result = self.vm.qmp("query-block-jobs") -self.assert_qmp(result, 'return[0]/busy', True) self.assert_qmp(result, 'return[0]/ready', False) def test_drive_mirror(self): -- 1.8.3.1
[PATCH v3 1/5] monitor: change function obsolete name in comments
The function name monitor_qmp_bh_dispatcher() has been changed to monitor_qmp_dispatcher_co() since the commit 9ce44e2c. Let's amend the comments. Signed-off-by: Andrey Shinkevich --- monitor/qmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitor/qmp.c b/monitor/qmp.c index b42f8c6..7169366 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -80,7 +80,7 @@ static void monitor_qmp_cleanup_queue_and_resume(MonitorQMP *mon) qemu_mutex_lock(>qmp_queue_lock); /* - * Same condition as in monitor_qmp_bh_dispatcher(), but before + * Same condition as in monitor_qmp_dispatcher_co(), but before * removing an element from the queue (hence no `- 1`). * Also, the queue should not be empty either, otherwise the * monitor hasn't been suspended yet (or was already resumed). @@ -343,7 +343,7 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) /* * Suspend the monitor when we can't queue more requests after - * this one. Dequeuing in monitor_qmp_bh_dispatcher() or + * this one. Dequeuing in monitor_qmp_dispatcher_co() or * monitor_qmp_cleanup_queue_and_resume() will resume it. * Note that when OOB is disabled, we queue at most one command, * for backward compatibility. -- 1.8.3.1
Re: [PATCH v2 1/2] iotests: add another bash sleep command to 247
On 23.11.2020 18:44, Andrey Shinkevich wrote: This patch paves the way for the one that follows. The following patch makes the QMP monitor to read up to 4K from stdin at once. That results in running the bash 'sleep' command before the _qemu_proc_exec() starts in subshell. Another 'sleep' command with an unobtrusive 'query-status' plays as a workaround. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 2 files changed, 3 insertions(+) [...] With the patch 2/2 of the current version 2, the test case #247 passes without this patch 1/2. So, it may be excluded from the series. Thanks to Vladimir for the idea to check. Andrey
Re: [PATCH v2 1/2] iotests: add another bash sleep command to 247
On 24.11.2020 13:04, Vladimir Sementsov-Ogievskiy wrote: 23.11.2020 18:44, Andrey Shinkevich wrote: This patch paves the way for the one that follows. The following patch makes the QMP monitor to read up to 4K from stdin at once. That results in running the bash 'sleep' command before the _qemu_proc_exec() starts But how? Before _qemu_proc_exec() starts, qemu monitor is not runnning, and its new behavior can't influence.. I am not a bash expert to explain 'how' but this workaround works. It's just a test. Maybe other colleagues can say. If bash subshell work in unpredictable way, may be better is refactor test to send commands one by one with help of _send_qemu_cmd. Then sleep will be natively executed between sending commands. Or maybe write a similar test case in Python if Kevin agrees. in subshell. Another 'sleep' command with an unobtrusive 'query-status' plays as a workaround. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/qemu-iotests/247 b/tests/qemu-iotests/247 index 87e37b3..7d316ec 100755 --- a/tests/qemu-iotests/247 +++ b/tests/qemu-iotests/247 @@ -59,6 +59,8 @@ TEST_IMG="$TEST_IMG.4" _make_test_img $size {"execute":"block-commit", "arguments":{"device":"format-4", "top-node": "format-2", "base-node":"format-0", "job-id":"job0"}} EOF +sleep 1 +echo '{"execute":"query-status"}' if [ "${VALGRIND_QEMU}" == "y" ]; then sleep 10 else diff --git a/tests/qemu-iotests/247.out b/tests/qemu-iotests/247.out index e909e83..13d9547 100644 --- a/tests/qemu-iotests/247.out +++ b/tests/qemu-iotests/247.out @@ -17,6 +17,7 @@ QMP_VERSION {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 134217728, "offset": 134217728, "speed": 0, "type": "commit"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}} 
{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}} +{"return": {"status": "running", "singlestep": false, "running": true}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} *** done
Re: [PATCH v2 0/2] Increase amount of data for monitor to read
On 23.11.2020 18:44, Andrey Shinkevich wrote: The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html This series is a solution for the issue with QMP monitor buffered input. A JSON little parser is introduced to separate QMP commands read from the input buffer so that incoming requests do not overwhelm the monitor queue. A side effect raised in the test #247 was managed in the first patch. It may be considered as a workaround. Any sane fix suggested will be appreciated. Note: This series goes after the Vladimir's one: '[PATCH v3 00/25] backup performance: block_status + async"' To make the test #129 passed, the following patch should be applied first: '[PATCH v3 01/25] iotests: 129 don't check backup "busy"'. v2: 02: The static JSONthrottle object was made a member of the Chardev structure. The fd_chr_read functions were merged. The monitor thread synchronization was added to protect the input queue from overflow. Andrey Shinkevich (2): iotests: add another bash sleep command to 247 monitor: increase amount of data for monitor to read chardev/char-fd.c | 35 +-- chardev/char-socket.c | 42 +++--- chardev/char.c | 41 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 7 files changed, 132 insertions(+), 6 deletions(-) ...and with the extended number of QMP commands time (echo "{ 'execute': 'qmp_capabilities' }"; for i in {1..1}; do echo "{ 'execute': 'query-block-jobs' } {"execute":"query-status"} { 'execute': 'query-block-jobs' } {"execute":"query-status"} { 'execute': 'query-block-jobs' } {"execute":"query-status"} { 'execute': 'query-block-jobs' } {"execute":"query-status"}"; done; echo "{ 'execute': 'quit' }" ) | ./build/qemu-system-x86_64 -qmp stdio > /dev/null on master: real0m10.112s user0m10.168s sys 0m4.793s after the patch applied: real0m4.140s user0m4.079s sys 0m0.785s Andrey
Re: [PATCH v2 0/2] Increase amount of data for monitor to read
On 23.11.2020 18:44, Andrey Shinkevich wrote: The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html This series is a solution for the issue with QMP monitor buffered input. A JSON little parser is introduced to separate QMP commands read from the input buffer so that incoming requests do not overwhelm the monitor queue. A side effect raised in the test #247 was managed in the first patch. It may be considered as a workaround. Any sane fix suggested will be appreciated. Note: This series goes after the Vladimir's one: '[PATCH v3 00/25] backup performance: block_status + async"' To make the test #129 passed, the following patch should be applied first: '[PATCH v3 01/25] iotests: 129 don't check backup "busy"'. v2: 02: The static JSONthrottle object was made a member of the Chardev structure. The fd_chr_read functions were merged. The monitor thread synchronization was added to protect the input queue from overflow. Andrey Shinkevich (2): iotests: add another bash sleep command to 247 monitor: increase amount of data for monitor to read chardev/char-fd.c | 35 +-- chardev/char-socket.c | 42 +++--- chardev/char.c | 41 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 7 files changed, 132 insertions(+), 6 deletions(-) The Vladimir's modified test case $ time (echo "{ 'execute': 'qmp_capabilities' }"; for i in {1..1}; do echo "{ 'execute': 'query-block-jobs' } {"execute":"query-status"} { 'execute': 'query-block-jobs' } {"execute":"query-status"}"; done; echo "{ 'execute': 'quit' }" ) | ./build/qemu-system-x86_64 -qmp stdio > /dev/null shows the following performance on master: real0m5.188s user0m5.310s sys 0m2.539s after the patch applied: real0m2.227s user0m2.483s sys 0m0.480s Andrey
[PATCH v2 1/2] iotests: add another bash sleep command to 247
This patch paves the way for the one that follows. The following patch makes the QMP monitor to read up to 4K from stdin at once. That results in running the bash 'sleep' command before the _qemu_proc_exec() starts in subshell. Another 'sleep' command with an unobtrusive 'query-status' plays as a workaround. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/qemu-iotests/247 b/tests/qemu-iotests/247 index 87e37b3..7d316ec 100755 --- a/tests/qemu-iotests/247 +++ b/tests/qemu-iotests/247 @@ -59,6 +59,8 @@ TEST_IMG="$TEST_IMG.4" _make_test_img $size {"execute":"block-commit", "arguments":{"device":"format-4", "top-node": "format-2", "base-node":"format-0", "job-id":"job0"}} EOF +sleep 1 +echo '{"execute":"query-status"}' if [ "${VALGRIND_QEMU}" == "y" ]; then sleep 10 else diff --git a/tests/qemu-iotests/247.out b/tests/qemu-iotests/247.out index e909e83..13d9547 100644 --- a/tests/qemu-iotests/247.out +++ b/tests/qemu-iotests/247.out @@ -17,6 +17,7 @@ QMP_VERSION {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 134217728, "offset": 134217728, "speed": 0, "type": "commit"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}} +{"return": {"status": "running", "singlestep": false, "running": true}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} *** done -- 1.8.3.1
[PATCH v2 2/2] monitor: increase amount of data for monitor to read
QMP and HMP monitors read one byte at a time from the socket or stdin, which is very inefficient. With 100+ VMs on the host, this results in multiple extra system calls and CPU overuse. This patch increases the amount of read data up to 4096 bytes that fits the buffer size on the channel level. A JSON little parser is introduced to throttle QMP commands read from the buffer so that incoming requests do not overflow the monitor input queue. Suggested-by: Denis V. Lunev Signed-off-by: Andrey Shinkevich --- chardev/char-fd.c | 35 +-- chardev/char-socket.c | 42 +++--- chardev/char.c | 41 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- 5 files changed, 129 insertions(+), 6 deletions(-) diff --git a/chardev/char-fd.c b/chardev/char-fd.c index 1cd62f2..15bc8f4 100644 --- a/chardev/char-fd.c +++ b/chardev/char-fd.c @@ -33,6 +33,8 @@ #include "chardev/char-fd.h" #include "chardev/char-io.h" +#include "monitor/monitor-internal.h" + /* Called with chr_write_lock held. */ static int fd_chr_write(Chardev *chr, const uint8_t *buf, int len) { @@ -45,8 +47,12 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) { Chardev *chr = CHARDEV(opaque); FDChardev *s = FD_CHARDEV(opaque); +CharBackend *be = chr->be; +Monitor *mon = (Monitor *)be->opaque; int len; uint8_t buf[CHR_READ_BUF_LEN]; +uint8_t *cursor; +int load, size, pos; ssize_t ret; len = sizeof(buf); @@ -62,10 +68,35 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) if (ret == 0) { remove_fd_in_watch(chr); qemu_chr_be_event(chr, CHR_EVENT_CLOSED); +chr->json_thl = (const JSONthrottle){0}; return FALSE; } -if (ret > 0) { -qemu_chr_be_write(chr, buf, ret); +if (ret < 0) { +return TRUE; +} +load = ret; +cursor = buf; + +while (load > 0) { +size = load; +if (monitor_is_qmp(mon)) { +/* Find the end position of a JSON command in the input buffer */ +pos = qemu_chr_end_position((const char *) cursor, size, +>json_thl); +if (pos >= 0) { +size = pos + 1; +} +} + 
+qemu_chr_be_write(chr, cursor, size); +cursor += size; +load -= size; + +if (load > 0) { +while (qatomic_mb_read(>suspend_cnt)) { +g_usleep(40); +} +} } return TRUE; diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 213a4c8..30ad1d4 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -38,6 +38,8 @@ #include "chardev/char-io.h" #include "qom/object.h" +#include "monitor/monitor-internal.h" + /***/ /* TCP Net console */ @@ -522,7 +524,11 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) { Chardev *chr = CHARDEV(opaque); SocketChardev *s = SOCKET_CHARDEV(opaque); +CharBackend *be = chr->be; +Monitor *mon = (Monitor *)be->opaque; uint8_t buf[CHR_READ_BUF_LEN]; +uint8_t *cursor; +int load, pos; int len, size; if ((s->state != TCP_CHARDEV_STATE_CONNECTED) || @@ -537,12 +543,42 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) if (size == 0 || (size == -1 && errno != EAGAIN)) { /* connection closed */ tcp_chr_disconnect(chr); -} else if (size > 0) { +chr->json_thl = (const JSONthrottle){0}; +return TRUE; +} +if (size < 0) { +return TRUE; +} +load = size; +cursor = buf; + +while (load > 0) { +size = load; +if (monitor_is_qmp(mon)) { +/* Find the end position of a JSON command in the input buffer */ +pos = qemu_chr_end_position((const char *) cursor, size, +>json_thl); +if (pos >= 0) { +size = pos + 1; +} +} +len = size; + if (s->do_telnetopt) { -tcp_chr_process_IAC_bytes(chr, s, buf, ); +tcp_chr_process_IAC_bytes(chr, s, cursor, ); } if (size > 0) { -qemu_chr_be_write(chr, buf, size); +qemu_chr_be_write(chr, cursor, size); +cursor += size; +load -= size; +} else { +cursor += len; +load -= len; +} +if (load > 0) { +while (qatomic_mb_read(>suspend_cnt)) { +g_usleep(40); +} } } diff --git
[PATCH v2 0/2] Increase amount of data for monitor to read
The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html This series is a solution for the issue with QMP monitor buffered input. A little JSON parser is introduced to separate QMP commands read from the input buffer so that incoming requests do not overwhelm the monitor queue. A side effect raised in the test #247 was managed in the first patch. It may be considered as a workaround. Any sane fix suggested will be appreciated. Note: This series goes after Vladimir's one: '[PATCH v3 00/25] backup performance: block_status + async' To make the test #129 pass, the following patch should be applied first: '[PATCH v3 01/25] iotests: 129 don't check backup "busy"'. v2: 02: The static JSONthrottle object was made a member of the Chardev structure. The fd_chr_read functions were merged. The monitor thread synchronization was added to protect the input queue from overflow. Andrey Shinkevich (2): iotests: add another bash sleep command to 247 monitor: increase amount of data for monitor to read chardev/char-fd.c | 35 +-- chardev/char-socket.c | 42 +++--- chardev/char.c | 41 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 7 files changed, 132 insertions(+), 6 deletions(-) -- 1.8.3.1
Re: [PATCH 2/2] monitor: increase amount of data for monitor to read
On 09.11.2020 12:55, Vladimir Sementsov-Ogievskiy wrote: 06.11.2020 15:42, Andrey Shinkevich wrote: QMP and HMP monitors read one byte at a time from the socket or stdin, which is very inefficient. With 100+ VMs on the host, this results in multiple extra system calls and CPU overuse. This patch increases the amount of read data up to 4096 bytes that fits the buffer size on the channel level. Suggested-by: Denis V. Lunev Signed-off-by: Andrey Shinkevich --- chardev/char-fd.c | 64 +- chardev/char-socket.c | 54 +++--- chardev/char.c | 40 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247.out | 2 +- 6 files changed, 159 insertions(+), 18 deletions(-) [...] + ret = qio_channel_read( + chan, (gchar *)thl.buf, len, NULL); + if (ret == 0) { + remove_fd_in_watch(chr); + qemu_chr_be_event(chr, CHR_EVENT_CLOSED); + thl = (const JSONthrottle){0}; + return FALSE; + } + if (ret < 0) { + return TRUE; + } large code chunk is shared with fd_chr_read_hmp(). Would be not bad to avoid duplication.. There were two reasons to split the function: 1. Not to make the code complicated. 2. Avoid unused buffer of 4k on the stack: fd_chr_read_hmp() { uint8_t buf[CHR_READ_BUF_LEN];.. + thl.load = ret; + thl.cursor = 0; + } + + size = thl.load; + start = thl.buf + thl.cursor; you may use uint8_t* pointer type for thl.curser and get rid of size and start variables. For the 'start', yes. And I will want the 'size' anyway. [...] +int qemu_chr_end_position(const char *buf, int size, JSONthrottle *thl) +{ + int i; + + for (i = 0; i < size; i++) { + switch (buf[i]) { + case ' ': + case '\n': + case '\r': + continue; + case '{': + thl->brace_count++; + break; + case '}': + thl->brace_count--; + break; + case '[': + thl->bracket_count++; + break; + case ']': + thl->bracket_count--; I don't think you need to care about square brackets, as QMP queries and answers are always json objects, i.e. in pair of '{' and '}'. 
I've kept the brackets because it is another condition to put a command into the requests queue (see json_message_process_token()). Andrey
Re: [PATCH 0/2] Increase amount of data for monitor to read
On 09.11.2020 13:04, Vladimir Sementsov-Ogievskiy wrote: 09.11.2020 11:50, Vladimir Sementsov-Ogievskiy wrote: 06.11.2020 15:42, Andrey Shinkevich wrote: The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html This series is a solution for the issue with QMP monitor buffered input. A little parser is introduced to throttle JSON commands read from the buffer so that QMP requests do not overwhelm the monitor input queue. A side effect raised in the test #247 was managed in the first patch. It may be considered as a workaround. Any sane fix suggested will be appreciated. Note: This series goes after the Vladimir's one: '[PATCH v3 00/25] backup performance: block_status + async"' To make the test #129 passed, the following patch should be applied first: '[PATCH v3 01/25] iotests: 129 don't check backup "busy"'. [...] Positive thing: the patches do increase performance: for me, the following command: (echo "{ 'execute': 'qmp_capabilities' }"; for i in {1..1}; do echo "{ 'execute': 'query-block-jobs' }"; done; echo "{ 'execute': 'quit' }" ) | time ./qemu-system-x86_64 -qmp stdio > /dev/null shows 2.4s on master and 0.6s after patches Thank you for testing it. I'd like to include the result to the patch description with "Tested-by: ..." Andrey
Re: [PATCH 0/2] Increase amount of data for monitor to read
Please exclude this address when replying: jc...@redhat.com Andrey
[PATCH 2/2] monitor: increase amount of data for monitor to read
QMP and HMP monitors read one byte at a time from the socket or stdin, which is very inefficient. With 100+ VMs on the host, this results in multiple extra system calls and CPU overuse. This patch increases the amount of read data up to 4096 bytes that fits the buffer size on the channel level. Suggested-by: Denis V. Lunev Signed-off-by: Andrey Shinkevich --- chardev/char-fd.c | 64 +- chardev/char-socket.c | 54 +++--- chardev/char.c | 40 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247.out | 2 +- 6 files changed, 159 insertions(+), 18 deletions(-) diff --git a/chardev/char-fd.c b/chardev/char-fd.c index 1cd62f2..6194fe6 100644 --- a/chardev/char-fd.c +++ b/chardev/char-fd.c @@ -33,6 +33,8 @@ #include "chardev/char-fd.h" #include "chardev/char-io.h" +#include "monitor/monitor-internal.h" + /* Called with chr_write_lock held. */ static int fd_chr_write(Chardev *chr, const uint8_t *buf, int len) { @@ -41,7 +43,7 @@ static int fd_chr_write(Chardev *chr, const uint8_t *buf, int len) return io_channel_send(s->ioc_out, buf, len); } -static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) +static gboolean fd_chr_read_hmp(QIOChannel *chan, void *opaque) { Chardev *chr = CHARDEV(opaque); FDChardev *s = FD_CHARDEV(opaque); @@ -71,6 +73,66 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) return TRUE; } +static gboolean fd_chr_read_qmp(QIOChannel *chan, void *opaque) +{ +static JSONthrottle thl = {0}; +uint8_t *start; +Chardev *chr = CHARDEV(opaque); +FDChardev *s = FD_CHARDEV(opaque); +int len, size, pos; +ssize_t ret; + +if (!thl.load) { +len = sizeof(thl.buf); +if (len > s->max_size) { +len = s->max_size; +} +if (len == 0) { +return TRUE; +} + +ret = qio_channel_read( +chan, (gchar *)thl.buf, len, NULL); +if (ret == 0) { +remove_fd_in_watch(chr); +qemu_chr_be_event(chr, CHR_EVENT_CLOSED); +thl = (const JSONthrottle){0}; +return FALSE; +} +if (ret < 0) { +return TRUE; +} +thl.load = ret; 
+thl.cursor = 0; +} + +size = thl.load; +start = thl.buf + thl.cursor; +pos = qemu_chr_end_position((const char *) start, size, ); +if (pos >= 0) { +size = pos + 1; +} + +qemu_chr_be_write(chr, start, size); +thl.cursor += size; +thl.load -= size; + +return TRUE; +} + +static gboolean fd_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) +{ +Chardev *chr = CHARDEV(opaque); +CharBackend *be = chr->be; +Monitor *mon = (Monitor *)be->opaque; + +if (monitor_is_qmp(mon)) { +return fd_chr_read_qmp(chan, opaque); +} + +return fd_chr_read_hmp(chan, opaque); +} + static int fd_chr_read_poll(void *opaque) { Chardev *chr = CHARDEV(opaque); diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 213a4c8..8335e8c 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -520,30 +520,54 @@ static void tcp_chr_disconnect(Chardev *chr) static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) { +static JSONthrottle thl = {0}; +uint8_t *start; Chardev *chr = CHARDEV(opaque); SocketChardev *s = SOCKET_CHARDEV(opaque); -uint8_t buf[CHR_READ_BUF_LEN]; -int len, size; +int len, size, pos; if ((s->state != TCP_CHARDEV_STATE_CONNECTED) || s->max_size <= 0) { return TRUE; } -len = sizeof(buf); -if (len > s->max_size) { -len = s->max_size; -} -size = tcp_chr_recv(chr, (void *)buf, len); -if (size == 0 || (size == -1 && errno != EAGAIN)) { -/* connection closed */ -tcp_chr_disconnect(chr); -} else if (size > 0) { -if (s->do_telnetopt) { -tcp_chr_process_IAC_bytes(chr, s, buf, ); + +if (!thl.load) { +len = sizeof(thl.buf); +if (len > s->max_size) { +len = s->max_size; +} +size = tcp_chr_recv(chr, (void *)thl.buf, len); +if (size == 0 || (size == -1 && errno != EAGAIN)) { +/* connection closed */ +tcp_chr_disconnect(chr); +thl = (const JSONthrottle){0}; +return TRUE; } -if (size > 0) { -qemu_chr_be_write(chr, buf, size); +if (size < 0) { +return TRUE; } +thl.load = size; +thl.cursor = 0; +} + +size = thl.load; +start = thl.buf + thl.cursor; +pos 
= qemu_chr_end_position((const char *) sta
[PATCH 1/2] iotests: add another bash sleep command to 247
This patch paves the way for the one that follows. The following patch makes the QMP monitor read up to 4K from stdin at once. That results in running the bash 'sleep' command before _qemu_proc_exec() starts in the subshell. Another 'sleep' command with an unobtrusive 'query-status' serves as a workaround. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/qemu-iotests/247 b/tests/qemu-iotests/247 index 87e37b3..7d316ec 100755 --- a/tests/qemu-iotests/247 +++ b/tests/qemu-iotests/247 @@ -59,6 +59,8 @@ TEST_IMG="$TEST_IMG.4" _make_test_img $size {"execute":"block-commit", "arguments":{"device":"format-4", "top-node": "format-2", "base-node":"format-0", "job-id":"job0"}} EOF +sleep 1 +echo '{"execute":"query-status"}' if [ "${VALGRIND_QEMU}" == "y" ]; then sleep 10 else diff --git a/tests/qemu-iotests/247.out b/tests/qemu-iotests/247.out index e909e83..13d9547 100644 --- a/tests/qemu-iotests/247.out +++ b/tests/qemu-iotests/247.out @@ -17,6 +17,7 @@ QMP_VERSION {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 134217728, "offset": 134217728, "speed": 0, "type": "commit"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}} +{"return": {"status": "running", "singlestep": false, "running": true}} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} *** done -- 1.8.3.1
[PATCH 0/2] Increase amount of data for monitor to read
The subject was discussed here: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html This series is a solution for the issue with QMP monitor buffered input. A little parser is introduced to throttle JSON commands read from the buffer so that QMP requests do not overwhelm the monitor input queue. A side effect raised in the test #247 was managed in the first patch. It may be considered as a workaround. Any sane fix suggested will be appreciated. Note: This series goes after Vladimir's one: '[PATCH v3 00/25] backup performance: block_status + async' To make test #129 pass, the following patch should be applied first: '[PATCH v3 01/25] iotests: 129 don't check backup "busy"'. Andrey Shinkevich (2): iotests: add another bash sleep command to 247 monitor: increase amount of data for monitor to read chardev/char-fd.c | 64 +- chardev/char-socket.c | 54 +++--- chardev/char.c | 40 + include/chardev/char.h | 15 +++ monitor/monitor.c | 2 +- tests/qemu-iotests/247 | 2 ++ tests/qemu-iotests/247.out | 1 + 7 files changed, 161 insertions(+), 17 deletions(-) -- 1.8.3.1
Re: [PATCH v12 14/14] block: apply COR-filter to block-stream jobs
On 27.10.2020 20:57, Vladimir Sementsov-Ogievskiy wrote: 27.10.2020 20:48, Andrey Shinkevich wrote: On 27.10.2020 19:13, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 98 ++ tests/qemu-iotests/030 | 51 +++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +++ 5 files changed, 87 insertions(+), 90 deletions(-) diff --git a/block/stream.c b/block/stream.c [...] + s = block_job_create(job_id, _job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, + basic_flags | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, I think that BLK_PERM_GRAPH_MOD is something outdated. We have chain-feeze, what BLK_PERM_GRAPH_MOD adds to it? I don't know, and doubt that somebody knows. That is true for the commit/mirror jobs also. If we agree to remove the flag BLK_PERM_GRAPH_MOD from all these jobs, it will be made in a separate series, won't it? Hmm. 
At least, let's not implement new logic based on BLK_PERM_GRAPH_MOD. In original code it's only block_job_create's perm, not in shared_perm, not somewhere else.. So, if we keep it, let's keep it as is: only in perm in block_job_create, not implementing additional perm/shared_perm logic. With @perm=0 in the block_job_add_bdrv(>common, "active node"...), it won't. speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; } + /* + * Prevent concurrent jobs trying to modify the graph structure here, we + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ + if (block_job_add_bdrv(>common, "active node", bs, + basic_flags | BLK_PERM_GRAPH_MOD, why not 0, like for other nodes? We don't use this BdrvChild at all, why to requre permissions? Yes, '0' s right. + basic_flags | BLK_PERM_WRITE, _abort)) { + goto fail; + } + /* Block all intermediate nodes between bs and base, because [...] diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 index dcb4b5d..0064590 100755 --- a/tests/qemu-iotests/030 +++ b/tests/qemu-iotests/030 @@ -227,61 +227,20 @@ class TestParallelOps(iotests.QMPTestCase): for img in self.imgs: os.remove(img) - # Test that it's possible to run several block-stream operations - # in parallel in the same snapshot chain - @unittest.skipIf(os.environ.get('QEMU_CHECK_BLOCK_AUTO'), 'disabled in CI') - def test_stream_parallel(self): Didn't we agree to add "bottom" paramter to qmp? Than this test-case can be rewritten using node-names and new "bottom" stream argument. I guess it will not help for the whole test. Particularly, there is an issue with freezing the child link to COR-filter of the cuncurrent job, then it fails to finish first. We should not have such frozen link, as our bottom node should be above COR-filter of concurrent job. The bdrv_freeze_backing_chain(bs, above_base, errp) does that job. Max insisted on keeping it. Andrey
Re: [PATCH v12 14/14] block: apply COR-filter to block-stream jobs
On 27.10.2020 19:13, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 98 ++ tests/qemu-iotests/030 | 51 +++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +++ 5 files changed, 87 insertions(+), 90 deletions(-) diff --git a/block/stream.c b/block/stream.c [...] + s = block_job_create(job_id, _job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, + basic_flags | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, I think that BLK_PERM_GRAPH_MOD is something outdated. We have chain-feeze, what BLK_PERM_GRAPH_MOD adds to it? I don't know, and doubt that somebody knows. That is true for the commit/mirror jobs also. If we agree to remove the flag BLK_PERM_GRAPH_MOD from all these jobs, it will be made in a separate series, won't it? 
speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; } + /* + * Prevent concurrent jobs trying to modify the graph structure here, we + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ + if (block_job_add_bdrv(>common, "active node", bs, + basic_flags | BLK_PERM_GRAPH_MOD, why not 0, like for other nodes? We don't use this BdrvChild at all, why to requre permissions? Yes, '0' s right. + basic_flags | BLK_PERM_WRITE, _abort)) { + goto fail; + } + /* Block all intermediate nodes between bs and base, because [...] diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 index dcb4b5d..0064590 100755 --- a/tests/qemu-iotests/030 +++ b/tests/qemu-iotests/030 @@ -227,61 +227,20 @@ class TestParallelOps(iotests.QMPTestCase): for img in self.imgs: os.remove(img) - # Test that it's possible to run several block-stream operations - # in parallel in the same snapshot chain - @unittest.skipIf(os.environ.get('QEMU_CHECK_BLOCK_AUTO'), 'disabled in CI') - def test_stream_parallel(self): Didn't we agree to add "bottom" paramter to qmp? Than this test-case can be rewritten using node-names and new "bottom" stream argument. I guess it will not help for the whole test. Particularly, there is an issue with freezing the child link to COR-filter of the cuncurrent job, then it fails to finish first. Andrey
Re: [PATCH v12 13/14] stream: skip filters when writing backing file name to QCOW2 header
On 27.10.2020 19:21, Vladimir Sementsov-Ogievskiy wrote: 27.10.2020 19:01, Andrey Shinkevich wrote: On 27.10.2020 18:09, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: Avoid writing a filter JSON file name and a filter format name to QCOW2 image when the backing file is changed after the block stream job. A user is still able to assign the 'backing-file' parameter for a block-stream job keeping in mind the possible issue mentioned above. If the user does not specify the 'backing-file' parameter, QEMU will assign it automatically. Signed-off-by: Andrey Shinkevich --- block/stream.c | 15 +-- blockdev.c | 9 ++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0540ee..1ba74ab 100644 --- a/block/stream.c +++ b/block/stream.c @@ -65,6 +65,7 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); + BlockDriverState *base_unfiltered = NULL; Error *local_err = NULL; int ret = 0; @@ -75,8 +76,18 @@ static int stream_prepare(Job *job) const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; - if (base->drv) { - base_fmt = base->drv->format_name; + if (base_id) { + if (base->drv) { + base_fmt = base->drv->format_name; hmm. this doesn't make real sense: so, we assume that user specified backing_file_str, which may not relate to base, but we use base->drv->format_name? But it may be name of the filter driver, which would be wrong.. Any ideas? 1. we can use base_fmt=NULL, to provoke probing on next open of the qcow2 file.. I would choose this item #1 but have to check the probing code logic... Particularly, I do not remember now if the probing is able to recognize a protocol. The logic for the format_name in the QEMU existent code (I has kept it here in the patch) is a slippery way for an imprudent user. 
That's why I staked on the backing_file_str deprication in the previous version. 2. we can do probing now 3. we can at least check, if backing_file_str == Not bad for the sanity check but we will search a node by the file name again - not good (( Not search, but only check one very likely option. Yes, just strcmp(). And why a user may not merely specify a desired backing file as the base? Additionally to 1. or 3. (or combined), or even keeping things as is (i.e. wrong, but it is preexisting), we can: - add backing-format argument to qapi as pair for backing-file - deprecate using backing-file without backing-format. Then, after deprecation period we'll have correct code. This may be done in separate. base_unfiltered->filename, in this case we can use base_unfiltered->drv->format_name + } + } else { + base_unfiltered = bdrv_skip_filters(base); + if (base_unfiltered) { + base_id = base_unfiltered->filename; + if (base_unfiltered->drv) { + base_fmt = base_unfiltered->drv->format_name; + } + } } } bdrv_set_backing_hd(unfiltered_bs, base, _err); diff --git a/blockdev.c b/blockdev.c index c917625..0e9c783 100644 --- a/blockdev.c +++ b/blockdev.c [...] - stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, + stream_start(has_job_id ? job_id : NULL, bs, base_bs, + has_backing_file ? backing_file : NULL, backing_file should be NULL if has_backing_file is false, so you can use just backing_file instead of ternary operator. Yes, if reliable. I has kept the conformation with the ternary operator at the first parameter above. Andrey job_flags, has_speed ? speed : 0, on_error, filter_node_name, _err); if (local_err) {
Re: [PATCH v12 13/14] stream: skip filters when writing backing file name to QCOW2 header
On 27.10.2020 18:09, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: Avoid writing a filter JSON file name and a filter format name to QCOW2 image when the backing file is changed after the block stream job. A user is still able to assign the 'backing-file' parameter for a block-stream job keeping in mind the possible issue mentioned above. If the user does not specify the 'backing-file' parameter, QEMU will assign it automatically. Signed-off-by: Andrey Shinkevich --- block/stream.c | 15 +-- blockdev.c | 9 ++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0540ee..1ba74ab 100644 --- a/block/stream.c +++ b/block/stream.c @@ -65,6 +65,7 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); + BlockDriverState *base_unfiltered = NULL; Error *local_err = NULL; int ret = 0; @@ -75,8 +76,18 @@ static int stream_prepare(Job *job) const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; - if (base->drv) { - base_fmt = base->drv->format_name; + if (base_id) { + if (base->drv) { + base_fmt = base->drv->format_name; hmm. this doesn't make real sense: so, we assume that user specified backing_file_str, which may not relate to base, but we use base->drv->format_name? But it may be name of the filter driver, which would be wrong.. Any ideas? 1. we can use base_fmt=NULL, to provoke probing on next open of the qcow2 file.. I would choose this item #1 but have to check the probing code logic... Particularly, I do not remember now if the probing is able to recognize a protocol. The logic for the format_name in the QEMU existent code (I has kept it here in the patch) is a slippery way for an imprudent user. That's why I staked on the backing_file_str deprication in the previous version. 2. we can do probing now 3. 
we can at least check, if backing_file_str == Not bad for the sanity check but we will search a node by the file name again - not good (( base_unfiltered->filename, in this case we can use base_unfiltered->drv->format_name + } + } else { + base_unfiltered = bdrv_skip_filters(base); + if (base_unfiltered) { + base_id = base_unfiltered->filename; + if (base_unfiltered->drv) { + base_fmt = base_unfiltered->drv->format_name; + } + } } } bdrv_set_backing_hd(unfiltered_bs, base, _err); diff --git a/blockdev.c b/blockdev.c index c917625..0e9c783 100644 --- a/blockdev.c +++ b/blockdev.c [...] - stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, + stream_start(has_job_id ? job_id : NULL, bs, base_bs, + has_backing_file ? backing_file : NULL, backing_file should be NULL if has_backing_file is false, so you can use just backing_file instead of ternary operator. Yes, if reliable. I has kept the conformation with the ternary operator at the first parameter above. Andrey job_flags, has_speed ? speed : 0, on_error, filter_node_name, _err); if (local_err) {
Re: [PATCH v12 06/14] copy-on-read: pass bottom node name to COR driver
On 23.10.2020 17:45, Vladimir Sementsov-Ogievskiy wrote: 22.10.2020 21:13, Andrey Shinkevich wrote: We are going to use the COR-filter for a block-stream job. To limit COR operations by the base node in the backing chain during stream job, pass the bottom node name, that is the first non-filter overlay of the base, to the copy-on-read driver as the base node itself may change due to possible concurrent jobs. The rest of the functionality will be implemented in the patch that follows. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 16 1 file changed, 16 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 618c4c4..3d8e4db 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,18 +24,24 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; + BlockDriverState *bottom_bs; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { + BlockDriverState *bottom_bs = NULL; BDRVStateCOR *state = bs->opaque; + /* Find a bottom node name, if any */ + const char *bottom_node = qdict_get_try_str(options, "bottom"); bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -51,7 +57,17 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); + if (bottom_node) { + bottom_bs = bdrv_lookup_bs(NULL, bottom_node, errp); + if (!bottom_bs) { + error_setg(errp, QERR_BASE_NOT_FOUND, bottom_node); QERR_BASE_NOT_FOUND is unrelated here. Also, I see a comment in qerror.h that such macros should not be used in new code. And don't forget to drop qerror.h include line. 
I have been surprised because I don't have it in my branch and instead I do: error_setg(errp, "Bottom node '%s' not found", bottom_node); + qdict_del(options, "bottom"); this may be moved above "bottom_bs = ..", so as not to call it separately after the "if". Please, see the "Re: [PATCH v11 04/13] copy-on-read: pass overlay base node name to COR driver". + return -EINVAL; + } + qdict_del(options, "bottom"); + } state->active = true; + state->bottom_bs = bottom_bs; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions
[PATCH v12 12/14] copy-on-read: skip non-guest reads if no copy needed
If the flag BDRV_REQ_PREFETCH is set, skip idle read/write operations in the COR driver. It can be taken into account for the COR-algorithms optimization. At the moment, that check is made during the block-stream job. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index a2b180a..081e661 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -153,10 +153,14 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } -ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); -if (ret < 0) { -return ret; +/* Skip if neither read nor write are needed */ +if ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) != +BDRV_REQ_PREFETCH) { +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} } offset += n; -- 1.8.3.1
[PATCH v12 06/14] copy-on-read: pass bottom node name to COR driver
We are going to use the COR-filter for a block-stream job. To limit COR operations by the base node in the backing chain during stream job, pass the bottom node name, that is the first non-filter overlay of the base, to the copy-on-read driver as the base node itself may change due to possible concurrent jobs. The rest of the functionality will be implemented in the patch that follows. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 16 1 file changed, 16 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 618c4c4..3d8e4db 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,18 +24,24 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; +BlockDriverState *bottom_bs; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BlockDriverState *bottom_bs = NULL; BDRVStateCOR *state = bs->opaque; +/* Find a bottom node name, if any */ +const char *bottom_node = qdict_get_try_str(options, "bottom"); bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -51,7 +57,17 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +if (bottom_node) { +bottom_bs = bdrv_lookup_bs(NULL, bottom_node, errp); +if (!bottom_bs) { +error_setg(errp, QERR_BASE_NOT_FOUND, bottom_node); +qdict_del(options, "bottom"); +return -EINVAL; +} +qdict_del(options, "bottom"); +} state->active = true; +state->bottom_bs = bottom_bs; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions -- 1.8.3.1
[PATCH v12 09/14] block: modify the comment for BDRV_REQ_PREFETCH flag
Modify the comment for the flag BDRV_REQ_PREFETCH as we are going to use it alone and pass it to the COR-filter driver for further processing. Signed-off-by: Andrey Shinkevich --- include/block/block.h | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index ae7612f..1b6742f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -81,9 +81,11 @@ typedef enum { BDRV_REQ_NO_FALLBACK= 0x100, /* - * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ - * on read request and means that caller doesn't really need data to be - * written to qiov parameter which may be NULL. + * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read + * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR + * filter is involved), in which case it signals that the COR operation + * need not read the data into memory (qiov) but only ensure they are + * copied to the top layer (i.e., that COR operation is done). */ BDRV_REQ_PREFETCH = 0x200, /* Mask of valid flags */ -- 1.8.3.1
[PATCH v12 13/14] stream: skip filters when writing backing file name to QCOW2 header
Avoid writing a filter JSON file name and a filter format name to QCOW2 image when the backing file is changed after the block stream job. A user is still able to assign the 'backing-file' parameter for a block-stream job keeping in mind the possible issue mentioned above. If the user does not specify the 'backing-file' parameter, QEMU will assign it automatically. Signed-off-by: Andrey Shinkevich --- block/stream.c | 15 +-- blockdev.c | 9 ++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0540ee..1ba74ab 100644 --- a/block/stream.c +++ b/block/stream.c @@ -65,6 +65,7 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); +BlockDriverState *base_unfiltered = NULL; Error *local_err = NULL; int ret = 0; @@ -75,8 +76,18 @@ static int stream_prepare(Job *job) const char *base_id = NULL, *base_fmt = NULL; if (base) { base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; +if (base_id) { +if (base->drv) { +base_fmt = base->drv->format_name; +} +} else { +base_unfiltered = bdrv_skip_filters(base); +if (base_unfiltered) { +base_id = base_unfiltered->filename; +if (base_unfiltered->drv) { +base_fmt = base_unfiltered->drv->format_name; +} +} } } bdrv_set_backing_hd(unfiltered_bs, base, _err); diff --git a/blockdev.c b/blockdev.c index c917625..0e9c783 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2508,7 +2508,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; -const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2536,7 +2535,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); -base_name = base; } if 
(has_base_node) { @@ -2551,7 +2549,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); -base_name = base_bs->filename; } /* Check for op blockers in the whole chain between bs and base */ @@ -2571,9 +2568,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } -/* backing_file string overrides base bs filename */ -base_name = has_backing_file ? backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2581,7 +2575,8 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } -stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, +stream_start(has_job_id ? job_id : NULL, bs, base_bs, + has_backing_file ? backing_file : NULL, job_flags, has_speed ? speed : 0, on_error, filter_node_name, _err); if (local_err) { -- 1.8.3.1
[PATCH v12 00/14] Apply COR-filter to the block-stream permanently
The node insert/remove functions were added at the block generic layer. The COR-filter options structure was added to the QAPI. The test case #310 was added to check the 'bottom' node limit for COR. The 'supported_read_flags' member was added to the BDS structure (with the flags check at the block generic layer for drivers). v12: 02: New. 03: Only the temporary drop filter function is left. 05: New (suggested by Max) 06: 'base' -> 'bottom' option. 07: Fixes based on the review of the v11. 08: New. 09: The comment text was modified. 10: The read flags check at the block generic layer. 11: COR flag was added. 12: The condition was fixed. 13: The 'backing-file' parameter was restored. No deprecation. 14: The COR-filter 'add' function was replaced with the 'insert node' generic function. Fixes based on the review of the v11. Andrey Shinkevich (14): copy-on-read: support preadv/pwritev_part functions block: add insert/remove node functions copy-on-read: add filter drop function qapi: add filter-node-name to block-stream qapi: create BlockdevOptionsCor structure for COR driver copy-on-read: pass bottom node name to COR driver copy-on-read: limit COR operations to bottom node iotests: add #310 to test bottom node in COR driver block: modify the comment for BDRV_REQ_PREFETCH flag block: include supported_read_flags into BDS structure copy-on-read: add support for read flags to COR-filter copy-on-read: skip non-guest reads if no copy needed stream: skip filters when writing backing file name to QCOW2 header block: apply COR-filter to block-stream jobs block.c| 49 ++ block/copy-on-read.c | 144 + block/copy-on-read.h | 32 + block/io.c | 12 +++- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 117 ++--- blockdev.c | 13 ++-- include/block/block.h | 11 +++- include/block/block_int.h | 11 +++- qapi/block-core.json | 27 +++- tests/qemu-iotests/030 | 51 ++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +-- tests/qemu-iotests/310 | 109 +++ 
tests/qemu-iotests/310.out | 15 + tests/qemu-iotests/group | 3 +- 17 files changed, 503 insertions(+), 123 deletions(-) create mode 100644 block/copy-on-read.h create mode 100755 tests/qemu-iotests/310 create mode 100644 tests/qemu-iotests/310.out -- 1.8.3.1
[PATCH v12 11/14] copy-on-read: add support for read flags to COR-filter
Add the BDRV_REQ_COPY_ON_READ and BDRV_REQ_PREFETCH flags to the supported_read_flags of the COR-filter. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 8178a91..a2b180a 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -50,6 +50,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } +bs->supported_read_flags = BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); -- 1.8.3.1
[PATCH v12 14/14] block: apply COR-filter to block-stream jobs
This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, which will reduce disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of parallel jobs with common nodes is no longer considered vital. Signed-off-by: Andrey Shinkevich --- block/stream.c | 98 ++ tests/qemu-iotests/030 | 51 +++- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 22 +++ 5 files changed, 87 insertions(+), 90 deletions(-) diff --git a/block/stream.c b/block/stream.c index 1ba74ab..f6ed315 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -33,6 +35,8 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; char *backing_file_str; bool bs_read_only; @@ -44,8 +48,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk, { assert(bytes < 
SIZE_MAX); -return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); +return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH); } static void stream_abort(Job *job) @@ -53,23 +56,20 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = >common; -bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); BlockDriverState *base_unfiltered = NULL; Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_backing_chain(bs, s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); s->chain_frozen = false; if (bdrv_cow_child(unfiltered_bs)) { @@ -105,15 +105,16 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); -bdrv_reopen_set_read_only(bs, true, NULL); +bdrv_reopen_set_read_only(s->target_bs, true, NULL); } +bdrv_cor_filter_drop(s->cor_filter_bs); + g_free(s->backing_file_str); } @@ -121,9 +122,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); -bool enable_cor = 
!bdrv_cow_child(s->base_overlay); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -135,21 +134,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) return 0; } -len = bdrv_getlength(bs); +len = bdrv_getlength(s->target_bs); if (len < 0) { return len; } job_progress_set_remaining(>common.
[PATCH v12 01/14] copy-on-read: support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2816e61..cb03e0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -137,8 +141,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v12 05/14] qapi: create BlockdevOptionsCor structure for COR driver
Create the BlockdevOptionsCor structure for COR-driver-specific options, splitting it off from the BlockdevOptionsGenericFormat. The only option in the structure, 'bottom', denotes an image file that limits the COR operations in the backing chain. Suggested-by: Max Reitz Signed-off-by: Andrey Shinkevich --- qapi/block-core.json | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0a64306..bf465f6 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3938,6 +3938,25 @@ 'data': { 'throttle-group': 'str', 'file' : 'BlockdevRef' } } + +## +# @BlockdevOptionsCor: +# +# Driver specific block device options for the copy-on-read driver. +# +# @bottom: the name of a non-filter node (allocation-bearing layer) that limits +# the COR operations in the backing chain (inclusive). +# For the block-stream job, it will be the first non-filter overlay of +# the base node. We do not involve the base node into the COR +# operations because the base may change due to a concurrent +# block-commit job on the same backing chain. +# +# Since: 5.2 +## +{ 'struct': 'BlockdevOptionsCor', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*bottom': 'str' } } + ## # @BlockdevOptions: # @@ -3990,7 +4009,7 @@ 'bochs': 'BlockdevOptionsGenericFormat', 'cloop': 'BlockdevOptionsGenericFormat', 'compress': 'BlockdevOptionsGenericFormat', - 'copy-on-read':'BlockdevOptionsGenericFormat', + 'copy-on-read':'BlockdevOptionsCor', 'dmg':'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', 'ftp':'BlockdevOptionsCurlFtp', -- 1.8.3.1
[PATCH v12 10/14] block: include supported_read_flags into BDS structure
Add the new member supported_read_flags to the BlockDriverState structure. It will control the flags set for copy-on-read operations. Make the block generic layer evaluate supported read flags before they go to a block driver. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich --- block/io.c| 12 ++-- include/block/block_int.h | 4 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index 54f0968..78ddf13 100644 --- a/block/io.c +++ b/block/io.c @@ -1392,6 +1392,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, if (flags & BDRV_REQ_COPY_ON_READ) { int64_t pnum; +/* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ +flags &= ~BDRV_REQ_COPY_ON_READ; + ret = bdrv_is_allocated(bs, offset, bytes, ); if (ret < 0) { goto out; @@ -1413,9 +1416,13 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, goto out; } +if (flags & ~bs->supported_read_flags) { +abort(); +} + max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { -ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); +ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); goto out; } @@ -1428,7 +1435,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, num, qiov, - qiov_offset + bytes - bytes_remaining, 0); + qiov_offset + bytes - bytes_remaining, + flags); max_bytes -= num; } else { num = bytes_remaining; diff --git a/include/block/block_int.h b/include/block/block_int.h index f782737..474174c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -873,6 +873,10 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; +/* + * Flags honored during pread + */ +unsigned int supported_read_flags; /* Flags honored during pwrite (so far: BDRV_REQ_FUA, * BDRV_REQ_WRITE_UNCHANGED). 
* If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those -- 1.8.3.1
[PATCH v12 02/14] block: add insert/remove node functions
Provide API for a node insertion to and removal from a backing chain. Suggested-by: Max Reitz Signed-off-by: Andrey Shinkevich --- block.c | 49 + include/block/block.h | 3 +++ 2 files changed, 52 insertions(+) diff --git a/block.c b/block.c index 430edf7..502b483 100644 --- a/block.c +++ b/block.c @@ -4670,6 +4670,55 @@ static void bdrv_delete(BlockDriverState *bs) g_free(bs); } +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *new_node_bs; +Error *local_err = NULL; + +new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (new_node_bs == NULL) { +error_prepend(errp, "Could not create node: "); +return NULL; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, new_node_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(new_node_bs); +error_propagate(errp, local_err); +return NULL; +} + +return new_node_bs; +} + +void bdrv_remove_node(BlockDriverState *bs) +{ +BdrvChild *child; +BlockDriverState *inferior_bs; + +child = bdrv_filter_or_cow_child(bs); +if (!child) { +return; +} +inferior_bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(inferior_bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(inferior_bs); +/* Refresh permissions before the graph change. 
*/ +bdrv_child_refresh_perms(bs, child, _abort); +bdrv_replace_node(bs, inferior_bs, _abort); + +bdrv_drained_end(inferior_bs); +bdrv_unref(inferior_bs); +bdrv_unref(bs); +} + /* * Run consistency checks on an image * diff --git a/include/block/block.h b/include/block/block.h index d16c401..ae7612f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -350,6 +350,9 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, Error **errp); void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); +void bdrv_remove_node(BlockDriverState *bs); int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); -- 1.8.3.1
[PATCH v12 08/14] iotests: add #310 to test bottom node in COR driver
The test case #310 is similar to #216 by Max Reitz. The difference is that the test #310 involves a bottom node to the COR filter driver. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/310 | 109 + tests/qemu-iotests/310.out | 15 +++ tests/qemu-iotests/group | 3 +- 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100755 tests/qemu-iotests/310 create mode 100644 tests/qemu-iotests/310.out diff --git a/tests/qemu-iotests/310 b/tests/qemu-iotests/310 new file mode 100755 index 000..5ad7ad2 --- /dev/null +++ b/tests/qemu-iotests/310 @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# +# Copy-on-read tests using a COR filter with a bottom node +# +# Copyright (c) 2020 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import iotests +from iotests import log, qemu_img, qemu_io_silent + +# Need backing file support +iotests.script_initialize(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk'], + supported_platforms=['linux']) + +log('') +log('=== Copy-on-read across nodes ===') +log('') + +# This test is similar to the 216 one by Max Reitz +# The difference is that this test case involves a bottom node to the +# COR filter driver. 
+ +with iotests.FilePath('base.img') as base_img_path, \ + iotests.FilePath('mid.img') as mid_img_path, \ + iotests.FilePath('top.img') as top_img_path, \ + iotests.VM() as vm: + +log('--- Setting up images ---') +log('') + +assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0 +assert qemu_io_silent(base_img_path, '-c', 'write -P 1 0M 1M') == 0 +assert qemu_io_silent(base_img_path, '-c', 'write -P 1 3M 1M') == 0 +assert qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path, +'-F', iotests.imgfmt, mid_img_path) == 0 +assert qemu_io_silent(mid_img_path, '-c', 'write -P 3 2M 1M') == 0 +assert qemu_io_silent(mid_img_path, '-c', 'write -P 3 4M 1M') == 0 +assert qemu_img('create', '-f', iotests.imgfmt, '-b', mid_img_path, +'-F', iotests.imgfmt, top_img_path) == 0 +assert qemu_io_silent(top_img_path, '-c', 'write -P 2 1M 1M') == 0 + +log('Done') + +log('') +log('--- Doing COR ---') +log('') + +vm.launch() + +log(vm.qmp('blockdev-add', +node_name='node0', +driver='copy-on-read', +bottom='node2', +file={ +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': top_img_path +}, +'backing': { +'node-name': 'node2', +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': mid_img_path +}, +'backing': { +#'node-name': 'node2', +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': base_img_path +} +}, +} +})) + +# Trigger COR +log(vm.qmp('human-monitor-command', + command_line='qemu-io node0 "read 0 5M"')) + +vm.shutdown() + +log('') +log('--- Checking COR result ---') +log('') + +assert qemu_io_silent(base_img_path, '-c', 'discard 0 4M') == 0 +assert qemu_io_silent(mid_img_path, '-c', 'discard 0M 5M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 1 0M 1M') != 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 2 1M 1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 3 2M 1M') == 0 +assert qemu_io_silent(top_img_path, '-c', 'read -P 1 3M 1M') != 0 +assert qemu_io_silent(top_img_path, 
'-c', 'read -P 3 4M 1M') == 0 + +log('Done') diff --git a/tests/qemu-iotests/310.out b/tests/qemu-iotests/310.out new
[PATCH v12 07/14] copy-on-read: limit COR operations to bottom node
Limit COR operations to the bottom node (inclusively) in the backing chain when the bottom node name is given. It will be useful for a block stream job when the COR-filter is applied. The bottom node is passed as the base itself may change due to concurrent commit jobs on the same backing chain. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 42 -- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 3d8e4db..8178a91 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -123,8 +123,46 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { -return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); +int64_t n = 0; +int local_flags; +int ret; +BDRVStateCOR *state = bs->opaque; + +if (!state->bottom_bs) { +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); +} + +while (bytes) { +local_flags = flags; + +/* In case of failure, try to copy-on-read anyway */ +ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); +if (!ret || ret < 0) { +ret = bdrv_is_allocated_above(bdrv_backing_chain_next(bs->file->bs), + state->bottom_bs, true, offset, + n, ); +if (ret == 1 || ret < 0) { +local_flags |= BDRV_REQ_COPY_ON_READ; +} +/* Finish earlier if the end of a backing file has been reached */ +if (ret == 0 && n == 0) { +break; +} +} + +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} + +offset += n; +qiov_offset += n; +bytes -= n; +} + +return 0; } -- 1.8.3.1
[PATCH v12 03/14] copy-on-read: add filter drop function
Provide an API for the COR-filter removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. This function may be considered an intermediate solution until we are able to use bdrv_remove_node(). That will be possible once the QEMU permission update system has been overhauled. To insert the filter, the block generic layer function bdrv_insert_node() can be used. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 56 block/copy-on-read.h | 32 ++ 2 files changed, 88 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..618c4c4 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,20 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +51,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + +/* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +73,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +162,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -154,6 +182,34 @@ static BlockDriver bdrv_copy_on_read = { .is_filter = true, }; + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filter_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. */ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, _abort); +bdrv_replace_node(cor_filter_bs, bs, _abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + static void bdrv_copy_on_read_init(void) { bdrv_register(_copy_on_read); diff --git a/block/copy-on-read.h b/block/copy-on-read.h new file mode 100644 index 000..7bf405d --- /dev/null +++ b/block/copy-on-read.h @@ -0,0 +1,32 @@ +/* + * Copy-on-read filter block driver + * + * The filter driver performs Copy-On-Read (COR) operations + * + * Copyright (c) 2018-2020 Virtuozzo International GmbH. 
+ * + * Author: + * Andrey Shinkevich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef BLOCK_COPY_ON_READ +#define BLOCK_COPY_ON_READ + +#include "block/block_int.h" + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs); + +#endif /* BLOCK_COPY_ON_READ */ -- 1.8.3.1
[PATCH v12 04/14] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 4 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index d15a2be..e8a58f3 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -508,8 +508,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - ); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, ); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 8ce6729..e0540ee 100644 --- a/block/stream.c +++ b/block/stream.c @@ -221,7 +221,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index fe6fb5d..c917625 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2499,6 +2499,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2581,7 +2582,8 @@ void qmp_block_stream(bool 
has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, _err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, _err); if (local_err) { error_propagate(errp, local_err); goto out; diff --git a/include/block/block_int.h b/include/block/block_int.h index 38cad9d..f782737 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1134,6 +1134,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1146,7 +1149,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index ee5ebef..0a64306 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2542,6 +2542,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. # +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.2) +# # @auto-finalize: When false, this job will wait in a PENDING state after it has # finished its work, waiting for @block-job-finalize before # making any block graph changes. 
@@ -2572,6 +2577,7 @@ 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', '*base-node': 'str', '*backing-file': 'str', '*speed': 'int', '*on-error': 'Block
Re: [PATCH v11 09/13] copy-on-read: skip non-guest reads if no copy needed
On 21.10.2020 23:43, Andrey Shinkevich wrote: On 14.10.2020 18:22, Vladimir Sementsov-Ogievskiy wrote: 14.10.2020 15:51, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job at the moment. Signed-off-by: Andrey Shinkevich --- [...] diff --git a/block/io.c b/block/io.c index 11df188..bff1808 100644 --- a/block/io.c +++ b/block/io.c @@ -1512,7 +1512,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { - ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, + flags & bs->supported_read_flags); When BDRV_REQ_PREFETCH is passed, qiov may be (and generally should be) NULL. This means that we can't just drop the flag when calling a driver that doesn't support it. Actually, if the driver doesn't support the PREFETCH flag, we should do nothing. Ah, OK. I see. I expected this to be a separate patch. I still wonder why it isn’t. Could it be part of patch 07? I mean, introduce the new field supported_read_flags and handle it in generic code in one patch, prior to implementing support for it in the COR driver. We have to add the supported flags for the COR driver in the same patch, or before handling the supported_read_flags at the generic layer (handling zero does not make sense). Otherwise, the test #216 (where the COR-filter is applied) will not pass. Andrey I have found a workaround and am going to send all the related patches as a separate series. Andrey
Re: [PATCH v11 09/13] copy-on-read: skip non-guest reads if no copy needed
On 14.10.2020 18:22, Vladimir Sementsov-Ogievskiy wrote: 14.10.2020 15:51, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job by the moment. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 13 + block/io.c | 3 ++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index b136895..278a11a 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -148,10 +148,15 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } - ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); - if (ret < 0) { - return ret; + if (!!(flags & BDRV_REQ_PREFETCH) & How about dropping the double negation and using a logical && instead of the binary &? + !(local_flags & BDRV_REQ_COPY_ON_READ)) { + /* Skip non-guest reads if no copy needed */ + } else { Hm. I would have just written the negated form (!(flags & BDRV_REQ_PREFETCH) || (local_flags & BDRV_REQ_COPY_ON_READ)) and put the “skip” comment above that condition. 
(Since local_flags is initialized to flags, it can be written as a single comparison, but that’s a matter of taste and I’m not going to recommend either over the other: ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) != BDRV_REQ_PREFETCH) ) + ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..bff1808 100644 --- a/block/io.c +++ b/block/io.c @@ -1512,7 +1512,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { - ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); + ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, + flags & bs->supported_read_flags); When BDRV_REQ_PREFETCH is passed, qiov may be (and generally should be) NULL. This means, that we can't just drop the flag when call the driver that doesn't support it. Actually, if driver doesn't support the PREFETCH flag we should do nothing. Ah, OK. I see. I expected this to be a separate patch. I still wonder why it isn’t. Could it be part of patch 07? I mean introduce new field supported_read_flags and handle it in generic code in one patch, prior to implementing support for it in COR driver. We have to add the supported flags for the COR driver in the same patch. Or before handling the supported_read_flags at the generic layer (handling zero does not make a sence). Otherwise, the test #216 (where the COR-filter is applied) will not pass. Andrey
Re: [PATCH v11 13/13] block: apply COR-filter to block-stream jobs
On 16.10.2020 18:45, Vladimir Sementsov-Ogievskiy wrote: 15.10.2020 20:16, Andrey Shinkevich wrote: On 14.10.2020 19:24, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: [...] --- block/stream.c | 93 +- tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 81 insertions(+), 88 deletions(-) Looks like stream_run() could be a bit streamlined now (the allocation checking should be unnecessary, unconditionally calling stream_populate() should be sufficient), but not necessary now. That is what I had kept in my mind when I tackled this patch. But there is a hidden pitfall in streamlining it. Namely, how the block-stream job learns about a long unallocated tail to exit the loop earlier in the stream_run(). Shall we return the '-EOF' or another error code from the cor_co_preadv_part() to be handled by the stream_run()? Any other suggestions, if any, will be appreciated. Just calling read CHUNK by CHUNK may be less efficient than an is_allocated()-driven loop: you may end up splitting regions unaligned to CHUNK-granularity, which would not be split by an is_allocated()-driven loop. The current loop allows chunks unaligned to CHUNK. The cor_co_preadv_part() will check for the end of a file in the next version. So, the misalignment is not going to be an issue. Andrey So, I think, it's better to keep is_allocated() logic as is for now.
Re: [PATCH v11 13/13] block: apply COR-filter to block-stream jobs
On 15.10.2020 20:16, Andrey Shinkevich wrote: On 14.10.2020 19:24, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: [...] --- block/stream.c | 93 +- tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 81 insertions(+), 88 deletions(-) Looks like stream_run() could be a bit streamlined now (the allocation checking should be unnecessary, unconditionally calling stream_populate() should be sufficient), but not necessary now. That is what I had kept in my mind when I tackled this patch. But there is an underwater reef to streamline. Namely, how the block-stream job gets known about a long unallocated tail to exit the loop earlier in the stream_run(). Shall we return the '-EOF' or another error code from the cor_co_preadv_part() to be handled by the stream_run()? Any other suggestions, if any, will be appreciated. diff --git a/block/stream.c b/block/stream.c index d3e1812..93564db 100644 --- a/block/stream.c +++ b/block/stream.c [...] + + cor_filter_bs = bdrv_cor_filter_append(bs, opts, BDRV_O_RDWR, errp); + if (cor_filter_bs == NULL) { + goto fail; + } + + if (bdrv_freeze_backing_chain(cor_filter_bs, bs, errp) < 0) { Is there a reason why we can’t combine this with the bdrv_free_backing_chain() from bs down to above_base? I mean, the effect should be the same, just asking. The bdrv_freeze_backing_chain(bs, above_base, errp) is called before the bdrv_reopen_set_read_only() to keep the backing chain safe during the context switch. Then we will want to freeze the 'COR -> TOP BS' link as well. Freezing/unfreezing parts is simlier to manage than doing that with the whole chain. If we decide to invoke the bdrv_reopen_set_read_only() after freezing the backing chain together with the COR-filter, we will not be able to get the 'write' permission on the read-only node. 
+ bdrv_cor_filter_drop(cor_filter_bs); + cor_filter_bs = NULL; + goto fail; + } + + s = block_job_create(job_id, _job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, + basic_flags | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, Not that I’m an expert on the GRAPH_MOD permission, but why is this shared here but not below? Shouldn’t it be the same in both cases? (Same for taking it as a permission.) When we invoke the block_job_add_bdrv(>common, "active node", bs,..) below (particularly, we need it to block the operations on the top node, bdrv_op_block_all()), we ask for the GRAPH_MOD permission for the top node. To allow that, the parent filter node should share that permission for the underlying node. Otherwise, we get assertion failed in the bdrv_check_update_perm() called from bdrv_replace_node() when we remove the filter. I will add my comments above to the code. Andrey [...]
Re: [PATCH v11 05/13] copy-on-read: limit COR operations to base in COR driver
On 15.10.2020 18:56, Max Reitz wrote: On 14.10.20 20:57, Andrey Shinkevich wrote: On 14.10.2020 15:01, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Limit COR operations by the base node in the backing chain when the overlay base node name is given. It will be useful for a block stream job when the COR-filter is applied. The overlay base node is passed as the base itself may change due to concurrent commit jobs on the same backing chain. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 39 +-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c578b1b..dfbd6ad 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -122,8 +122,43 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { [...] + ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_overlay, true, offset, + n, ); + if (ret) { + local_flags |= BDRV_REQ_COPY_ON_READ; + } + } Furthermore, I just noticed – can the is_allocated functions not return 0 in @n, when @offset is a the EOF? Is that something to look out for? (I’m not sure.) Max The check for EOF is managed earlier in the stream_run() for a block-stream job. For other cases of using the COR-filter, the check for EOF can be added to the cor_co_preadv_part(). I would be more than happy if we can escape the duplicated checking for is_allocated in the block-stream. But how the stream_run() can stop calling the blk_co_preadv() when EOF is reached if is_allocated removed from it? True. Is it that bad to lose that optimization, though? (And I would expect the case of a short backing file to be rather rare, too.) May the cor_co_preadv_part() return EOF (or other error code) to be handled by a caller if (ret == 0 && n == 0 && (flags & BDRV_REQ_PREFETCH)? That sounds like a bad hack. I’d rather keep the double is_allocated(). But what would be the problem with losing the short backing file optimization? 
Just performance? Or would we end up writing actual zeroes into the overlay past the end of the backing file? Hm, probably not, if the COR filter would detect that case and handle it like stream does. So it seems only a question of performance to me, and I don’t think it would be that bad to in this rather rare case to have a bunch of useless is_allocated and is_allocated_above calls past the backing file’s EOF. (Maybe I’m wrong, though.) Max Thank you, Max, for sharing your thoughts on this subject. The double check for the is_allocated in the stream_run() is a performance degradation also. And we will make a check for the EOF in the cor_co_preadv_part() in either case, won't we? Andrey
Re: [PATCH v11 13/13] block: apply COR-filter to block-stream jobs
On 14.10.2020 19:24, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: [...] --- block/stream.c | 93 +- tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 81 insertions(+), 88 deletions(-) Looks like stream_run() could be a bit streamlined now (the allocation checking should be unnecessary, unconditionally calling stream_populate() should be sufficient), but not necessary now. That is what I had kept in my mind when I tackled this patch. But there is an underwater reef to streamline. Namely, how the block-stream job gets known about a long unallocated tail to exit the loop earlier in the stream_run(). Shall we return the '-EOF' or another error code from the cor_co_preadv_part() to be handled by the stream_run()? Any other suggestions, if any, will be appreciated. diff --git a/block/stream.c b/block/stream.c index d3e1812..93564db 100644 --- a/block/stream.c +++ b/block/stream.c [...] + +cor_filter_bs = bdrv_cor_filter_append(bs, opts, BDRV_O_RDWR, errp); +if (cor_filter_bs == NULL) { +goto fail; +} + +if (bdrv_freeze_backing_chain(cor_filter_bs, bs, errp) < 0) { Is there a reason why we can’t combine this with the bdrv_free_backing_chain() from bs down to above_base? I mean, the effect should be the same, just asking. The bdrv_freeze_backing_chain(bs, above_base, errp) is called before the bdrv_reopen_set_read_only() to keep the backing chain safe during the context switch. Then we will want to freeze the 'COR -> TOP BS' link as well. Freezing/unfreezing parts is simlier to manage than doing that with the whole chain. If we decide to invoke the bdrv_reopen_set_read_only() after freezing the backing chain together with the COR-filter, we will not be able to get the 'write' permission on the read-only node. 
+bdrv_cor_filter_drop(cor_filter_bs); +cor_filter_bs = NULL; +goto fail; +} + +s = block_job_create(job_id, _job_driver, NULL, cor_filter_bs, + BLK_PERM_CONSISTENT_READ, + basic_flags | BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD, Not that I’m an expert on the GRAPH_MOD permission, but why is this shared here but not below? Shouldn’t it be the same in both cases? (Same for taking it as a permission.) When we invoke the block_job_add_bdrv(>common, "active node", bs,..) below (particularly, we need it to block the operations on the top node, bdrv_op_block_all()), we ask for the GRAPH_MOD permission for the top node. To allow that, the parent filter node should share that permission for the underlying node. Otherwise, we get assertion failed in the bdrv_check_update_perm() called from bdrv_replace_node() when we remove the filter. speed, creation_flags, NULL, NULL, errp); if (!s) { goto fail; } +/* + * Prevent concurrent jobs trying to modify the graph structure here, we + * already have our own plans. Also don't allow resize as the image size is + * queried only at the job start and then cached. + */ +if (block_job_add_bdrv(>common, "active node", bs, + basic_flags | BLK_PERM_GRAPH_MOD, + basic_flags | BLK_PERM_WRITE, _abort)) { +goto fail; +} + /* Block all intermediate nodes between bs and base, because they will * disappear from the chain after this operation. The streaming job reads * every block only once, assuming that it doesn't change, so forbid writes [...] diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index e60c832..940e85a 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -899,17 +899,26 @@ class TestBlockdevReopen(iotests.QMPTestCase): # make hd1 read-only and block-stream requires it to be read-write # (Which error message appears depends on whether the stream job is # already done with copying at this point.) Hm. Let’s look at the set of messages below... 
[1] -self.reopen(opts, {}, +# As the COR-filter node is inserted into the backing chain with the +# 'block-stream' operation, we move the options to their proper nodes. +opts = hd_opts(1) Oh, so this patch changes it so that only the subtree below hd1 is reopened, and we don’t have to deal with the filter options. Got it. (I think.) Yes, that's right. +opts['backing'] = hd_opts(2) +opts['backing']['backing'] = None +self.reopen(opts, {'read-only': True}, ["Can't set node 'hd1' to r/o with copy-on-read enabled", [1] This isn’t done anymore as of this patch. So I don’t think this error message can still appear. Will some other me
Re: [PATCH v11 11/13] stream: mark backing-file argument as deprecated
On 14.10.2020 18:43, Vladimir Sementsov-Ogievskiy wrote: 14.10.2020 18:03, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Whereas the block-stream job starts using a backing file name of the base node overlay after the block-stream job completes, mark the QMP 'backing-file' argument as deprecated. Signed-off-by: Andrey Shinkevich --- docs/system/deprecated.rst | 6 ++ 1 file changed, 6 insertions(+) diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst index 8b3ab5b..7491fcf 100644 --- a/docs/system/deprecated.rst +++ b/docs/system/deprecated.rst @@ -285,6 +285,12 @@ details. The ``query-events`` command has been superseded by the more powerful and accurate ``query-qmp-schema`` command. +``block-stream`` argument ``backing-file`` (since 5.2) +' + +The argument ``backing-file`` is deprecated. QEMU uses a backing file +name of the base node overlay after the block-stream job completes. + Hm, why? I don’t see the problem with it. My wrong idea, sorry. I believed that the argument was unused when I was reviewing v10. But it actually became unused during the series, and that is wrong. I missed searching for calls to the qmp_block_stream() in the QEMU dynamically generated code. Will roll back. Andrey
Re: [PATCH v11 09/13] copy-on-read: skip non-guest reads if no copy needed
On 14.10.2020 15:51, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job by the moment. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 13 + block/io.c | 3 ++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index b136895..278a11a 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -148,10 +148,15 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } -ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); -if (ret < 0) { -return ret; +if (!!(flags & BDRV_REQ_PREFETCH) & How about dropping the double negation and using a logical && instead of the binary &? Yes, that's correct. +!(local_flags & BDRV_REQ_COPY_ON_READ)) { +/* Skip non-guest reads if no copy needed */ +} else { Hm. I would have just written the negated form (!(flags & BDRV_REQ_PREFETCH) || (local_flags & BDRV_REQ_COPY_ON_READ)) and put the “skip” comment above that condition. (Since local_flags is initialized to flags, it can be written as a single comparison, but that’s a matter of taste and I’m not going to recommend either over the other: I played with the flags to make the idea obvious for the eye of a beholder: "we neither read nor write". Comparing the BDRV_REQ_PREFETCH against the 'flags' means that the flag comes from outside of the function. And the empty section means we do nothing in that case. Eventually, I will pick up the brief expression below. 
Thanks, Andrey ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) != BDRV_REQ_PREFETCH) ) +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..bff1808 100644 --- a/block/io.c +++ b/block/io.c @@ -1512,7 +1512,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { -ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); +ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, + flags & bs->supported_read_flags); Ah, OK. I see. I expected this to be a separate patch. I still wonder why it isn’t. Max goto out; }
Re: [PATCH v11 06/13] block: modify the comment for BDRV_REQ_PREFETCH flag
On 14.10.2020 15:22, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Modify the comment for the flag BDRV_REQ_PREFETCH as we are going to use it alone and pass it to the COR-filter driver for further processing. Signed-off-by: Andrey Shinkevich --- include/block/block.h | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 981ab5b..2b7efd1 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -71,9 +71,10 @@ typedef enum { BDRV_REQ_NO_FALLBACK= 0x100, /* - * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ - * on read request and means that caller doesn't really need data to be - * written to qiov parameter which may be NULL. + * BDRV_REQ_PREFETCH may be used together with the BDRV_REQ_COPY_ON_READ + * flag or when the COR-filter applied to read operations and means that There’s some word missing here, but I’m not sure what it is... At least an “is” before “applied”. Perhaps something like ”or when a COR filter is involved (in read operations)” would be better. + * caller doesn't really need data to be written to qiov parameter which And this “written to” confused me for a second, because we’re reading into qiov. Technically, that means writing into the buffer, but, you know. Could we rewrite the whole thing, perhaps? Something like “BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when there is a COR filter), in which case it signals that the COR operation need not read the data into memory (qiov), but only ensure it is copied to the top layer (i.e., that COR is done).” I don’t know. 
Max I would modify a little: “BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR filter is involved), in which case it signals that the COR operation need not read the data into memory (qiov) but only ensure they are copied to the top layer (i.e., that COR operation is done).” + * may be NULL. */ BDRV_REQ_PREFETCH = 0x200, /* Mask of valid flags */
Re: [PATCH v11 05/13] copy-on-read: limit COR operations to base in COR driver
On 14.10.2020 15:01, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Limit COR operations by the base node in the backing chain when the overlay base node name is given. It will be useful for a block stream job when the COR-filter is applied. The overlay base node is passed as the base itself may change due to concurrent commit jobs on the same backing chain. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 39 +-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c578b1b..dfbd6ad 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -122,8 +122,43 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { [...] +ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_overlay, true, offset, + n, &n); +if (ret) { +local_flags |= BDRV_REQ_COPY_ON_READ; +} +} Furthermore, I just noticed – can the is_allocated functions not return 0 in @n, when @offset is at the EOF? Is that something to look out for? (I’m not sure.) Max The check for EOF is managed earlier in the stream_run() for a block-stream job. For other cases of using the COR-filter, the check for EOF can be added to the cor_co_preadv_part(). I would be more than happy if we can escape the duplicated checking for is_allocated in the block-stream. But how can the stream_run() stop calling the blk_co_preadv() when EOF is reached if is_allocated is removed from it? May the cor_co_preadv_part() return EOF (or other error code) to be handled by a caller if (ret == 0 && n == 0 && (flags & BDRV_REQ_PREFETCH))? Andrey
Re: [PATCH v11 05/13] copy-on-read: limit COR operations to base in COR driver
On 14.10.2020 14:59, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Limit COR operations by the base node in the backing chain when the overlay base node name is given. It will be useful for a block stream job when the COR-filter is applied. The overlay base node is passed as the base itself may change due to concurrent commit jobs on the same backing chain. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 39 +-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c578b1b..dfbd6ad 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -122,8 +122,43 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { -return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); +int64_t n = 0; +int64_t size = offset + bytes; +int local_flags; +int ret; +BDRVStateCOR *state = bs->opaque; + +if (!state->base_overlay) { +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); +} + +while (offset < size) { +local_flags = flags; + +/* In case of failure, try to copy-on-read anyway */ +ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); +if (!ret) { In case of failure, a negative value is going to be returned, we won’t go into this conditional block, and local_flags isn’t going to contain BDRV_REQ_COPY_ON_READ. So the idea of CORing in case of failure sounds sound to me, but it doesn’t look like that’s done. Yes, it's obvious. That was just my fault to miss setting the additional condition for "ret < 0". Thank you for noticing that. Andrey +ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), I think this should either be bdrv_backing_chain_next() or we must rule out the possibility of bs->file->bs being a filter somewhere. I think I’d prefer the former. 
+ state->base_overlay, true, offset, + n, ); +if (ret) { “ret == 1 || ret < 0” would be more explicit (and in line with the “!ret || ret < 0” probably needed above), but correct either way. Max
Re: [PATCH v11 04/13] copy-on-read: pass overlay base node name to COR driver
On 14.10.2020 14:09, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: We are going to use the COR-filter for a block-stream job. To limit COR operations by the base node in the backing chain during stream job, pass the name of overlay base node to the copy-on-read driver as base node itself may change due to possible concurrent jobs. The rest of the functionality will be implemented in the patch that follows. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ 1 file changed, 14 insertions(+) Is there a reason why you didn’t add this option to QAPI (as part of a yet-to-be-created BlockdevOptionsCor)? Because I’d really like it there. I agree that passing a base overlay under the base option looks clumsy. We could pass the base node name and find its overlay ourselves here in cor_open(). In that case, we can use the existing QAPI. The reason I used the existing QAPI is to make it easier for a user to operate with the traditional options and to keep things simple. So, the user shouldn't think what overlay or above-base node to pass. If we introduce the specific BlockdevOptionsCor, what other options may come with? diff --git a/block/copy-on-read.c b/block/copy-on-read.c index bcccf0f..c578b1b 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,19 +24,24 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; +BlockDriverState *base_overlay; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BlockDriverState *base_overlay = NULL; BDRVStateCOR *state = bs->opaque; +/* We need the base overlay node rather than the base itself */ +const char *base_overlay_node = qdict_get_try_str(options, "base"); Shouldn’t it be called base-overlay or above-base then? The base_overlay identifier is used below as the pointer to BS. 
The base_overlay_node stands for the name of the node. I used that identifier to differ between the types. And the above_base has another meaning per block/stream.c - it can be a temporary filter with a JSON-name. bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -52,7 +57,16 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +if (base_overlay_node) { +qdict_del(options, "base"); +base_overlay = bdrv_lookup_bs(NULL, base_overlay_node, errp); I think this is a use-after-free. The storage @base_overlay_node points to belongs to a QString, which is referenced only by @options; so deleting that element of @options should free that string. Max I will swap those two function calls (bdrv_lookup_bs(); qdict_del();). Thank you. Andrey +if (!base_overlay) { +error_setg(errp, QERR_BASE_NOT_FOUND, base_overlay_node); +return -EINVAL; +} +} state->active = true; +state->base_overlay = base_overlay; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions
Re: [PATCH v11 02/13] copy-on-read: add filter append/drop functions
On 14.10.2020 13:44, Max Reitz wrote: On 12.10.20 19:43, Andrey Shinkevich wrote: Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 88 block/copy-on-read.h | 35 + 2 files changed, 123 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..bcccf0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c [...] @@ -159,4 +188,63 @@ static void bdrv_copy_on_read_init(void) bdrv_register(_copy_on_read); } + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + QDict *node_options, + int flags, Error **errp) I had hoped you could make this a generic block layer function. :( (Because it really is rather generic) *shrug* Actually, I did (and still can do) that for the 'append node' function only but not for the 'drop node' one so far... diff --git a/block.c b/block.c index 11ab55f..f41e876 100644 --- a/block.c +++ b/block.c @@ -4669,6 +4669,55 @@ static void bdrv_delete(BlockDriverState *bs) g_free(bs); } +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *new_node_bs; +Error *local_err = NULL; + +new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (new_node_bs == NULL) { +error_prepend(errp, "Could not create node: "); +return NULL; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, new_node_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(new_node_bs); +error_propagate(errp, local_err); +return NULL; +} + +return new_node_bs; +} + +void bdrv_remove_node(BlockDriverState *bs) +{ +BdrvChild *child; +BlockDriverState *inferior_bs; + +child = bdrv_filter_or_cow_child(bs); +if (!child) { +return; +} +inferior_bs = child->bs; + +/* Retain the BDS until we complete the graph change. 
*/ +bdrv_ref(inferior_bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(inferior_bs); +/* Refresh permissions before the graph change. */ +bdrv_child_refresh_perms(bs, child, _abort); +bdrv_replace_node(bs, inferior_bs, _abort); + +bdrv_drained_end(inferior_bs); +bdrv_unref(inferior_bs); +bdrv_unref(bs); +} So, it is an intermediate solution in this patch of the series. I am going to make both functions generic once Vladimir overhauls the QEMU permission update system. Otherwise, the COR-filter node cannot be removed from the backing chain gracefully. Thank you for your r-b. If the next version comes, I can move the 'append node' function only to the generic layer. Andrey Reviewed-by: Max Reitz +{ +BlockDriverState *cor_filter_bs; +Error *local_err = NULL; + +cor_filter_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create COR-filter node: "); +return NULL; +} + +if (!qdict_get_try_str(node_options, "node-name")) { +cor_filter_bs->implicit = true; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +return cor_filter_bs; +}
[PATCH v11 13/13] block: apply COR-filter to block-stream jobs
This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 93 +- tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 81 insertions(+), 88 deletions(-) diff --git a/block/stream.c b/block/stream.c index d3e1812..93564db 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -33,6 +35,8 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; bool bs_read_only; bool chain_frozen; @@ -43,8 +47,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk, { assert(bytes < 
SIZE_MAX); -return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); +return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH); } static void stream_abort(Job *job) @@ -52,23 +55,20 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = >common; -bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); BlockDriverState *base_unfiltered = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_backing_chain(bs, s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); s->chain_frozen = false; if (bdrv_cow_child(unfiltered_bs)) { @@ -94,13 +94,14 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); + +bdrv_cor_filter_drop(s->cor_filter_bs); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); -bdrv_reopen_set_read_only(bs, true, NULL); +bdrv_reopen_set_read_only(s->target_bs, true, NULL); } } @@ -108,9 +109,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); -bool enable_cor = 
!bdrv_cow_child(s->base_overlay); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -122,21 +121,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) return 0; } -len = bdrv_getlength(bs); +len = bdrv_getlength(s->target_bs); if (len < 0) { return len; } job_progress_set_remaining(>common.job, len); -
[PATCH v11 12/13] stream: remove unused backing-file name parameter
The 'backing-file' argument is not used by the block-stream job. It designates a backing file name to set in QCOW2 image header after the block-stream job finished. A backing file name of the node above base is used instead. Signed-off-by: Andrey Shinkevich --- block/stream.c| 6 +- blockdev.c| 21 ++--- include/block/block_int.h | 2 +- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/block/stream.c b/block/stream.c index 51462bd..d3e1812 100644 --- a/block/stream.c +++ b/block/stream.c @@ -34,7 +34,6 @@ typedef struct StreamBlockJob { BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; -char *backing_file_str; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -103,8 +102,6 @@ static void stream_clean(Job *job) blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); bdrv_reopen_set_read_only(bs, true, NULL); } - -g_free(s->backing_file_str); } static int coroutine_fn stream_run(Job *job, Error **errp) @@ -220,7 +217,7 @@ static const BlockJobDriver stream_job_driver = { }; void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, + BlockDriverState *base, int creation_flags, int64_t speed, BlockdevOnError on_error, const char *filter_node_name, @@ -295,7 +292,6 @@ void stream_start(const char *job_id, BlockDriverState *bs, s->base_overlay = base_overlay; s->above_base = above_base; -s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; diff --git a/blockdev.c b/blockdev.c index d719c47..019b6e0 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2498,7 +2498,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; -const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2526,7 +2525,6 @@ void 
qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); -base_name = base; } if (has_base_node) { @@ -2541,7 +2539,11 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); -base_name = base_bs->filename; +} + +if (has_backing_file) { +warn_report("Use of \"backing-file\" argument is deprecated; " +"a backing file of the node above base is used instead"); } /* Check for op blockers in the whole chain between bs and base */ @@ -2553,17 +2555,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } } -/* if we are streaming the entire chain, the result will have no backing - * file, and specifying one is therefore an error */ -if (base_bs == NULL && has_backing_file) { -error_setg(errp, "backing file specified, but streaming the " - "entire chain"); -goto out; -} - -/* backing_file string overrides base bs filename */ -base_name = has_backing_file ? backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2571,7 +2562,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } -stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, +stream_start(has_job_id ? job_id : NULL, bs, base_bs, job_flags, has_speed ? speed : 0, on_error, filter_node_name, _err); if (local_err) { diff --git a/include/block/block_int.h b/include/block/block_int.h index a142867..4f523c3 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1151,7 +1151,7 @@ int is_windows_drive(const char *filename); * BlockDriverState. 
*/ void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, + BlockDriverState *base, int creation_flags, int64_t speed, BlockdevOnError on_error, const char *filter_node_name, -- 1.8.3.1
[PATCH v11 11/13] stream: mark backing-file argument as deprecated
Since the block-stream job now uses the backing file name of the base node overlay once the job completes, mark the QMP 'backing-file' argument as deprecated. Signed-off-by: Andrey Shinkevich --- docs/system/deprecated.rst | 6 ++ 1 file changed, 6 insertions(+) diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst index 8b3ab5b..7491fcf 100644 --- a/docs/system/deprecated.rst +++ b/docs/system/deprecated.rst @@ -285,6 +285,12 @@ details. The ``query-events`` command has been superseded by the more powerful and accurate ``query-qmp-schema`` command. +``block-stream`` argument ``backing-file`` (since 5.2) +' + +The argument ``backing-file`` is deprecated. QEMU uses a backing file +name of the base node overlay after the block-stream job completes. + chardev client socket with ``wait`` option (since 4.0) '' -- 1.8.3.1
[PATCH v11 08/13] copy-on-read: add support for BDRV_REQ_PREFETCH to COR-filter
Add support for the BDRV_REQ_PREFETCH flag to the supported_read_flags of the COR-filter. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index dfbd6ad..b136895 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -50,6 +50,7 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } +bs->supported_read_flags = BDRV_REQ_PREFETCH; bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); -- 1.8.3.1
[PATCH v11 09/13] copy-on-read: skip non-guest reads if no copy needed
If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. At the moment, that check is made during the block-stream job. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 13 + block/io.c | 3 ++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index b136895..278a11a 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -148,10 +148,15 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } -ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); -if (ret < 0) { -return ret; +if (!!(flags & BDRV_REQ_PREFETCH) & +!(local_flags & BDRV_REQ_COPY_ON_READ)) { +/* Skip non-guest reads if no copy needed */ +} else { +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..bff1808 100644 --- a/block/io.c +++ b/block/io.c @@ -1512,7 +1512,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); if (bytes <= max_bytes && bytes <= max_transfer) { -ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); +ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, + flags & bs->supported_read_flags); goto out; } -- 1.8.3.1
[PATCH v11 10/13] stream: skip filters when writing backing file name to QCOW2 header
Avoid writing a filter JSON-name to QCOW2 image when the backing file is changed after the block stream job. Signed-off-by: Andrey Shinkevich --- block/stream.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0540ee..51462bd 100644 --- a/block/stream.c +++ b/block/stream.c @@ -65,6 +65,7 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); +BlockDriverState *base_unfiltered = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; @@ -73,10 +74,10 @@ static int stream_prepare(Job *job) if (bdrv_cow_child(unfiltered_bs)) { const char *base_id = NULL, *base_fmt = NULL; -if (base) { -base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; +if (base_unfiltered) { +base_id = base_unfiltered->filename; +if (base_unfiltered->drv) { +base_fmt = base_unfiltered->drv->format_name; } } bdrv_set_backing_hd(unfiltered_bs, base, _err); -- 1.8.3.1
[PATCH v11 07/13] block: include supported_read_flags into BDS structure
Add the new member supported_read_flags to BlockDriverState structure. It will control the BDRV_REQ_PREFETCH flag set for copy-on-read operations. Signed-off-by: Andrey Shinkevich --- include/block/block_int.h | 4 1 file changed, 4 insertions(+) diff --git a/include/block/block_int.h b/include/block/block_int.h index f782737..a142867 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -873,6 +873,10 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; +/* + * Flags honored during pread (so far: BDRV_REQ_PREFETCH) + */ +unsigned int supported_read_flags; /* Flags honored during pwrite (so far: BDRV_REQ_FUA, * BDRV_REQ_WRITE_UNCHANGED). * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those -- 1.8.3.1
[PATCH v11 06/13] block: modify the comment for BDRV_REQ_PREFETCH flag
Modify the comment for the flag BDRV_REQ_PREFETCH as we are going to use it alone and pass it to the COR-filter driver for further processing. Signed-off-by: Andrey Shinkevich --- include/block/block.h | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 981ab5b..2b7efd1 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -71,9 +71,10 @@ typedef enum { BDRV_REQ_NO_FALLBACK= 0x100, /* - * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ - * on read request and means that caller doesn't really need data to be - * written to qiov parameter which may be NULL. + * BDRV_REQ_PREFETCH may be used together with the BDRV_REQ_COPY_ON_READ + * flag or when the COR-filter applied to read operations and means that + * caller doesn't really need data to be written to qiov parameter which + * may be NULL. */ BDRV_REQ_PREFETCH = 0x200, /* Mask of valid flags */ -- 1.8.3.1
[PATCH v11 05/13] copy-on-read: limit COR operations to base in COR driver
Limit COR operations by the base node in the backing chain when the overlay base node name is given. It will be useful for a block stream job when the COR-filter is applied. The overlay base node is passed as the base itself may change due to concurrent commit jobs on the same backing chain. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 39 +-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c578b1b..dfbd6ad 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -122,8 +122,43 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { -return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); +int64_t n = 0; +int64_t size = offset + bytes; +int local_flags; +int ret; +BDRVStateCOR *state = bs->opaque; + +if (!state->base_overlay) { +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); +} + +while (offset < size) { +local_flags = flags; + +/* In case of failure, try to copy-on-read anyway */ +ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); +if (!ret) { +ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_overlay, true, offset, + n, ); +if (ret) { +local_flags |= BDRV_REQ_COPY_ON_READ; +} +} + +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} + +offset += n; +qiov_offset += n; +bytes -= n; +} + +return 0; } -- 1.8.3.1
[PATCH v11 02/13] copy-on-read: add filter append/drop functions
Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 88 block/copy-on-read.h | 35 + 2 files changed, 123 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..bcccf0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,21 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +52,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + +/* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +74,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +163,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -159,4 +188,63 @@ static void bdrv_copy_on_read_init(void) bdrv_register(_copy_on_read); } + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *cor_filter_bs; +Error *local_err = NULL; + +cor_filter_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create COR-filter node: "); +return NULL; +} + +if (!qdict_get_try_str(node_options, "node-name")) { +cor_filter_bs->implicit = true; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +return cor_filter_bs; +} + + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filter_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. 
*/ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, _abort); +bdrv_replace_node(cor_filter_bs, bs, _abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + block_init(bdrv_copy_on_read_init); diff --git a/block/copy-on-read.h b/block/copy-on-read.h new file mode 100644 index 000..d6f2422 --- /dev/null +++ b/block/copy-on-read.h @@ -0,0 +1,35 @@ +/* + * Copy-on-read filter block driver + * + * The filter driver performs Copy-On-Read (COR) operations + * + * Copyright (c) 2018-2020 Virtuozzo International GmbH. + * + * Author: + * Andrey Shinkevich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any la
[PATCH v11 04/13] copy-on-read: pass overlay base node name to COR driver
We are going to use the COR-filter for a block-stream job. To limit COR operations by the base node in the backing chain during stream job, pass the name of overlay base node to the copy-on-read driver as base node itself may change due to possible concurrent jobs. The rest of the functionality will be implemented in the patch that follows. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index bcccf0f..c578b1b 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,19 +24,24 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; +BlockDriverState *base_overlay; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BlockDriverState *base_overlay = NULL; BDRVStateCOR *state = bs->opaque; +/* We need the base overlay node rather than the base itself */ +const char *base_overlay_node = qdict_get_try_str(options, "base"); bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -52,7 +57,16 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +if (base_overlay_node) { +qdict_del(options, "base"); +base_overlay = bdrv_lookup_bs(NULL, base_overlay_node, errp); +if (!base_overlay) { +error_setg(errp, QERR_BASE_NOT_FOUND, base_overlay_node); +return -EINVAL; +} +} state->active = true; +state->base_overlay = base_overlay; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions -- 1.8.3.1
[PATCH v11 03/13] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 4 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4d3db5e..4e66775 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - ); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, ); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 8ce6729..e0540ee 100644 --- a/block/stream.c +++ b/block/stream.c @@ -221,7 +221,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index bebd3ba..d719c47 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2489,6 +2489,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2571,7 +2572,8 @@ void qmp_block_stream(bool 
has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, _err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, _err); if (local_err) { error_propagate(errp, local_err); goto out; diff --git a/include/block/block_int.h b/include/block/block_int.h index 38cad9d..f782737 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1134,6 +1134,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1146,7 +1149,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index 3c16f1e..32fb097 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2533,6 +2533,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. # +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.2) +# # @auto-finalize: When false, this job will wait in a PENDING state after it has # finished its work, waiting for @block-job-finalize before # making any block graph changes. 
@@ -2563,6 +2568,7 @@ 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', '*base-node': 'str', '*backing-file': 'str', '*speed': 'int', '*on-error': 'Block
[PATCH v11 01/13] copy-on-read: Support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2816e61..cb03e0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -137,8 +141,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v11 00/13] Apply COR-filter to the block-stream permanently
The iotest case test_stream_parallel still does not pass after the COR-filter is inserted into the backing chain. As the test case may not be initialized, it does not make sense and was removed again. v11: 04: Base node overlay is used instead of base. 05: Base node overlay is used instead of base. 06: New. 07: New. 08: New. 09: The new BDS-member 'supported_read_flags' is applied. 10: The 'base_metadata' variable renamed to 'base_unfiltered'. 11: New. 12: The backing-file argument is left in the QMP interface. Warning added. 13: The BDRV_REQ_COPY_ON_READ removed from the stream_populate(); The 'implicit' initialization moved back to COR-filter driver. Base node overlay is used instead of base. The v8 Message-Id: <1601383109-110988-1-git-send-email-andrey.shinkev...@virtuozzo.com> Andrey Shinkevich (13): copy-on-read: Support preadv/pwritev_part functions copy-on-read: add filter append/drop functions qapi: add filter-node-name to block-stream copy-on-read: pass overlay base node name to COR driver copy-on-read: limit COR operations to base in COR driver block: modify the comment for BDRV_REQ_PREFETCH flag block: include supported_read_flags into BDS structure copy-on-read: add support for BDRV_REQ_PREFETCH to COR-filter copy-on-read: skip non-guest reads if no copy needed stream: skip filters when writing backing file name to QCOW2 header stream: mark backing-file argument as deprecated stream: remove unused backing-file name parameter block: apply COR-filter to block-stream jobs block/copy-on-read.c | 171 ++--- block/copy-on-read.h | 35 + block/io.c | 3 +- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 112 --- blockdev.c | 25 +++--- docs/system/deprecated.rst | 6 ++ include/block/block.h | 7 +- include/block/block_int.h | 13 +++- qapi/block-core.json | 6 ++ tests/qemu-iotests/030 | 51 ++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++-- 14 files changed, 324 insertions(+), 134 deletions(-) create
mode 100644 block/copy-on-read.h -- 1.8.3.1
Re: [PATCH v10 6/9] copy-on-read: skip non-guest reads if no copy needed
On 07.10.2020 22:28, Vladimir Sementsov-Ogievskiy wrote: 07.10.2020 22:01, Andrey Shinkevich wrote: On 07.10.2020 13:06, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job by the moment. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ block/io.c | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index f53f7e0..5389dca 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -145,10 +145,16 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } - ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); - if (ret < 0) { - return ret; + if ((flags & BDRV_REQ_PREFETCH) & BDRV_REQ_PREFETCH is documented to be only used with BDRV_REQ_COPY_ON_READ. But here BDRV_REQ_COPY_ON_READ appears intermediately. We should change documentation in block.h in a separate patch (and probably code in bdrv_aligned_preadv()) OK, we will come here without the BDRV_REQ_PREFETCH flag set. flag BDRV_REQ_PREFETCH should be set in stream job. Where should it be handled, I don't follow? 
If we leave block/io.c unchanged in this patch, what I'm agreeing with, we'll come to the COR-driver with the hardcoded flags = 0 : #4 0x55a22bb480cf in cor_co_preadv_part (bs=0x55a22d593710, offset=0, bytes=524288, qiov=0x0, qiov_offset=0, flags=0) at ../block/copy-on-read.c:149 #5 0x55a22badcb1d in bdrv_driver_preadv (bs=0x55a22d593710, offset=0, bytes=524288, qiov=0x0, qiov_offset=0, flags=0) at ../block/io.c:1129 #6 0x55a22baddc81 in bdrv_aligned_preadv (child=0x55a22d814780, req=0x7f8c1abffce0, offset=0, bytes=524288, align=1, qiov=0x0, qiov_offset=0, flags=512) at ../block/io.c:1515 #7 0x55a22bade59a in bdrv_co_preadv_part (child=0x55a22d814780, offset=0, bytes=524288, qiov=0x0, qiov_offset=0, flags=BDRV_REQ_PREFETCH) at ../block/io.c:1757 #8 0x55a22bade3d2 in bdrv_co_preadv (child=0x55a22d814780, offset=0, bytes=524288, qiov=0x0, flags=BDRV_REQ_PREFETCH) at ../block/io.c:1715 #9 0x55a22baf5d09 in blk_do_preadv (blk=0x55a22d818c00, offset=0, bytes=524288, qiov=0x0, flags=BDRV_REQ_PREFETCH) at ../block/block-backend.c:1211 #10 0x55a22baf5d61 in blk_co_preadv (blk=0x55a22d818c00, offset=0, bytes=524288, qiov=0x0, flags=BDRV_REQ_PREFETCH) at ../block/block-backend.c:1223 #11 0x55a22bab4eba in stream_populate (blk=0x55a22d818c00, offset=0, bytes=524288) at ../block/stream.c:50 #12 0x55a22bab52c2 in stream_run (job=0x55a22d810a20, errp=0x55a22d810aa0) at ../block/stream.c:162 #13 0x55a22bab79f0 in job_co_entry (opaque=0x55a22d810a20) at ../job.c:908 So, the only way for the COR-filter driver to differ between guests reads and the stream job is to check the qiov pointer for NULL and reset the flags as appropriate. This is what I am going to do in the next version. 
Andrey To differ between guest reads and the stream job ones, we would set it here by checking for the qiov NULL pointer: diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 4e3b1c5..df2c2ab 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -144,6 +144,9 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, n, ); if (ret) { local_flags |= BDRV_REQ_COPY_ON_READ; + if (!qiov) { + local_flags |= BDRV_REQ_PREFETCH; if qiov is NULL, this means that flags must include BDRV_REQ_PREFETCH. local_flags should inherit flags I think. + } } } Andrey + !(local_flags & BDRV_REQ_COPY_ON_READ)) { + /* Skip non-guest reads if no copy needed */ + } else { + extra new-line ? + ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..62b75a5 100644 --- a/block/io.c +++ b/block/io.c @@ -1388,7 +1388,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, qemu_iovec_init_buf(_qiov, bounce_buffer, pnum); ret = bdrv_driver_preadv(bs, cluster_offset, pnum, - _qiov, 0, 0); + _qiov, 0, flags & BDRV_REQ_PREFETCH); Why? In this place we want to read. We'll write back the data a few lines below. What will we write, if underlying
Re: [PATCH v10 8/9] block: remove unused backing-file name parameter
On 07.10.2020 13:21, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: The block stream QMP parameter backing-file is in use no more. It designates a backing file name to set in QCOW2 image header after the block stream job finished. The base file name is used instead. Signed-off-by: Andrey Shinkevich We can't just remove it without a deprecation period of three releases. It has not been in use for a long time. It's time. So actually, in a previous patch, we should implement new behavior for automatic backing-file detection if this parameter is unspecified. And keep old behavior for backing-file-name if it is given. Hmm. Or, probably, we can use direct base for base-filename? And in cases when we should skip filters (for example of parallel jobs) user should specify backing-file explicitly? The backing_file_str is always specified if the base is specified and is always equal to the base->filename. So, the user's backing file name is always NULL for the stream job. Furthermore, it is not checked for being the correct backing node and can lead to a wrong record in the QCOW2 header. 
Andrey --- block/monitor/block-hmp-cmds.c | 2 +- block/stream.c | 6 +- blockdev.c | 17 + include/block/block_int.h | 2 +- qapi/block-core.json | 17 + 5 files changed, 5 insertions(+), 39 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4e66775..5f19499 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -506,7 +506,7 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) int64_t speed = qdict_get_try_int(qdict, "speed", 0); qmp_block_stream(true, device, device, base != NULL, base, false, NULL, - false, NULL, qdict_haskey(qdict, "speed"), speed, true, + qdict_haskey(qdict, "speed"), speed, true, BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, false, ); diff --git a/block/stream.c b/block/stream.c index b0719e9..fe2663f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -34,7 +34,6 @@ typedef struct StreamBlockJob { BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; - char *backing_file_str; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -103,8 +102,6 @@ static void stream_clean(Job *job) blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); bdrv_reopen_set_read_only(bs, true, NULL); } - - g_free(s->backing_file_str); } static int coroutine_fn stream_run(Job *job, Error **errp) @@ -220,7 +217,7 @@ static const BlockJobDriver stream_job_driver = { }; void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, + BlockDriverState *base, int creation_flags, int64_t speed, BlockdevOnError on_error, const char *filter_node_name, @@ -295,7 +292,6 @@ void stream_start(const char *job_id, BlockDriverState *bs, s->base_overlay = base_overlay; s->above_base = above_base; - s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; diff --git a/blockdev.c b/blockdev.c 
index d719c47..b223601 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2486,7 +2486,6 @@ out: void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_base, const char *base, bool has_base_node, const char *base_node, - bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, bool has_filter_node_name, const char *filter_node_name, @@ -2498,7 +2497,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; - const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2526,7 +2524,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); - base_name = base; } if (has_base_node) { @@ -2541,7 +2538,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); - base_name = bas
Re: [PATCH v10 6/9] copy-on-read: skip non-guest reads if no copy needed
On 07.10.2020 13:06, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job by the moment. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ block/io.c | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index f53f7e0..5389dca 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -145,10 +145,16 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } - ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); - if (ret < 0) { - return ret; + if ((flags & BDRV_REQ_PREFETCH) & BDRV_REQ_PREFETCH is documented to be only used with BDRV_REQ_COPY_ON_READ. But here BDRV_REQ_COPY_ON_READ appears intermediately. We should change documentation in block.h in a separate patch (and probably code in bdrv_aligned_preadv()) OK, we will come here without the BDRV_REQ_PREFETCH flag set. To differ between guest reads and the stream job ones, we would set it here by checking for the qiov NULL pointer: diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 4e3b1c5..df2c2ab 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -144,6 +144,9 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, n, ); if (ret) { local_flags |= BDRV_REQ_COPY_ON_READ; +if (!qiov) { +local_flags |= BDRV_REQ_PREFETCH; +} } } Andrey + !(local_flags & BDRV_REQ_COPY_ON_READ)) { + /* Skip non-guest reads if no copy needed */ + } else { + extra new-line ? 
+ ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..62b75a5 100644 --- a/block/io.c +++ b/block/io.c @@ -1388,7 +1388,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, qemu_iovec_init_buf(_qiov, bounce_buffer, pnum); ret = bdrv_driver_preadv(bs, cluster_offset, pnum, - _qiov, 0, 0); + _qiov, 0, flags & BDRV_REQ_PREFETCH); Why? In this place we want to read. We'll write back the data a few lines below. What will we write, if underlying driver decides to do nothing because of BDRV_REQ_PREFETCH? See my comment above please. if (ret < 0) { goto err; }
Re: [PATCH v10 9/9] block: apply COR-filter to block-stream jobs
On 07.10.2020 20:27, Andrey Shinkevich wrote: On 29.09.2020 15:38, Andrey Shinkevich wrote: This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 93 ++ tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 83 insertions(+), 86 deletions(-) diff --git a/block/stream.c b/block/stream.c index fe2663f..240b3dc 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ One more change missed, as we use the COR-filter: @@ -47,8 +47,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk, { assert(bytes < SIZE_MAX); - return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); +return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH); Sorry, with the only flag BDRV_REQ_PREFETCH set. A change in the comment at the flag BDRV_REQ_PREFETCH is coming with a separate patch as Vladimir suggested. Andrey + return blk_co_preadv(blk, offset, bytes, NULL, 0); } Andrey
Re: [PATCH v10 9/9] block: apply COR-filter to block-stream jobs
On 29.09.2020 15:38, Andrey Shinkevich wrote: This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 93 ++ tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 83 insertions(+), 86 deletions(-) diff --git a/block/stream.c b/block/stream.c index fe2663f..240b3dc 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ One more change missed, as we use the COR-filter: @@ -47,8 +47,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk, { assert(bytes < SIZE_MAX); -return blk_co_preadv(blk, offset, bytes, NULL, - BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); +return blk_co_preadv(blk, offset, bytes, NULL, 0); } Andrey
Re: [PATCH v10 5/9] copy-on-read: limit guest COR activity to base in COR driver
On 05.10.2020 17:58, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: Limit the guest's COR operations by the base node in the backing chain when the base node name is given. It will be useful for a block stream job when the COR-filter is applied. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 38 -- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index e04092f..f53f7e0 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -121,8 +121,42 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { - return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); + int64_t n = 0; + int64_t size = offset + bytes; + int local_flags; + int ret; + BDRVStateCOR *state = bs->opaque; + + if (!state->base_bs) { + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); + } + + while (offset < size) { + local_flags = flags; + + /* In case of failure, try to copy-on-read anyway */ But you add the flag only in case of success. On any failure of further is*allocated calls we should set the flag. Actually, I myself would prefer returning the error instead. Andrey + ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); + if (!ret) { + ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_bs, false, offset, n, ); + if (ret > 0) { + local_flags |= BDRV_REQ_COPY_ON_READ; + } + } + + ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } + + offset += n; + qiov_offset += n; + bytes -= n; + } + + return 0; }
Re: [PATCH v10 5/9] copy-on-read: limit guest COR activity to base in COR driver
On 05.10.2020 17:58, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: Limit the guest's COR operations by the base node in the backing chain when the base node name is given. It will be useful for a block stream job when the COR-filter is applied. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 38 -- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index e04092f..f53f7e0 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -121,8 +121,42 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { - return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); + int64_t n = 0; + int64_t size = offset + bytes; + int local_flags; + int ret; + BDRVStateCOR *state = bs->opaque; + + if (!state->base_bs) { + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); + } + + while (offset < size) { + local_flags = flags; + + /* In case of failure, try to copy-on-read anyway */ But you add the flag only in case of success. On any failure of further is*allocated calls we should set the flag. Yes, thanks. Andrey + ret = bdrv_is_allocated(bs->file->bs, offset, bytes, ); + if (!ret) { + ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_bs, false, offset, n, ); + if (ret > 0) { + local_flags |= BDRV_REQ_COPY_ON_READ; + } + } + + ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); + if (ret < 0) { + return ret; + } + + offset += n; + qiov_offset += n; + bytes -= n; + } + + return 0; }
Re: [PATCH v10 2/9] copy-on-read: add filter append/drop functions
On 05.10.2020 16:34, Vladimir Sementsov-Ogievskiy wrote: 29.09.2020 15:38, Andrey Shinkevich wrote: Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 84 block/copy-on-read.h | 35 ++ 2 files changed, 119 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..3c8231f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,21 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { + bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { + BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +52,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); + state->active = true; + + /* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +74,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { + BDRVStateCOR *s = bs->opaque; + + if (!s->active) { + /* + * While the filter is being removed + */ + *nperm = 0; + *nshared = BLK_PERM_ALL; + return; + } + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +163,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name = "copy-on-read", + .instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm = cor_child_perm, @@ -159,4 +188,59 @@ static void bdrv_copy_on_read_init(void) bdrv_register(_copy_on_read); } + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + QDict *node_options, + int flags, Error **errp) Ok, now function can add ~any filter, not only COR.. But it's a pair for bdrv_cor_filter_drop(), and with "active" hack we don't want make the functions generic I think. So it's OK for now to keep function here and named _cor_. +{ + BlockDriverState *cor_filter_bs; + Error *local_err = NULL; + + cor_filter_bs = bdrv_open(NULL, NULL, node_options, flags, errp); + if (cor_filter_bs == NULL) { + error_prepend(errp, "Could not create COR-filter node: "); + return NULL; + } You've dropped setting ->implicit field if filter_node_name not specified. Probably caller now can do it.. I don't really care about implicit case, so it's OK for me if it works with iotests. Thank you for your R-B. The idea behind setting the 'implicit' member by a caller is to prepare the code for the node replacement by a function at the block generic layer in future. In the scope of this series, that may be better to keep it here. Andrey So, Reviewed-by: Vladimir Sementsov-Ogievskiy
[PATCH v10 7/9] stream: skip filters when writing backing file name to QCOW2 header
Avoid writing a filter JSON-name to QCOW2 image when the backing file is changed after the block stream job. Signed-off-by: Andrey Shinkevich --- block/stream.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0540ee..b0719e9 100644 --- a/block/stream.c +++ b/block/stream.c @@ -65,6 +65,7 @@ static int stream_prepare(Job *job) BlockDriverState *bs = blk_bs(bjob->blk); BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); +BlockDriverState *base_metadata = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; @@ -73,10 +74,10 @@ static int stream_prepare(Job *job) if (bdrv_cow_child(unfiltered_bs)) { const char *base_id = NULL, *base_fmt = NULL; -if (base) { -base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; +if (base_metadata) { +base_id = base_metadata->filename; +if (base_metadata->drv) { +base_fmt = base_metadata->drv->format_name; } } bdrv_set_backing_hd(unfiltered_bs, base, _err); -- 1.8.3.1
[PATCH v10 9/9] block: apply COR-filter to block-stream jobs
This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 93 ++ tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 83 insertions(+), 86 deletions(-) diff --git a/block/stream.c b/block/stream.c index fe2663f..240b3dc 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -33,6 +35,8 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; bool bs_read_only; bool chain_frozen; @@ -52,23 +56,20 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, 
StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = >common; -bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); BlockDriverState *base_metadata = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_backing_chain(bs, s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); s->chain_frozen = false; if (bdrv_cow_child(unfiltered_bs)) { @@ -94,13 +95,14 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); + +bdrv_cor_filter_drop(s->cor_filter_bs); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); -bdrv_reopen_set_read_only(bs, true, NULL); +bdrv_reopen_set_read_only(s->target_bs, true, NULL); } } @@ -108,9 +110,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); -bool enable_cor = !bdrv_cow_child(s->base_overlay); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -122,21 +122,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) return 0; } -len = bdrv_getlength(bs); +len = bdrv_getlength(s->target_bs); if (len < 0) 
{ return len; } job_progress_set_remaining(>common.job, len); -/* Turn on copy-on-read for the whole block device so that guest read - * requests help us make progress. Only do this when copying the entire - * backing chain since the copy-on-read operation does not take base into - * account. - */ -if (enable_cor) { -bdrv_enable_copy_on_read(bs); -} - for (
[PATCH v10 6/9] copy-on-read: skip non-guest reads if no copy needed
If the flag BDRV_REQ_PREFETCH was set, pass it further to the COR-driver to skip unneeded reading. It can be taken into account for the COR-algorithms optimization. That check is being made during the block stream job by the moment. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 14 ++ block/io.c | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index f53f7e0..5389dca 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -145,10 +145,16 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, } } -ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, - local_flags); -if (ret < 0) { -return ret; +if ((flags & BDRV_REQ_PREFETCH) & +!(local_flags & BDRV_REQ_COPY_ON_READ)) { +/* Skip non-guest reads if no copy needed */ +} else { + +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} } offset += n; diff --git a/block/io.c b/block/io.c index 11df188..62b75a5 100644 --- a/block/io.c +++ b/block/io.c @@ -1388,7 +1388,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, qemu_iovec_init_buf(_qiov, bounce_buffer, pnum); ret = bdrv_driver_preadv(bs, cluster_offset, pnum, - _qiov, 0, 0); + _qiov, 0, flags & BDRV_REQ_PREFETCH); if (ret < 0) { goto err; } -- 1.8.3.1
[PATCH v10 4/9] copy-on-read: pass base node name to COR driver
To limit the guest's COR operations by the base node in the backing chain during stream job, pass the base node name to the copy-on-read driver. The rest of the functionality will be implemented in the patch that follows. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 13 + 1 file changed, 13 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 3c8231f..e04092f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,19 +24,23 @@ #include "block/block_int.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qmp/qerror.h" #include "qapi/qmp/qdict.h" #include "block/copy-on-read.h" typedef struct BDRVStateCOR { bool active; +BlockDriverState *base_bs; } BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BlockDriverState *base_bs = NULL; BDRVStateCOR *state = bs->opaque; +const char *base_node = qdict_get_try_str(options, "base"); bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, @@ -52,7 +56,16 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +if (base_node) { +qdict_del(options, "base"); +base_bs = bdrv_lookup_bs(NULL, base_node, errp); +if (!base_bs) { +error_setg(errp, QERR_BASE_NOT_FOUND, base_node); +return -EINVAL; +} +} state->active = true; +state->base_bs = base_bs; /* * We don't need to call bdrv_child_refresh_perms() now as the permissions -- 1.8.3.1
[PATCH v10 8/9] block: remove unused backing-file name parameter
The block stream QMP parameter backing-file is in use no more. It designates a backing file name to set in QCOW2 image header after the block stream job finished. The base file name is used instead. Signed-off-by: Andrey Shinkevich --- block/monitor/block-hmp-cmds.c | 2 +- block/stream.c | 6 +- blockdev.c | 17 + include/block/block_int.h | 2 +- qapi/block-core.json | 17 + 5 files changed, 5 insertions(+), 39 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4e66775..5f19499 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -506,7 +506,7 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) int64_t speed = qdict_get_try_int(qdict, "speed", 0); qmp_block_stream(true, device, device, base != NULL, base, false, NULL, - false, NULL, qdict_haskey(qdict, "speed"), speed, true, + qdict_haskey(qdict, "speed"), speed, true, BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, false, ); diff --git a/block/stream.c b/block/stream.c index b0719e9..fe2663f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -34,7 +34,6 @@ typedef struct StreamBlockJob { BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; -char *backing_file_str; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -103,8 +102,6 @@ static void stream_clean(Job *job) blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); bdrv_reopen_set_read_only(bs, true, NULL); } - -g_free(s->backing_file_str); } static int coroutine_fn stream_run(Job *job, Error **errp) @@ -220,7 +217,7 @@ static const BlockJobDriver stream_job_driver = { }; void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, + BlockDriverState *base, int creation_flags, int64_t speed, BlockdevOnError on_error, const char *filter_node_name, @@ -295,7 +292,6 @@ void stream_start(const char *job_id, 
BlockDriverState *bs, s->base_overlay = base_overlay; s->above_base = above_base; -s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; diff --git a/blockdev.c b/blockdev.c index d719c47..b223601 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2486,7 +2486,6 @@ out: void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_base, const char *base, bool has_base_node, const char *base_node, - bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, bool has_filter_node_name, const char *filter_node_name, @@ -2498,7 +2497,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; -const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2526,7 +2524,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); -base_name = base; } if (has_base_node) { @@ -2541,7 +2538,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); -base_name = base_bs->filename; } /* Check for op blockers in the whole chain between bs and base */ @@ -2553,17 +2549,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } } -/* if we are streaming the entire chain, the result will have no backing - * file, and specifying one is therefore an error */ -if (base_bs == NULL && has_backing_file) { -error_setg(errp, "backing file specified, but streaming the " - "entire chain"); -goto out; -} - -/* backing_file string overrides base bs filename */ -base_name = has_backing_file ? 
backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2571,7 +2556,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } -str
[PATCH v10 3/9] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 4 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4d3db5e..4e66775 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - ); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, ); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 8ce6729..e0540ee 100644 --- a/block/stream.c +++ b/block/stream.c @@ -221,7 +221,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index bebd3ba..d719c47 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2489,6 +2489,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2571,7 +2572,8 @@ void qmp_block_stream(bool 
has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, _err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, _err); if (local_err) { error_propagate(errp, local_err); goto out; diff --git a/include/block/block_int.h b/include/block/block_int.h index 38cad9d..f782737 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1134,6 +1134,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1146,7 +1149,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index 3c16f1e..32fb097 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2533,6 +2533,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. # +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.2) +# # @auto-finalize: When false, this job will wait in a PENDING state after it has # finished its work, waiting for @block-job-finalize before # making any block graph changes. 
@@ -2563,6 +2568,7 @@ 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', '*base-node': 'str', '*backing-file': 'str', '*speed': 'int', '*on-error': 'Block
[PATCH v10 2/9] copy-on-read: add filter append/drop functions
Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 84 block/copy-on-read.h | 35 ++ 2 files changed, 119 insertions(+) create mode 100644 block/copy-on-read.h diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..3c8231f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,21 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +52,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + +/* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +74,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +163,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -159,4 +188,59 @@ static void bdrv_copy_on_read_init(void) bdrv_register(_copy_on_read); } + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *cor_filter_bs; +Error *local_err = NULL; + +cor_filter_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create COR-filter node: "); +return NULL; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +return cor_filter_bs; +} + + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filter_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. 
*/ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, _abort); +bdrv_replace_node(cor_filter_bs, bs, _abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + block_init(bdrv_copy_on_read_init); diff --git a/block/copy-on-read.h b/block/copy-on-read.h new file mode 100644 index 000..d6f2422 --- /dev/null +++ b/block/copy-on-read.h @@ -0,0 +1,35 @@ +/* + * Copy-on-read filter block driver + * + * The filter driver performs Copy-On-Read (COR) operations + * + * Copyright (c) 2018-2020 Virtuozzo International GmbH. + * + * Author: + * Andrey Shinkevich + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANT
[PATCH v10 1/9] copy-on-read: Support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 2816e61..cb03e0f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -137,8 +141,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v10 0/9] Apply COR-filter to the block-stream permanently
Although the patch "freeze link to base node..." was removed from the series in version 9, the iotest case test_stream_parallel still does not pass after the COR-filter is inserted into the backing chain. As the test case cannot be initialized, it no longer makes sense and was removed again. The check with bdrv_is_allocated_above() takes place both in the COR-filter and in the block-stream job. An optimization of the block-stream job based on the filter functionality may be made in a separate series. v10: 02: The missing new file block/copy-on-read.h was added. v9: 02: Refactored. 04: Base node name is used instead of the file name. 05: New implementation based on Max's review. 06: New. 07: New. The patch "freeze link to base node..." was deleted. 08: New. 09: The filter node options are initialized. The v8 Message-Id: <1598633579-221780-1-git-send-email-andrey.shinkev...@virtuozzo.com> Andrey Shinkevich (9): copy-on-read: Support preadv/pwritev_part functions copy-on-read: add filter append/drop functions qapi: add filter-node-name to block-stream copy-on-read: pass base node name to COR driver copy-on-read: limit guest COR activity to base in COR driver copy-on-read: skip non-guest reads if no copy needed stream: skip filters when writing backing file name to QCOW2 header block: remove unused backing-file name parameter block: apply COR-filter to block-stream jobs block/copy-on-read.c | 165 ++--- block/copy-on-read.h | 35 + block/io.c | 2 +- block/monitor/block-hmp-cmds.c | 6 +- block/stream.c | 112 +--- blockdev.c | 21 +- include/block/block_int.h | 9 ++- qapi/block-core.json | 23 ++ tests/qemu-iotests/030 | 51 ++--- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++-- 12 files changed, 302 insertions(+), 147 deletions(-) create mode 100644 block/copy-on-read.h -- 1.8.3.1
[PATCH v10 5/9] copy-on-read: limit guest COR activity to base in COR driver
When a base node name is given, limit the guest's COR operations to data allocated above the base node in the backing chain. This will be useful for a block-stream job when the COR-filter is applied. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 38 -- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index e04092f..f53f7e0 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -121,8 +121,42 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, size_t qiov_offset, int flags) { -return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, - flags | BDRV_REQ_COPY_ON_READ); +int64_t n = 0; +int64_t size = offset + bytes; +int local_flags; +int ret; +BDRVStateCOR *state = bs->opaque; + +if (!state->base_bs) { +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); +} + +while (offset < size) { +local_flags = flags; + +/* In case of failure, try to copy-on-read anyway */ +ret = bdrv_is_allocated(bs->file->bs, offset, bytes, &n); +if (!ret) { +ret = bdrv_is_allocated_above(bdrv_cow_bs(bs->file->bs), + state->base_bs, false, offset, n, &n); +if (ret > 0) { +local_flags |= BDRV_REQ_COPY_ON_READ; +} +} + +ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset, + local_flags); +if (ret < 0) { +return ret; +} + +offset += n; +qiov_offset += n; +bytes -= n; +} + +return 0; } -- 1.8.3.1
[PATCH v9 9/9] block: apply COR-filter to block-stream jobs
This patch completes the series with the COR-filter insertion for block-stream operations. Adding the filter makes it possible for copied regions to be discarded in backing files during the block-stream job, what will reduce the disk overuse. The COR-filter insertion incurs changes in the iotests case 245:test_block_stream_4 that reopens the backing chain during a block-stream job. There are changes in the iotests #030 as well. The iotests case 030:test_stream_parallel was deleted due to multiple conflicts between the concurrent job operations over the same backing chain. The base backing node for one job is the top node for another job. It may change due to the filter node inserted into the backing chain while both jobs are running. Another issue is that the parts of the backing chain are being frozen by the running job and may not be changed by the concurrent job when needed. The concept of the parallel jobs with common nodes is considered vital no more. Signed-off-by: Andrey Shinkevich --- block/stream.c | 93 ++ tests/qemu-iotests/030 | 51 +++-- tests/qemu-iotests/030.out | 4 +- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 19 +++--- 5 files changed, 83 insertions(+), 86 deletions(-) diff --git a/block/stream.c b/block/stream.c index fe2663f..240b3dc 100644 --- a/block/stream.c +++ b/block/stream.c @@ -17,8 +17,10 @@ #include "block/blockjob_int.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -33,6 +35,8 @@ typedef struct StreamBlockJob { BlockJob common; BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; bool bs_read_only; bool chain_frozen; @@ -52,23 +56,20 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, 
StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = >common; -bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base); BlockDriverState *base_metadata = bdrv_skip_filters(base); Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_backing_chain(bs, s->above_base); +bdrv_unfreeze_backing_chain(s->cor_filter_bs, s->above_base); s->chain_frozen = false; if (bdrv_cow_child(unfiltered_bs)) { @@ -94,13 +95,14 @@ static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = >common; -BlockDriverState *bs = blk_bs(bjob->blk); + +bdrv_cor_filter_drop(s->cor_filter_bs); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { /* Give up write permissions before making it read-only */ blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); -bdrv_reopen_set_read_only(bs, true, NULL); +bdrv_reopen_set_read_only(s->target_bs, true, NULL); } } @@ -108,9 +110,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); -BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs); -bool enable_cor = !bdrv_cow_child(s->base_overlay); +BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs); int64_t len; int64_t offset = 0; uint64_t delay_ns = 0; @@ -122,21 +122,12 @@ static int coroutine_fn stream_run(Job *job, Error **errp) return 0; } -len = bdrv_getlength(bs); +len = bdrv_getlength(s->target_bs); if (len < 0) 
{ return len; } job_progress_set_remaining(>common.job, len); -/* Turn on copy-on-read for the whole block device so that guest read - * requests help us make progress. Only do this when copying the entire - * backing chain since the copy-on-read operation does not take base into - * account. - */ -if (enable_cor) { -bdrv_enable_copy_on_read(bs); -} - for (
[PATCH v9 8/9] block: remove unused backing-file name parameter
The block stream QMP parameter backing-file is in use no more. It designates a backing file name to set in QCOW2 image header after the block stream job finished. The base file name is used instead. Signed-off-by: Andrey Shinkevich --- block/monitor/block-hmp-cmds.c | 2 +- block/stream.c | 6 +- blockdev.c | 17 + include/block/block_int.h | 2 +- qapi/block-core.json | 17 + 5 files changed, 5 insertions(+), 39 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4e66775..5f19499 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -506,7 +506,7 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) int64_t speed = qdict_get_try_int(qdict, "speed", 0); qmp_block_stream(true, device, device, base != NULL, base, false, NULL, - false, NULL, qdict_haskey(qdict, "speed"), speed, true, + qdict_haskey(qdict, "speed"), speed, true, BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, false, ); diff --git a/block/stream.c b/block/stream.c index b0719e9..fe2663f 100644 --- a/block/stream.c +++ b/block/stream.c @@ -34,7 +34,6 @@ typedef struct StreamBlockJob { BlockDriverState *base_overlay; /* COW overlay (stream from this) */ BlockDriverState *above_base; /* Node directly above the base */ BlockdevOnError on_error; -char *backing_file_str; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -103,8 +102,6 @@ static void stream_clean(Job *job) blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, _abort); bdrv_reopen_set_read_only(bs, true, NULL); } - -g_free(s->backing_file_str); } static int coroutine_fn stream_run(Job *job, Error **errp) @@ -220,7 +217,7 @@ static const BlockJobDriver stream_job_driver = { }; void stream_start(const char *job_id, BlockDriverState *bs, - BlockDriverState *base, const char *backing_file_str, + BlockDriverState *base, int creation_flags, int64_t speed, BlockdevOnError on_error, const char *filter_node_name, @@ -295,7 +292,6 @@ void stream_start(const char *job_id, 
BlockDriverState *bs, s->base_overlay = base_overlay; s->above_base = above_base; -s->backing_file_str = g_strdup(backing_file_str); s->bs_read_only = bs_read_only; s->chain_frozen = true; diff --git a/blockdev.c b/blockdev.c index d719c47..b223601 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2486,7 +2486,6 @@ out: void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_base, const char *base, bool has_base_node, const char *base_node, - bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, bool has_filter_node_name, const char *filter_node_name, @@ -2498,7 +2497,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs = NULL; AioContext *aio_context; Error *local_err = NULL; -const char *base_name = NULL; int job_flags = JOB_DEFAULT; if (!has_on_error) { @@ -2526,7 +2524,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, goto out; } assert(bdrv_get_aio_context(base_bs) == aio_context); -base_name = base; } if (has_base_node) { @@ -2541,7 +2538,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } assert(bdrv_get_aio_context(base_bs) == aio_context); bdrv_refresh_filename(base_bs); -base_name = base_bs->filename; } /* Check for op blockers in the whole chain between bs and base */ @@ -2553,17 +2549,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } } -/* if we are streaming the entire chain, the result will have no backing - * file, and specifying one is therefore an error */ -if (base_bs == NULL && has_backing_file) { -error_setg(errp, "backing file specified, but streaming the " - "entire chain"); -goto out; -} - -/* backing_file string overrides base bs filename */ -base_name = has_backing_file ? 
backing_file : base_name; - if (has_auto_finalize && !auto_finalize) { job_flags |= JOB_MANUAL_FINALIZE; } @@ -2571,7 +2556,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, job_flags |= JOB_MANUAL_DISMISS; } -str
[PATCH v9 2/9] copy-on-read: add filter append/drop functions
Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 84 1 file changed, 84 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index cb03e0f..3c8231f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -23,11 +23,21 @@ #include "qemu/osdep.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" + + +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, _of_bds, BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false, errp); @@ -42,6 +52,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + +/* + * We don't need to call bdrv_child_refresh_perms() now as the permissions + * will be updated later when the filter node gets its parent. 
+ */ + return 0; } @@ -57,6 +74,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -135,6 +163,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -159,4 +188,59 @@ static void bdrv_copy_on_read_init(void) bdrv_register(_copy_on_read); } + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + QDict *node_options, + int flags, Error **errp) +{ +BlockDriverState *cor_filter_bs; +Error *local_err = NULL; + +cor_filter_bs = bdrv_open(NULL, NULL, node_options, flags, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create COR-filter node: "); +return NULL; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, _err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +return cor_filter_bs; +} + + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filter_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. 
*/ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, _abort); +bdrv_replace_node(cor_filter_bs, bs, _abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + block_init(bdrv_copy_on_read_init); -- 1.8.3.1