Re: [PATCH v5 2/2] nbd/server: Mark negotiation functions as coroutine_fn
On 09.04.24 18:49, Eric Blake wrote: On Tue, Apr 09, 2024 at 09:30:39AM +0300, Vladimir Sementsov-Ogievskiy wrote: On 08.04.24 19:00, Eric Blake wrote: nbd_negotiate() is already marked coroutine_fn. And given the fix in the previous patch to have nbd_negotiate_handle_starttls not create and wait on a g_main_loop (as that would violate coroutine constraints), it is worth marking the rest of the related static functions reachable only during option negotiation as also being coroutine_fn. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Eric Blake --- nbd/server.c | 102 +-- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 98ae0e16326..1857fba51c1 100644 [..] { int rc; g_autofree char *name = NULL; @@ -755,7 +764,8 @@ struct NBDTLSServerHandshakeData { Coroutine *co; }; -static void nbd_server_tls_handshake(QIOTask *task, void *opaque) +static coroutine_fn void This is not, that's a callback for tls handshake, which is not coroutine context as I understand. The call chain is: nbd_negotiate() - coroutine_fn before this patch nbd_negotiate_options() - marked coroutine_fn here nbd_negotiate_handle_starttls() - marked coroutine_fn here qio_channel_tls_handshake() - in io/channel-tls.c; not marked coroutine_fn, but works both in or out of coroutine context ... nbd_server_tls_handshake() - renamed in 1/2 of this series without this hunk: I can drop that particular marking. There are cases where the callback is reached without having done the qemu_coroutine_yield() in nbd_negotiate_handle_starttls; but there are also cases where the IO took enough time that the coroutine yielded and it is that callback that reawakens it. 
The key thing for me is that in this case (when coroutine yielded), nbd_server_tls_handshake() would finally be called from glib IO handler, set in qio_channel_tls_handshake() qio_channel_tls_handshake_task() qio_channel_add_watch_full() g_source_set_callback(source, (GSourceFunc)func, user_data, notify); <<< , which would definitely not be a coroutine context. Do I understand correctly, that "coroutine_fn" means "only call from coroutine context"[1], or does it mean "could be called from coroutine context"[2]? The comment in osdep.h says: "Mark a function that executes in coroutine context. Functions that execute in coroutine context cannot be called directly from normal functions." ... So I assume, we mean [1]. Reviewed-by: Vladimir Sementsov-Ogievskiy Thanks. -- Best regards, Vladimir
[PATCH] tests/avocado: add hotplug_blk test
Introduce a test, that checks that plug/unplug of virtio-blk device works. (the test is developed by copying hotplug_cpu.py, so keep original copyright) Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/avocado/hotplug_blk.py | 69 1 file changed, 69 insertions(+) create mode 100644 tests/avocado/hotplug_blk.py diff --git a/tests/avocado/hotplug_blk.py b/tests/avocado/hotplug_blk.py new file mode 100644 index 00..5dc30f6616 --- /dev/null +++ b/tests/avocado/hotplug_blk.py @@ -0,0 +1,69 @@ +# Functional test that hotplugs a virtio blk disk and checks it on a Linux +# guest +# +# Copyright (c) 2021 Red Hat, Inc. +# Copyright (c) Yandex +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +import time + +from avocado_qemu import LinuxTest + + +class HotPlug(LinuxTest): +def blockdev_add(self) -> None: +self.vm.cmd('blockdev-add', **{ +'driver': 'null-co', +'size': 1073741824, +'node-name': 'disk' +}) + +def assert_vda(self) -> None: +self.ssh_command('test -e /sys/block/vda') + +def assert_no_vda(self) -> None: +with self.assertRaises(AssertionError): +self.assert_vda() + +def plug(self) -> None: +args = { +'driver': 'virtio-blk-pci', +'drive': 'disk', +'id': 'virtio-disk0', +'bus': 'pci.1', +'addr': 1 +} + +self.assert_no_vda() +self.vm.cmd('device_add', args) +try: +self.assert_vda() +except AssertionError: +time.sleep(1) +self.assert_vda() + +def unplug(self) -> None: +self.vm.cmd('device_del', id='virtio-disk0') + +self.vm.event_wait('DEVICE_DELETED', 1.0, + match={'data': {'device': 'virtio-disk0'}}) + +self.assert_no_vda() + +def test(self) -> None: +""" +:avocado: tags=arch:x86_64 +:avocado: tags=machine:q35 +:avocado: tags=accel:kvm +""" +self.require_accelerator('kvm') +self.vm.add_args('-accel', 'kvm') +self.vm.add_args('-device', 'pcie-pci-bridge,id=pci.1,bus=pcie.0') + +self.launch_and_wait() +self.blockdev_add() + +self.plug() +self.unplug() -- 2.34.1
Re: [PATCH v5 2/2] nbd/server: Mark negotiation functions as coroutine_fn
On 08.04.24 19:00, Eric Blake wrote: nbd_negotiate() is already marked coroutine_fn. And given the fix in the previous patch to have nbd_negotiate_handle_starttls not create and wait on a g_main_loop (as that would violate coroutine constraints), it is worth marking the rest of the related static functions reachable only during option negotiation as also being coroutine_fn. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Eric Blake --- nbd/server.c | 102 +-- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 98ae0e16326..1857fba51c1 100644 [..] { int rc; g_autofree char *name = NULL; @@ -755,7 +764,8 @@ struct NBDTLSServerHandshakeData { Coroutine *co; }; -static void nbd_server_tls_handshake(QIOTask *task, void *opaque) +static coroutine_fn void This is not, that's a callback for tls handshake, which is not coroutine context as I understand. without this hunk: Reviewed-by: Vladimir Sementsov-Ogievskiy +nbd_server_tls_handshake(QIOTask *task, void *opaque) { struct NBDTLSServerHandshakeData *data = opaque; @@ -768,8 +778,8 @@ static void nbd_server_tls_handshake(QIOTask *task, void *opaque) [..] -- Best regards, Vladimir
Re: [PATCH v5 1/2] nbd/server: do not poll within a coroutine context
On 08.04.24 19:00, Eric Blake wrote: From: Zhu Yangyang Coroutines are not supposed to block. Instead, they should yield. The client performs TLS upgrade outside of an AIOContext, during synchronous handshake; this still requires g_main_loop. But the server responds to TLS upgrade inside a coroutine, so a nested g_main_loop is wrong. Since the two callbacks no longer share more than the setting of data.complete and data.error, it's just as easy to use static helpers instead of trying to share a common code path. It is also possible to add assertions that no other code is interfering with the eventual path to qio reaching the callback, whether or not it required a yield or main loop. Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation") Signed-off-by: Zhu Yangyang [eblake: move callbacks to their use point, add assertions] Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH v4] nbd/server: do not poll within a coroutine context
On 05.04.24 20:44, Eric Blake wrote: From: Zhu Yangyang Coroutines are not supposed to block. Instead, they should yield. The client performs TLS upgrade outside of an AIOContext, during synchronous handshake; this still requires g_main_loop. But the server responds to TLS upgrade inside a coroutine, so a nested g_main_loop is wrong. Since the two callbacks no longer share more than the setting of data.complete and data.error, it's just as easy to use static helpers instead of trying to share a common code path. Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation") Signed-off-by: Zhu Yangyang [eblake: move callbacks to their use point] Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy still, some notes below --- v3: https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00375.html in v4, factor even the struct to the .c files, avoiding a union [Vladimir] nbd/nbd-internal.h | 10 -- nbd/client.c | 27 +++ nbd/common.c | 11 --- nbd/server.c | 29 +++-- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index dfa02f77ee4..91895106a95 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -72,16 +72,6 @@ static inline int nbd_write(QIOChannel *ioc, const void *buffer, size_t size, return qio_channel_write_all(ioc, buffer, size, errp) < 0 ? 
-EIO : 0; } -struct NBDTLSHandshakeData { -GMainLoop *loop; -bool complete; -Error *error; -}; - - -void nbd_tls_handshake(QIOTask *task, - void *opaque); - int nbd_drop(QIOChannel *ioc, size_t size, Error **errp); #endif diff --git a/nbd/client.c b/nbd/client.c index 29ffc609a4b..c7141d7a098 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -596,13 +596,31 @@ static int nbd_request_simple_option(QIOChannel *ioc, int opt, bool strict, return 1; } +/* Callback to learn when QIO TLS upgrade is complete */ +struct NBDTLSClientHandshakeData { +bool complete; +Error *error; +GMainLoop *loop; +}; + +static void nbd_client_tls_handshake(QIOTask *task, void *opaque) +{ +struct NBDTLSClientHandshakeData *data = opaque; + +qio_task_propagate_error(task, >error); +data->complete = true; +if (data->loop) { +g_main_loop_quit(data->loop); +} +} + static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, const char *hostname, Error **errp) { int ret; QIOChannelTLS *tioc; -struct NBDTLSHandshakeData data = { 0 }; +struct NBDTLSClientHandshakeData data = { 0 }; ret = nbd_request_simple_option(ioc, NBD_OPT_STARTTLS, true, errp); if (ret <= 0) { @@ -619,18 +637,19 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, return NULL; } qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-client-tls"); -data.loop = g_main_loop_new(g_main_context_default(), FALSE); trace_nbd_receive_starttls_tls_handshake(); qio_channel_tls_handshake(tioc, - nbd_tls_handshake, + nbd_client_tls_handshake, , NULL, NULL); if (!data.complete) { +data.loop = g_main_loop_new(g_main_context_default(), FALSE); g_main_loop_run(data.loop); +g_main_loop_unref(data.loop); probably good to assert(data.complete); } -g_main_loop_unref(data.loop); + if (data.error) { error_propagate(errp, data.error); object_unref(OBJECT(tioc)); diff --git a/nbd/common.c b/nbd/common.c index 3247c1d618a..589a748cfe6 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -47,17 +47,6 @@ int nbd_drop(QIOChannel *ioc, size_t 
size, Error **errp) } -void nbd_tls_handshake(QIOTask *task, - void *opaque) -{ -struct NBDTLSHandshakeData *data = opaque; - -qio_task_propagate_error(task, >error); -data->complete = true; -g_main_loop_quit(data->loop); -} - - const char *nbd_opt_lookup(uint32_t opt) { switch (opt) { diff --git a/nbd/server.c b/nbd/server.c index c3484cc1ebc..ea13cf0e766 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -748,6 +748,23 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp) return rc; } +/* Callback to learn when QIO TLS upgrade is complete */ +struct NBDTLSServerHandshakeData { +bool complete; +Error *error; +Coroutine *co; +}; + +static void nbd_server_tls_handshake(QIOTask *task, void *opaque) +{ +struct NBDTLSServerHandshakeData *data = opaque; + +qio_task_propagate_error(task, >error); +data->complete = true; +if (!qemu_coroutine_entered(data->co)) { +aio_co
Re: [PATCH v3] nbd/server: do not poll within a coroutine context
On 04.04.24 04:42, Eric Blake wrote: From: Zhu Yangyang Coroutines are not supposed to block. Instead, they should yield. The client performs TLS upgrade outside of an AIOContext, during synchronous handshake; this still requires g_main_loop. But the server responds to TLS upgrade inside a coroutine, so a nested g_main_loop is wrong. Since the two callbacks no longer share more than the setting of data.complete and data.error, it's just as easy to use static helpers instead of trying to share a common code path. Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation") Signed-off-by: Zhu Yangyang [eblake: move callbacks to their use point] Signed-off-by: Eric Blake --- After looking at this more, I'm less convinced that there is enough common code here to even be worth trying to share in common.c. This takes the essence of the v2 patch, but refactors it a bit. Maybe, do the complete split, and make separate structure definitions in client.c and server.c, and don't make shared NBDTLSHandshakeData with union? Finally, it's just a simple opaque-structure for static callback function, seems good to keep it in .c as well. v2: https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00019.html nbd/nbd-internal.h | 20 ++-- nbd/client.c | 21 + nbd/common.c | 11 --- nbd/server.c | 21 - 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index dfa02f77ee4..087c6bfc002 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -63,6 +63,16 @@ #define NBD_SET_TIMEOUT _IO(0xab, 9) #define NBD_SET_FLAGS _IO(0xab, 10) +/* Used in NBD_OPT_STARTTLS handling */ +struct NBDTLSHandshakeData { +bool complete; +Error *error; +union { +GMainLoop *loop; +Coroutine *co; +} u; +}; + /* nbd_write * Writes @size bytes to @ioc. Returns 0 on success. */ @@ -72,16 +82,6 @@ static inline int nbd_write(QIOChannel *ioc, const void *buffer, size_t size, return qio_channel_write_all(ioc, buffer, size, errp) < 0 ? 
-EIO : 0; } -struct NBDTLSHandshakeData { -GMainLoop *loop; -bool complete; -Error *error; -}; - - -void nbd_tls_handshake(QIOTask *task, - void *opaque); - int nbd_drop(QIOChannel *ioc, size_t size, Error **errp); #endif diff --git a/nbd/client.c b/nbd/client.c index 29ffc609a4b..c9dc5265404 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -596,6 +596,18 @@ static int nbd_request_simple_option(QIOChannel *ioc, int opt, bool strict, return 1; } +/* Callback to learn when QIO TLS upgrade is complete */ +static void nbd_client_tls_handshake(QIOTask *task, void *opaque) +{ +struct NBDTLSHandshakeData *data = opaque; + +qio_task_propagate_error(task, >error); +data->complete = true; +if (data->u.loop) { +g_main_loop_quit(data->u.loop); +} +} + static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, const char *hostname, Error **errp) @@ -619,18 +631,19 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, return NULL; } qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-client-tls"); -data.loop = g_main_loop_new(g_main_context_default(), FALSE); trace_nbd_receive_starttls_tls_handshake(); qio_channel_tls_handshake(tioc, - nbd_tls_handshake, + nbd_client_tls_handshake, , NULL, NULL); if (!data.complete) { -g_main_loop_run(data.loop); +data.u.loop = g_main_loop_new(g_main_context_default(), FALSE); +g_main_loop_run(data.u.loop); +g_main_loop_unref(data.u.loop); } -g_main_loop_unref(data.loop); + if (data.error) { error_propagate(errp, data.error); object_unref(OBJECT(tioc)); diff --git a/nbd/common.c b/nbd/common.c index 3247c1d618a..589a748cfe6 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -47,17 +47,6 @@ int nbd_drop(QIOChannel *ioc, size_t size, Error **errp) } -void nbd_tls_handshake(QIOTask *task, - void *opaque) -{ -struct NBDTLSHandshakeData *data = opaque; - -qio_task_propagate_error(task, >error); -data->complete = true; -g_main_loop_quit(data->loop); -} - - const char *nbd_opt_lookup(uint32_t opt) { switch (opt) { diff --git 
a/nbd/server.c b/nbd/server.c index c3484cc1ebc..d16726a6326 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -748,6 +748,17 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp) return rc; } +/* Callback to learn when QIO TLS upgrade is complete */ +static void nbd_server_tls_handshake(QIOTask *task, void *opaque) +{ +
Re: [PATCH v4 0/5] backup: discard-source parameter
On 13.03.24 18:28, Vladimir Sementsov-Ogievskiy wrote: Hi all! The main patch is 04, please look at it for description and diagram. v4: add t-b by Fiona add r-b by Fiona to 02-05 (patch 01 still lack an r-b) 05: fix copyrights and subject in the test 04: since 9.0 --> since 9.1 (we missed a soft freeze for 9.0) Vladimir Sementsov-Ogievskiy (5): block/copy-before-write: fix permission block/copy-before-write: support unligned snapshot-discard block/copy-before-write: create block_copy bitmap in filter node qapi: blockdev-backup: add discard-source parameter iotests: add backup-discard-source block/backup.c| 5 +- block/block-copy.c| 12 +- block/copy-before-write.c | 39 - block/copy-before-write.h | 1 + block/replication.c | 4 +- blockdev.c| 2 +- include/block/block-common.h | 2 + include/block/block-copy.h| 2 + include/block/block_int-global-state.h| 2 +- qapi/block-core.json | 4 + tests/qemu-iotests/257.out| 112 ++--- .../qemu-iotests/tests/backup-discard-source | 152 ++ .../tests/backup-discard-source.out | 5 + 13 files changed, 272 insertions(+), 70 deletions(-) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out Thanks for review, applied to my block branch. (r-b to 01 is still appreciated, I will not pull this until 9.1 tree opened) -- Best regards, Vladimir
Re: [PATCH v5] blockcommit: Reopen base image as RO after abort
On 04.04.24 12:11, Alexander Ivanov wrote: If a blockcommit is aborted the base image remains in RW mode, that leads to a fail of subsequent live migration. How to reproduce: $ virsh snapshot-create-as vm snp1 --disk-only *** write something to the disk inside the guest *** $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort $ lsof /vzt/vm.qcow2 COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME qemu-syst 433203 root 45u REG 253,0 1724776448 133 /vzt/vm.qcow2 $ cat /proc/433203/fdinfo/45 pos:0 flags: 02140002 < The last 2 means RW mode If the base image is in RW mode at the end of blockcommit and was in RO mode before blockcommit, reopen the base BDS in RO. Signed-off-by: Alexander Ivanov Reviewed-by: Vladimir Sementsov-Ogievskiy Thanks, applied to my block branch. -- Best regards, Vladimir
Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives
On 03.04.24 20:50, Eric Blake wrote: On Wed, Apr 03, 2024 at 01:24:11PM +0400, Marc-André Lureau wrote: Unfortunately, it doesn't work in all cases. It seems to have issues with some guards: ../block/stream.c: In function ‘stream_run’: ../block/stream.c:216:12: error: ‘ret’ may be used uninitialized [-Werror=maybe-uninitialized] 216 | if (ret < 0) { That one looks like: int ret; WITH_GRAPH_RDLOCK_GUARD() { ret = ...; } if (copy) { ret = ...; } if (ret < 0) I suspect the compiler is seeing the uncertainty possible from the second conditional, and letting it take priority over the certainty that the tweaked macro provided for the first conditional. So, updated macro helps in some cases, but doesn't help here? Intersting, why. What should we do? change the macros + cherry-pick the missing false-positives, or keep this series as is? An uglier macro, with sufficient comments as to why it is ugly (in order to let us have fewer false positives where we have to add initializers) may be less churn in the code base, but I'm not necessarily sold on the ugly macro. Let's see if anyone else expresses an opinion. I think marco + missing is better. No reason to add dead-initializations in cases where new macros helps. Ok Still, would be good to understand, what's the difference, why it help on some cases and not help in another. I don't know, it's like if the analyzer was lazy for this particular case, although there is nothing much different from other usages. If I replace: for (... *var2 = (void *)true; var2; with: for (... *var2 = (void *)true; var2 || true; then it doesn't warn.. but it also doesn't work. We want the body to execute exactly once, not infloop. Interestingly as well, if I change: for (... *var2 = (void *)true; var2; var2 = NULL) for: for (... *var2 = GML_OBJ_(); var2; var2 = NULL) GML_OBJ_() simply being &(GraphLockable) { }), an empty compound literal, then it doesn't work, in all usages. 
So the compiler is not figuring out that the compound literal is sufficient for an unconditional one time through the for loop body. What's worse, different compiler versions will change behavior over time. Making the code ugly to pacify a particular compiler, when that compiler might improve in the future, is a form of chasing windmills. All in all, I am not sure the trick of using var2 is really reliable either. And that's my biggest argument for not making the macro not more complex than it already is. All sounds reasonable, I'm not sure now. I still don't like an idea to satisfy compiler false-positive warnings by extra initializations.. Interesting that older versions do have unitialized-use warnings, but don't fail here (or nobody check?). Is it necessary to fix them at all? Older versions of compiler don't produce these warnings? Is it possible that some optimizations in new GCC version somehow breaks our WITH_ hack, so that it really lead to uninitialized behavior? And we just mask real problem with these patches? Wouldn't it more correct to just drop WITH_ hack, and move to a bit uglier but more gcc-native and fair { QEMU_LOCK_GUARD(lock); ... } ? -- Best regards, Vladimir
Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives
On 03.04.24 11:11, Marc-André Lureau wrote: Hi On Tue, Apr 2, 2024 at 11:24 PM Vladimir Sementsov-Ogievskiy wrote: On 02.04.24 18:34, Eric Blake wrote: On Tue, Apr 02, 2024 at 12:58:43PM +0300, Vladimir Sementsov-Ogievskiy wrote: Again, same false-positives, because of WITH_GRAPH_RDLOCK_GUARD().. Didn't you try to change WITH_ macros somehow, so that compiler believe in our good intentions? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))); \ var; \ qemu_lockable_auto_unlock(var), var = NULL) I can't think of a clever way to rewrite this. The compiler probably thinks the loop may not run, due to the "var" condition. But how to convince it otherwise? it's hard to introduce another variable too.. hmm. maybe like this? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))), \ var2 = (void *)(true); \ var2; \ qemu_lockable_auto_unlock(var), var2 = NULL) probably, it would be simpler for compiler to understand the logic this way. Could you check? Wouldn't that attach __attribute__((cleanup(xxx))) to var2, at which point we could cause the compiler to call xxx((void*)(true)) if the user does an early return inside the lock guard, with disastrous consequences? Or is the __attribute__ applied only to the first out of two declarations in a list? Oh, most probably you are right, seems g_autoptr apply it to both variables. Also, we don't need qemu_lockable_auto_unlock(var) separate call, if we zero-out another variable. So, me fixing: #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (QemuLockable *var __attribute__((cleanup(qemu_lockable_auto_unlock))) = qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))), \ *var2 = (void *)(true); \ var2; \ var2 = NULL) (and we'll need to modify qemu_lockable_auto_unlock() to take "QemuLockable **x" argument) That's almost good enough. I fixed a few things to generate var2. 
I applied a similar approach to WITH_GRAPH_RDLOCK_GUARD macro: --- a/include/block/graph-lock.h +++ b/include/block/graph-lock.h @@ -224,13 +224,22 @@ graph_lockable_auto_unlock(GraphLockable *x) G_DEFINE_AUTOPTR_CLEANUP_FUNC(GraphLockable, graph_lockable_auto_unlock) -#define WITH_GRAPH_RDLOCK_GUARD_(var) \ -for (g_autoptr(GraphLockable) var = graph_lockable_auto_lock(GML_OBJ_()); \ - var; \ - graph_lockable_auto_unlock(var), var = NULL) +static inline void TSA_NO_TSA coroutine_fn +graph_lockable_auto_cleanup(GraphLockable **x) +{ +graph_lockable_auto_unlock(*x); +} + +#define WITH_GRAPH_RDLOCK_GUARD__(var) \ +GraphLockable *var \ +__attribute__((cleanup(graph_lockable_auto_cleanup))) G_GNUC_UNUSED = \ + graph_lockable_auto_lock(GML_OBJ_()) + +#define WITH_GRAPH_RDLOCK_GUARD_(var, var2) \ +for (WITH_GRAPH_RDLOCK_GUARD__(var), *var2 = (void *)true; var2; var2 = NULL) #define WITH_GRAPH_RDLOCK_GUARD() \ -WITH_GRAPH_RDLOCK_GUARD_(glue(graph_lockable_auto, __COUNTER__)) +WITH_GRAPH_RDLOCK_GUARD_(glue(graph_lockable_auto, __COUNTER__), glue(graph_lockable_auto, __COUNTER__)) Unfortunately, it doesn't work in all cases. It seems to have issues with some guards: ../block/stream.c: In function ‘stream_run’: ../block/stream.c:216:12: error: ‘ret’ may be used uninitialized [-Werror=maybe-uninitialized] 216 | if (ret < 0) { So, updated macro helps in some cases, but doesn't help here? Intersting, why. What should we do? change the macros + cherry-pick the missing false-positives, or keep this series as is? I think marco + missing is better. No reason to add dead-initializations in cases where new macros helps. Still, would be good to understand, what's the difference, why it help on some cases and not help in another. -- Best regards, Vladimir
Re: [PATCH v2 2/4] mirror: allow specifying working bitmap
On 07.03.24 16:47, Fiona Ebner wrote: From: John Snow for the mirror job. The bitmap's granularity is used as the job's granularity. The new @bitmap parameter is marked unstable in the QAPI and can currently only be used for @sync=full mode. Clusters initially dirty in the bitmap as well as new writes are copied to the target. Using block-dirty-bitmap-clear and block-dirty-bitmap-merge API, callers can simulate the three kinds of @BitmapSyncMode (which is used by backup): 1. always: default, just pass bitmap as working bitmap. 2. never: copy bitmap and pass copy to the mirror job. 3. on-success: copy bitmap and pass copy to the mirror job and if successful, merge bitmap into original afterwards. When the target image is a fresh "diff image", i.e. one that was not used as the target of a previous mirror and the target image's cluster size is larger than the bitmap's granularity, or when @copy-mode=write-blocking is used, there is a pitfall, because the cluster in the target image will be allocated, but not contain all the data corresponding to the same region in the source image. An idea to avoid the limitation would be to mark clusters which are affected by unaligned writes and are not allocated in the target image dirty, so they would be copied fully later. However, for migration, the invariant that an actively synced mirror stays actively synced (unless an error happens) is useful, because without that invariant, migration might inactivate block devices when mirror still got work to do and run into an assertion failure [0]. Another approach would be to read the missing data from the source upon unaligned writes to be able to write the full target cluster instead. But certain targets like NBD do not allow querying the cluster size. To avoid limiting/breaking the use case of syncing to an existing target, which is arguably more common than the diff image use case, document the limiation in QAPI. 
This patch was originally based on one by Ma Haocong, but it has since been modified pretty heavily, first by John and then again by Fiona. [0]: https://lore.kernel.org/qemu-devel/1db7f571-cb7f-c293-04cc-cd856e060...@proxmox.com/ Suggested-by: Ma Haocong Signed-off-by: Ma Haocong Signed-off-by: John Snow [FG: switch to bdrv_dirty_bitmap_merge_internal] Signed-off-by: Fabian Grünbichler Signed-off-by: Thomas Lamprecht [FE: rebase for 9.0 get rid of bitmap mode parameter use caller-provided bitmap as working bitmap turn bitmap parameter experimental] Signed-off-by: Fiona Ebner --- block/mirror.c | 95 -- blockdev.c | 39 +-- include/block/block_int-global-state.h | 5 +- qapi/block-core.json | 37 +- tests/unit/test-block-iothread.c | 2 +- 5 files changed, 146 insertions(+), 32 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 1609354db3..5c9a00b574 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -51,7 +51,7 @@ typedef struct MirrorBlockJob { BlockDriverState *to_replace; /* Used to block operations on the drive-mirror-replace target */ Error *replace_blocker; -bool is_none_mode; +MirrorSyncMode sync_mode; Could you please split this change to separate preparation patch? BlockMirrorBackingMode backing_mode; /* Whether the target image requires explicit zero-initialization */ bool zero_target; @@ -73,6 +73,11 @@ typedef struct MirrorBlockJob { size_t buf_size; int64_t bdev_length; unsigned long *cow_bitmap; +/* + * Whether the bitmap is created locally or provided by the caller (for + * incremental sync). + */ +bool dirty_bitmap_is_local; BdrvDirtyBitmap *dirty_bitmap; BdrvDirtyBitmapIter *dbi; [..] 
+if (bitmap_name) { +if (sync != MIRROR_SYNC_MODE_FULL) { +error_setg(errp, "Sync mode '%s' not supported with bitmap.", + MirrorSyncMode_str(sync)); +return; +} +if (granularity) { +error_setg(errp, "Granularity and bitmap cannot both be set"); +return; +} + +bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name); +if (!bitmap) { +error_setg(errp, "Dirty bitmap '%s' not found", bitmap_name); +return; +} + +if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) { Why allow read-only bitmaps? +return; +} +} + if (!bdrv_backing_chain_next(bs) && sync == MIRROR_SYNC_MODE_TOP) { sync = MIRROR_SYNC_MODE_FULL; } @@ -2889,10 +2913,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, [..] +# @unstable: Member @bitmap is experimental. +# # Since: 1.3 ## { 'struct': 'DriveMirror', 'data': { '*job-id': 'str', 'device': 'str', 'target': 'str', '*format': 'str', '*node-name':
Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives
On 02.04.24 18:34, Eric Blake wrote: On Tue, Apr 02, 2024 at 12:58:43PM +0300, Vladimir Sementsov-Ogievskiy wrote: Again, same false-positives, because of WITH_GRAPH_RDLOCK_GUARD().. Didn't you try to change WITH_ macros somehow, so that compiler believe in our good intentions? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))); \ var; \ qemu_lockable_auto_unlock(var), var = NULL) I can't think of a clever way to rewrite this. The compiler probably thinks the loop may not run, due to the "var" condition. But how to convince it otherwise? it's hard to introduce another variable too.. hmm. maybe like this? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))), \ var2 = (void *)(true); \ var2; \ qemu_lockable_auto_unlock(var), var2 = NULL) probably, it would be simpler for compiler to understand the logic this way. Could you check? Wouldn't that attach __attribute__((cleanup(xxx))) to var2, at which point we could cause the compiler to call xxx((void*)(true)) if the user does an early return inside the lock guard, with disastrous consequences? Or is the __attribute__ applied only to the first out of two declarations in a list? Oh, most probably you are right, seems g_autoptr apply it to both variables. Also, we don't need qemu_lockable_auto_unlock(var) separate call, if we zero-out another variable. So, me fixing: #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (QemuLockable *var __attribute__((cleanup(qemu_lockable_auto_unlock))) = qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))), \ *var2 = (void *)(true); \ var2; \ var2 = NULL) (and we'll need to modify qemu_lockable_auto_unlock() to take "QemuLockable **x" argument) -- Best regards, Vladimir
Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives
On 02.04.24 12:12, Marc-André Lureau wrote: Hi On Fri, Mar 29, 2024 at 12:35 PM Vladimir Sementsov-Ogievskiy wrote: On 28.03.24 13:20, marcandre.lur...@redhat.com wrote: From: Marc-André Lureau ../block/stream.c:193:19: error: ‘unfiltered_bs’ may be used uninitialized [-Werror=maybe-uninitialized] ../block/stream.c:176:5: error: ‘len’ may be used uninitialized [-Werror=maybe-uninitialized] trace/trace-block.h:906:9: error: ‘ret’ may be used uninitialized [-Werror=maybe-uninitialized] Signed-off-by: Marc-André Lureau Again, same false-positives, because of WITH_GRAPH_RDLOCK_GUARD().. Didn't you try to change WITH_ macros somehow, so that compiler believe in our good intentions? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))); \ var; \ qemu_lockable_auto_unlock(var), var = NULL) I can't think of a clever way to rewrite this. The compiler probably thinks the loop may not run, due to the "var" condition. But how to convince it otherwise? it's hard to introduce another variable too.. hmm. maybe like this? #define WITH_QEMU_LOCK_GUARD_(x, var) \ for (g_autoptr(QemuLockable) var = \ qemu_lockable_auto_lock(QEMU_MAKE_LOCKABLE_NONNULL((x))), \ var2 = (void *)(true); \ var2; \ qemu_lockable_auto_unlock(var), var2 = NULL) probably, it would be simpler for compiler to understand the logic this way. Could you check? (actually, will also need to construct var2 name same way as for var) Actually, "unused variable initialization" is bad thing too. 
Anyway, if no better solution for now: Acked-by: Vladimir Sementsov-Ogievskiy --- block/stream.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/stream.c b/block/stream.c index 7031eef12b..9076203193 100644 --- a/block/stream.c +++ b/block/stream.c @@ -155,8 +155,8 @@ static void stream_clean(Job *job) static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockDriverState *unfiltered_bs; -int64_t len; +BlockDriverState *unfiltered_bs = NULL; +int64_t len = -1; int64_t offset = 0; int error = 0; int64_t n = 0; /* bytes */ @@ -177,7 +177,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) for ( ; offset < len; offset += n) { bool copy; -int ret; +int ret = -1; /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. -- Best regards, Vladimir -- Best regards, Vladimir
Re: [PATCH v2 1/4] qapi/block-core: avoid the re-use of MirrorSyncMode for backup
On 07.03.24 16:47, Fiona Ebner wrote: Backup supports all modes listed in MirrorSyncMode, while mirror does not. Introduce BackupSyncMode by copying the current MirrorSyncMode and drop the variants mirror does not support from MirrorSyncMode as well as the corresponding manual check in mirror_start(). Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Fiona Ebner Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH for-9.1] migration: Add Error** argument to add_bitmaps_to_list()
On 29.03.24 13:56, Cédric Le Goater wrote: This allows reporting more precise errors in the migration handler dirty_bitmap_save_setup(). Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Cédric Le Goater Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
[PATCH v3 2/5] qdev-monitor: fix error message in find_device_state()
This "hotpluggable" here is misleading. Actually, we check whether the object is a device or not. Let's drop the word. Suggested-by: Markus Armbruster Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Markus Armbruster --- system/qdev-monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index c1243891c3..840177d19f 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -891,7 +891,7 @@ static DeviceState *find_device_state(const char *id, Error **errp) dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE); if (!dev) { -error_setg(errp, "%s is not a hotpluggable device", id); +error_setg(errp, "%s is not a device", id); return NULL; } -- 2.34.1
[PATCH v3 0/5] vhost-user-blk: live resize additional APIs
v3: 02: add r-b by Markus 03: improve commit message 04: improve documentation, merge race-fix here (which was v2:05), rebase on master (migration_is_running() now without arguments) 05: improve documentation Vladimir Sementsov-Ogievskiy (5): vhost-user-blk: simplify and fix vhost_user_blk_handle_config_change qdev-monitor: fix error message in find_device_state() qdev-monitor: add option to report GenericError from find_device_state qapi: introduce device-sync-config qapi: introduce CONFIG_READ event hw/block/vhost-user-blk.c | 32 +++--- hw/virtio/virtio-pci.c| 18 ++ include/hw/qdev-core.h| 3 ++ include/monitor/qdev.h| 2 ++ include/sysemu/runstate.h | 1 + monitor/monitor.c | 1 + qapi/qdev.json| 54 ++ stubs/qdev.c | 6 system/qdev-monitor.c | 70 --- system/runstate.c | 5 +++ 10 files changed, 175 insertions(+), 17 deletions(-) -- 2.34.1
[PATCH v3 3/5] qdev-monitor: add option to report GenericError from find_device_state
Here we just prepare for the following patch, making possible to report GenericError as recommended. This patch doesn't aim to prevent further use of DeviceNotFound by future interfaces: - find_device_state() is used in blk_by_qdev_id() and qmp_get_blk() functions, which may lead to spread of DeviceNotFound anyway - also, nothing prevent simply copy-pasting find_device_state() calls with false argument Signed-off-by: Vladimir Sementsov-Ogievskiy --- system/qdev-monitor.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 840177d19f..7e075d91c1 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -878,13 +878,20 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) object_unref(OBJECT(dev)); } -static DeviceState *find_device_state(const char *id, Error **errp) +/* + * Note that creating new APIs using error classes other than GenericError is + * not recommended. Set use_generic_error=true for new interfaces. + */ +static DeviceState *find_device_state(const char *id, bool use_generic_error, + Error **errp) { Object *obj = object_resolve_path_at(qdev_get_peripheral(), id); DeviceState *dev; if (!obj) { -error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, +error_set(errp, + (use_generic_error ? + ERROR_CLASS_GENERIC_ERROR : ERROR_CLASS_DEVICE_NOT_FOUND), "Device '%s' not found", id); return NULL; } @@ -948,7 +955,7 @@ void qdev_unplug(DeviceState *dev, Error **errp) void qmp_device_del(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, errp); +DeviceState *dev = find_device_state(id, false, errp); if (dev != NULL) { if (dev->pending_deleted_event && (dev->pending_deleted_expires_ms == 0 || @@ -1068,7 +1075,7 @@ BlockBackend *blk_by_qdev_id(const char *id, Error **errp) GLOBAL_STATE_CODE(); -dev = find_device_state(id, errp); +dev = find_device_state(id, false, errp); if (dev == NULL) { return NULL; } -- 2.34.1
[PATCH v3 5/5] qapi: introduce CONFIG_READ event
Send a new event when guest reads virtio-pci config after virtio_notify_config() call. That's useful to check that guest fetched modified config, for example after resizing disk backend. Signed-off-by: Vladimir Sementsov-Ogievskiy --- hw/virtio/virtio-pci.c | 9 + include/monitor/qdev.h | 2 ++ monitor/monitor.c | 1 + qapi/qdev.json | 33 + stubs/qdev.c | 6 ++ system/qdev-monitor.c | 6 ++ 6 files changed, 57 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 92afbae71c..c0c158dae2 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -23,6 +23,7 @@ #include "hw/boards.h" #include "hw/virtio/virtio.h" #include "migration/qemu-file-types.h" +#include "monitor/qdev.h" #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" @@ -530,6 +531,10 @@ static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr, } addr -= config; +if (vdev->generation > 0) { +qdev_virtio_config_read_event(DEVICE(proxy)); +} + switch (size) { case 1: val = virtio_config_readb(vdev, addr); @@ -1884,6 +1889,10 @@ static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr, return UINT64_MAX; } +if (vdev->generation > 0) { +qdev_virtio_config_read_event(DEVICE(proxy)); +} + switch (size) { case 1: val = virtio_config_modern_readb(vdev, addr); diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h index 1d57bf6577..fc9a834dca 100644 --- a/include/monitor/qdev.h +++ b/include/monitor/qdev.h @@ -36,4 +36,6 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, */ const char *qdev_set_id(DeviceState *dev, char *id, Error **errp); +void qdev_virtio_config_read_event(DeviceState *dev); + #endif diff --git a/monitor/monitor.c b/monitor/monitor.c index 01ede1babd..5b06146503 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -316,6 +316,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 
1000 * SCALE_MS }, [QAPI_EVENT_HV_BALLOON_STATUS_REPORT] = { 1000 * SCALE_MS }, +[QAPI_EVENT_VIRTIO_CONFIG_READ] = { 300 * SCALE_MS }, }; /* diff --git a/qapi/qdev.json b/qapi/qdev.json index e8be79c3d5..29a4f47360 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -182,3 +182,36 @@ { 'command': 'device-sync-config', 'features': [ 'unstable' ], 'data': {'id': 'str'} } + +## +# @VIRTIO_CONFIG_READ: +# +# Emitted whenever guest reads virtio device configuration after +# configuration change. +# +# The event may be used in pair with device-sync-config. It shows +# that guest has re-read updated configuration. It doesn't +# guarantee that guest successfully handled it and updated the +# view of the device for the user, but still it's a kind of +# success indicator. +# +# @device: device name +# +# @path: device path +# +# Features: +# +# @unstable: The event is experimental. +# +# Since: 9.1 +# +# Example: +# +# <- { "event": "VIRTIO_CONFIG_READ", +# "data": { "device": "virtio-net-pci-0", +#"path": "/machine/peripheral/virtio-net-pci-0" }, +# "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +## +{ 'event': 'VIRTIO_CONFIG_READ', + 'features': [ 'unstable' ], + 'data': { '*device': 'str', 'path': 'str' } } diff --git a/stubs/qdev.c b/stubs/qdev.c index 6869f6f90a..ab6c4afe0b 100644 --- a/stubs/qdev.c +++ b/stubs/qdev.c @@ -26,3 +26,9 @@ void qapi_event_send_device_unplug_guest_error(const char *device, { /* Nothing to do. */ } + +void qapi_event_send_virtio_config_read(const char *device, +const char *path) +{ +/* Nothing to do. 
*/ +} diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index cb35ea0b86..8a2ca77fde 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -26,6 +26,7 @@ #include "sysemu/runstate.h" #include "qapi/error.h" #include "qapi/qapi-commands-qdev.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/qmp/dispatch.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qerror.h" @@ -1206,3 +1207,8 @@ bool qmp_command_available(const QmpCommand *cmd, Error **errp) } return true; } + +void qdev_virtio_config_read_event(DeviceState *dev) +{ +qapi_event_send_virtio_config_read(dev->id, dev->canonical_path); +} -- 2.34.1
[PATCH v3 4/5] qapi: introduce device-sync-config
Add command to sync config from vhost-user backend to the device. It may be helpful when VHOST_USER_SLAVE_CONFIG_CHANGE_MSG failed or not triggered interrupt to the guest or just not available (not supported by vhost-user server). Command result is racy if allow it during migration. Let's allow the sync only in RUNNING state. Signed-off-by: Vladimir Sementsov-Ogievskiy --- hw/block/vhost-user-blk.c | 27 -- hw/virtio/virtio-pci.c| 9 include/hw/qdev-core.h| 3 +++ include/sysemu/runstate.h | 1 + qapi/qdev.json| 21 + system/qdev-monitor.c | 47 +++ system/runstate.c | 5 + 7 files changed, 106 insertions(+), 7 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 9e6bbc6950..2f301f380c 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -88,27 +88,39 @@ static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config) s->blkcfg.wce = blkcfg->wce; } +static int vhost_user_blk_sync_config(DeviceState *dev, Error **errp) +{ +int ret; +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserBlk *s = VHOST_USER_BLK(vdev); + +ret = vhost_dev_get_config(>dev, (uint8_t *)>blkcfg, + vdev->config_len, errp); +if (ret < 0) { +return ret; +} + +memcpy(vdev->config, >blkcfg, vdev->config_len); +virtio_notify_config(vdev); + +return 0; +} + static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) { int ret; -VirtIODevice *vdev = dev->vdev; -VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); Error *local_err = NULL; if (!dev->started) { return 0; } -ret = vhost_dev_get_config(dev, (uint8_t *)>blkcfg, - vdev->config_len, _err); +ret = vhost_user_blk_sync_config(DEVICE(dev->vdev), _err); if (ret < 0) { error_report_err(local_err); return ret; } -memcpy(dev->vdev->config, >blkcfg, vdev->config_len); -virtio_notify_config(dev->vdev); - return 0; } @@ -576,6 +588,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vhost_user_blk_properties); dc->vmsd = _vhost_user_blk; 
+dc->sync_config = vhost_user_blk_sync_config; set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); vdc->realize = vhost_user_blk_device_realize; vdc->unrealize = vhost_user_blk_device_unrealize; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index eaaf86402c..92afbae71c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2501,6 +2501,14 @@ static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) vpciklass->parent_dc_realize(qdev, errp); } +static int virtio_pci_sync_config(DeviceState *dev, Error **errp) +{ +VirtIOPCIProxy *proxy = VIRTIO_PCI(dev); +VirtIODevice *vdev = virtio_bus_get_device(>bus); + +return qdev_sync_config(DEVICE(vdev), errp); +} + static void virtio_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -2517,6 +2525,7 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data) device_class_set_parent_realize(dc, virtio_pci_dc_realize, >parent_dc_realize); rc->phases.hold = virtio_pci_bus_reset_hold; +dc->sync_config = virtio_pci_sync_config; } static const TypeInfo virtio_pci_info = { diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 9228e96c87..87135bdcdf 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -95,6 +95,7 @@ typedef void (*DeviceUnrealize)(DeviceState *dev); typedef void (*DeviceReset)(DeviceState *dev); typedef void (*BusRealize)(BusState *bus, Error **errp); typedef void (*BusUnrealize)(BusState *bus); +typedef int (*DeviceSyncConfig)(DeviceState *dev, Error **errp); /** * struct DeviceClass - The base class for all devices. 
@@ -162,6 +163,7 @@ struct DeviceClass { DeviceReset reset; DeviceRealize realize; DeviceUnrealize unrealize; +DeviceSyncConfig sync_config; /** * @vmsd: device state serialisation description for @@ -546,6 +548,7 @@ bool qdev_hotplug_allowed(DeviceState *dev, Error **errp); */ HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev); void qdev_unplug(DeviceState *dev, Error **errp); +int qdev_sync_config(DeviceState *dev, Error **errp); void qdev_simple_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void qdev_machine_creation_done(void); diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 0117d243c4..296af52322 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h
[PATCH v3 1/5] vhost-user-blk: simplify and fix vhost_user_blk_handle_config_change
Let's not care about what was changed and update the whole config, reasons: 1. config->geometry should be updated together with capacity, so we fix a bug. 2. Vhost-user protocol doesn't say anything about config change limitation. Silent ignore of changes doesn't seem to be correct. 3. vhost-user-vsock reads the whole config 4. on realize we don't do any checks on retrieved config, so no reason to care here Comment "valid for resize only" exists since introduction the whole hw/block/vhost-user-blk.c in commit 00343e4b54ba0685e9ebe928ec5713b0cf7f1d1c "vhost-user-blk: introduce a new vhost-user-blk host device", seems it was just an extra limitation. Also, let's notify guest unconditionally: 1. So does vhost-user-vsock 2. We are going to reuse the functionality in new cases when we do want to notify the guest unconditionally. So, no reason to create extra branches in the logic. Signed-off-by: Vladimir Sementsov-Ogievskiy Acked-by: Raphael Norwitz --- hw/block/vhost-user-blk.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 6a856ad51a..9e6bbc6950 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -91,7 +91,6 @@ static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config) static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) { int ret; -struct virtio_blk_config blkcfg; VirtIODevice *vdev = dev->vdev; VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); Error *local_err = NULL; @@ -100,19 +99,15 @@ static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) return 0; } -ret = vhost_dev_get_config(dev, (uint8_t *), +ret = vhost_dev_get_config(dev, (uint8_t *)>blkcfg, vdev->config_len, _err); if (ret < 0) { error_report_err(local_err); return ret; } -/* valid for resize only */ -if (blkcfg.capacity != s->blkcfg.capacity) { -s->blkcfg.capacity = blkcfg.capacity; -memcpy(dev->vdev->config, >blkcfg, vdev->config_len); 
-virtio_notify_config(dev->vdev); -} +memcpy(dev->vdev->config, >blkcfg, vdev->config_len); +virtio_notify_config(dev->vdev); return 0; } -- 2.34.1
Re: [PATCH for-9.1 v5 07/14] migration: Add Error** argument to .save_setup() handler
On 29.03.24 13:53, Cédric Le Goater wrote: Hello Vladimir, On 3/29/24 10:32, Vladimir Sementsov-Ogievskiy wrote: On 20.03.24 09:49, Cédric Le Goater wrote: diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index 2708abf3d762de774ed294d3fdb8e56690d2974c..542a8c297b329abc30d1b3a205d29340fa59a961 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -1213,12 +1213,14 @@ fail: return ret; } -static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque) +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque, Error **errp) { DBMSaveState *s = &((DBMState *)opaque)->save; SaveBitmapState *dbms = NULL; if (init_dirty_bitmap_migration(s) < 0) { + error_setg(errp, + "Failed to initialize dirty tracking bitmap for blocks"); No, that's not about initializing a bitmap. This all is about migration of block-dirty-bitmaps themselves. So correct would be say "Failed to initialize migration of block dirty bitmaps". with this, for block dirty bitmap migration: Acked-by: Vladimir Sementsov-Ogievskiy I had kept your previous R-b. Oh, I missed that) Should we remove it ? or is it ok if I address your comments below in a followup patch, in which case the error message above would be removed. Yes of course, you can keep my old r-b. Followup patch is appreciated Still, a lot better is add errp to init_dirty_bitmap_migration() and to add_bitmaps_to_list() too: look, init_dirty_bitmap_migration() fails only if add_bitmaps_to_list() fails in turn, add_bitmaps_to_list() have several clear failure points, where it always does error_report (or error_report_err), which would be better to pass-through to the user. Good idea. Will do. Thanks, C. -- Best regards, Vladimir
Re: [PATCH 2/2] backup: add minimum cluster size to performance options
On 08.03.24 18:51, Fiona Ebner wrote: Useful to make discard-source work in the context of backup fleecing when the fleecing image has a larger granularity than the backup target. Backup/block-copy will use at least this granularity for copy operations and in particular, discard requests to the backup source will too. If the granularity is too small, they will just be aligned down in cbw_co_pdiscard_snapshot() and thus effectively ignored. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Fiona Ebner --- block/backup.c| 2 +- block/copy-before-write.c | 2 ++ block/copy-before-write.h | 1 + blockdev.c| 3 +++ qapi/block-core.json | 9 +++-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/block/backup.c b/block/backup.c index 3dd2e229d2..a1292c01ec 100644 --- a/block/backup.c +++ b/block/backup.c @@ -458,7 +458,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, } cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source, - , errp); + perf->min_cluster_size, , errp); if (!cbw) { goto error; } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index f9896c6c1e..55a9272485 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -545,6 +545,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source, BlockDriverState *target, const char *filter_node_name, bool discard_source, + int64_t min_cluster_size, same note, suggest uint32_t BlockCopyState **bcs, Error **errp) { @@ -563,6 +564,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source, } qdict_put_str(opts, "file", bdrv_get_node_name(source)); qdict_put_str(opts, "target", bdrv_get_node_name(target)); +qdict_put_int(opts, "min-cluster-size", min_cluster_size); top = bdrv_insert_node(source, opts, flags, errp); if (!top) { diff --git a/block/copy-before-write.h b/block/copy-before-write.h index 01af0cd3c4..dc6cafe7fa 100644 --- a/block/copy-before-write.h +++ b/block/copy-before-write.h @@ -40,6 +40,7 @@ BlockDriverState 
*bdrv_cbw_append(BlockDriverState *source, BlockDriverState *target, const char *filter_node_name, bool discard_source, + int64_t min_cluster_size, BlockCopyState **bcs, Error **errp); void bdrv_cbw_drop(BlockDriverState *bs); diff --git a/blockdev.c b/blockdev.c index daceb50460..8e6bdbc94a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2653,6 +2653,9 @@ static BlockJob *do_backup_common(BackupCommon *backup, if (backup->x_perf->has_max_chunk) { perf.max_chunk = backup->x_perf->max_chunk; } +if (backup->x_perf->has_min_cluster_size) { +perf.min_cluster_size = backup->x_perf->min_cluster_size; +} } if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) || diff --git a/qapi/block-core.json b/qapi/block-core.json index 85c8f88f6e..ba0836892f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1551,11 +1551,16 @@ # it should not be less than job cluster size which is calculated # as maximum of target image cluster size and 64k. Default 0. # +# @min-cluster-size: Minimum size of blocks used by copy-before-write +# and background copy operations. Has to be a power of 2. No +# effect if smaller than the maximum of the target's cluster size +# and 64 KiB. Default 0. (Since 9.0) 9.1 +# # Since: 6.0 ## { 'struct': 'BackupPerf', - 'data': { '*use-copy-range': 'bool', -'*max-workers': 'int', '*max-chunk': 'int64' } } + 'data': { '*use-copy-range': 'bool', '*max-workers': 'int', +'*max-chunk': 'int64', '*min-cluster-size': 'uint32' } } ## # @BackupCommon: -- Best regards, Vladimir
Re: [PATCH 1/2] copy-before-write: allow specifying minimum cluster size
On 08.03.24 18:51, Fiona Ebner wrote: Useful to make discard-source work in the context of backup fleecing when the fleecing image has a larger granularity than the backup target. Copy-before-write operations will use at least this granularity and in particular, discard requests to the source node will too. If the granularity is too small, they will just be aligned down in cbw_co_pdiscard_snapshot() and thus effectively ignored. The QAPI uses uint32 so the value will be non-negative, but still fit into a uint64_t. Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Fiona Ebner --- block/block-copy.c | 17 + block/copy-before-write.c | 3 ++- include/block/block-copy.h | 1 + qapi/block-core.json | 8 +++- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/block/block-copy.c b/block/block-copy.c index 7e3b378528..adb1cbb440 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range, } static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, + int64_t min_cluster_size, Maybe better use uint32_t here as well. Error **errp) { int ret; @@ -335,7 +336,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, "used. If the actual block size of the target exceeds " "this default, the backup may be unusable", BLOCK_COPY_CLUSTER_SIZE_DEFAULT); -return BLOCK_COPY_CLUSTER_SIZE_DEFAULT; +return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT); } else if (ret < 0 && !target_does_cow) { error_setg_errno(errp, -ret, "Couldn't determine the cluster size of the target image, " @@ -345,16 +346,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, return ret; } else if (ret < 0 && target_does_cow) { /* Not fatal; just trudge on ahead. 
*/ -return BLOCK_COPY_CLUSTER_SIZE_DEFAULT; +return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT); } -return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); +return MAX(min_cluster_size, + MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size)); } BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, bool discard_source, + int64_t min_cluster_size, and here why not uint32_t Error **errp) { ERRP_GUARD(); @@ -365,7 +368,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, GLOBAL_STATE_CODE(); -cluster_size = block_copy_calculate_cluster_size(target->bs, errp); +if (min_cluster_size && !is_power_of_2(min_cluster_size)) { +error_setg(errp, "min-cluster-size needs to be a power of 2"); +return NULL; +} + +cluster_size = block_copy_calculate_cluster_size(target->bs, + min_cluster_size, errp); if (cluster_size < 0) { return NULL; } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index dac57481c5..f9896c6c1e 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -476,7 +476,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE; s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, - flags & BDRV_O_CBW_DISCARD_SOURCE, errp); + flags & BDRV_O_CBW_DISCARD_SOURCE, + opts->min_cluster_size, errp); I assume it is guaranteed to be 0 when not specified by user. 
if (!s->bcs) { error_prepend(errp, "Cannot create block-copy-state: "); return -EINVAL; diff --git a/include/block/block-copy.h b/include/block/block-copy.h index bdc703bacd..77857c6c68 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, bool discard_source, + int64_t min_cluster_size, Error **errp); /* Function shoul
Re: [PATCH for-9.1 v5 07/14] migration: Add Error** argument to .save_setup() handler
On 20.03.24 09:49, Cédric Le Goater wrote: diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c index 2708abf3d762de774ed294d3fdb8e56690d2974c..542a8c297b329abc30d1b3a205d29340fa59a961 100644 --- a/migration/block-dirty-bitmap.c +++ b/migration/block-dirty-bitmap.c @@ -1213,12 +1213,14 @@ fail: return ret; } -static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque) +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque, Error **errp) { DBMSaveState *s = &((DBMState *)opaque)->save; SaveBitmapState *dbms = NULL; if (init_dirty_bitmap_migration(s) < 0) { +error_setg(errp, + "Failed to initialize dirty tracking bitmap for blocks"); No, that's not about initializing a bitmap. This all is about migration of block-dirty-bitmaps themselves. So correct would be say "Failed to initialize migration of block dirty bitmaps". with this, for block dirty bitmap migration: Acked-by: Vladimir Sementsov-Ogievskiy Still, a lot better is add errp to init_dirty_bitmap_migration() and to add_bitmaps_to_list() too: look, init_dirty_bitmap_migration() fails only if add_bitmaps_to_list() fails in turn, add_bitmaps_to_list() have several clear failure points, where it always does error_report (or error_report_err), which would be better to pass-through to the user. return -1; } diff --git a/migrat -- Best regards, Vladimir
Re: [PATCH v4] blockcommit: Reopen base image as RO after abort
On 28.03.24 12:16, Alexander Ivanov wrote: If a blockcommit is aborted the base image remains in RW mode, that leads to a fail of subsequent live migration. How to reproduce: $ virsh snapshot-create-as vm snp1 --disk-only *** write something to the disk inside the guest *** $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort $ lsof /vzt/vm.qcow2 COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME qemu-syst 433203 root 45u REG 253,0 1724776448 133 /vzt/vm.qcow2 $ cat /proc/433203/fdinfo/45 pos:0 flags: 02140002 < The last 2 means RW mode If the base image is in RW mode at the end of blockcommit and was in RO mode before blockcommit, reopen the base BDS in RO. Signed-off-by: Alexander Ivanov --- block/mirror.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 1bdce3b657..d23be57255 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -93,6 +93,7 @@ typedef struct MirrorBlockJob { int64_t active_write_bytes_in_flight; bool prepared; bool in_drain; +bool base_ro; } MirrorBlockJob; typedef struct MirrorBDSOpaque { @@ -797,6 +798,10 @@ static int mirror_exit_common(Job *job) bdrv_drained_end(target_bs); bdrv_unref(target_bs); +if (abort && s->base_ro && !bdrv_is_read_only(target_bs)) { +bdrv_reopen_set_read_only(target_bs, true, NULL); +} + All looks good to me except this: seems it is safer to place this "if" block before "bdrv_drained_end(); bdrv_unref();" above. 
With it moved: Reviewed-by: Vladimir Sementsov-Ogievskiy bs_opaque->job = NULL; bdrv_drained_end(src); @@ -1717,6 +1722,7 @@ static BlockJob *mirror_start_job( bool is_none_mode, BlockDriverState *base, bool auto_complete, const char *filter_node_name, bool is_mirror, MirrorCopyMode copy_mode, + bool base_ro, Error **errp) { MirrorBlockJob *s; @@ -1800,6 +1806,7 @@ static BlockJob *mirror_start_job( bdrv_unref(mirror_top_bs); s->mirror_top_bs = mirror_top_bs; +s->base_ro = base_ro; /* No resize for the target either; while the mirror is still running, a * consistent read isn't necessarily possible. We could possibly allow @@ -2029,7 +2036,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, speed, granularity, buf_size, backing_mode, zero_target, on_source_error, on_target_error, unmap, NULL, NULL, _job_driver, is_none_mode, base, false, - filter_node_name, true, copy_mode, errp); + filter_node_name, true, copy_mode, false, errp); } BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, @@ -2058,7 +2065,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, on_error, on_error, true, cb, opaque, _active_job_driver, false, base, auto_complete, filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, - errp); + base_read_only, errp); if (!job) { goto error_restore_flags; } -- Best regards, Vladimir
Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives
On 28.03.24 13:20, marcandre.lur...@redhat.com wrote: From: Marc-André Lureau ../block/stream.c:193:19: error: ‘unfiltered_bs’ may be used uninitialized [-Werror=maybe-uninitialized] ../block/stream.c:176:5: error: ‘len’ may be used uninitialized [-Werror=maybe-uninitialized] trace/trace-block.h:906:9: error: ‘ret’ may be used uninitialized [-Werror=maybe-uninitialized] Signed-off-by: Marc-André Lureau Again, same false-positives, because of WITH_GRAPH_RDLOCK_GUARD().. Didn't you try to change WITH_ macros somehow, so that compiler believe in our good intentions? Actually, "unused variable initialization" is bad thing too. Anyway, if no better solution for now: Acked-by: Vladimir Sementsov-Ogievskiy --- block/stream.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/stream.c b/block/stream.c index 7031eef12b..9076203193 100644 --- a/block/stream.c +++ b/block/stream.c @@ -155,8 +155,8 @@ static void stream_clean(Job *job) static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockDriverState *unfiltered_bs; -int64_t len; +BlockDriverState *unfiltered_bs = NULL; +int64_t len = -1; int64_t offset = 0; int error = 0; int64_t n = 0; /* bytes */ @@ -177,7 +177,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) for ( ; offset < len; offset += n) { bool copy; -int ret; +int ret = -1; /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. -- Best regards, Vladimir
Re: [PATCH 05/19] block/mirror: fix -Werror=maybe-uninitialized false-positive
On 28.03.24 13:20, marcandre.lur...@redhat.com wrote: From: Marc-André Lureau ../block/mirror.c:1066:22: error: ‘iostatus’ may be used uninitialized [-Werror=maybe-uninitialized] Actually that's a false-positive.. Compiler can't believe that body of WITH_JOB_LOCK_GUARD() will be executed unconditionally. Probably we should mention this in a comment. Signed-off-by: Marc-André Lureau Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index 1bdce3b657..53dd7332ee 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -926,7 +926,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) MirrorBDSOpaque *mirror_top_opaque = s->mirror_top_bs->opaque; BlockDriverState *target_bs = blk_bs(s->target); bool need_drain = true; -BlockDeviceIoStatus iostatus; +BlockDeviceIoStatus iostatus = BLOCK_DEVICE_IO_STATUS__MAX; int64_t length; int64_t target_length; BlockDriverInfo bdi; -- Best regards, Vladimir
Re: [RFC 04/15] qapi: block-job-change: make copy-mode parameter optional
On 28.03.24 12:24, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: We are going to add more parameters to change. We want to make possible to change only one or any subset of available options. So all the options should be optional. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 5 + qapi/block-core.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index a177502e4f..2d0cd22c06 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1265,6 +1265,11 @@ static void mirror_change(BlockJob *job, JobChangeOptions *opts, GLOBAL_STATE_CODE(); +if (!change_opts->has_copy_mode) { +/* Nothing to do */ I doubt the comment is useful. +return; +} + if (qatomic_read(>copy_mode) == change_opts->copy_mode) { return; } if (change_opts->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING) { error_setg(errp, "Change to copy mode '%s' is not implemented", MirrorCopyMode_str(change_opts->copy_mode)); return; } current = qatomic_cmpxchg(>copy_mode, MIRROR_COPY_MODE_BACKGROUND, change_opts->copy_mode); if (current != MIRROR_COPY_MODE_BACKGROUND) { error_setg(errp, "Expected current copy mode '%s', got '%s'", MirrorCopyMode_str(MIRROR_COPY_MODE_BACKGROUND), MirrorCopyMode_str(current)); } Now I'm curious: what could be changing @copy_mode behind our backs here? For now - nothing. But it may be read from another thread, so it's declared as atomic access. So, we can check it with qatomic_read() instead, check the value first, and write then with qatomic_set(). But, I think it would be extra optimization (if it is). The operation is not often, and cmpxchg looks like a correct way to conditionally change atomic variable (even being a bit too safe). diff --git a/qapi/block-core.json b/qapi/block-core.json index 67dd0ef038..6041e7bd8f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3071,7 +3071,7 @@ ## # @BlockJobChangeOptionsMirror: # # @copy-mode: Switch to this copy mode. 
Currently, only the switch # from 'background' to 'write-blocking' is implemented. # # Since: 8.2 ## { 'struct': 'JobChangeOptionsMirror', - 'data': { 'copy-mode' : 'MirrorCopyMode' } } + 'data': { '*copy-mode' : 'MirrorCopyMode' } } ## # @JobChangeOptions: A member becoming optional is backward compatible. Okay. We may want to document "(optional since 9.1)". We haven't done so in the past. Will do, I think it's useful. The doc comment needs to spell out how absent members are handled. Will add. -- Best regards, Vladimir
Re: [RFC 01/15] scripts/qapi: support type-based unions
On 28.03.24 12:15, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Look at block-job-change command: we have to specify both 'id' to chose the job to operate on and 'type' for QAPI union be parsed. But for user this looks redundant: when we specify 'id', QEMU should be able to get corresponding job's type. This commit brings such a possibility: just specify some Enum type as 'discriminator', and define a function somewhere with prototype bool YourUnionType_mapper(YourUnionType *, EnumType *out, Error **errp) Further commits will use this functionality to upgrade block-job-change interface and introduce some new interfaces. Signed-off-by: Vladimir Sementsov-Ogievskiy --- scripts/qapi/introspect.py | 5 +++- scripts/qapi/schema.py | 50 +++--- scripts/qapi/types.py | 3 ++- scripts/qapi/visit.py | 43 ++-- 4 files changed, 73 insertions(+), 28 deletions(-) I believe you need to update docs/devel/qapi-code-gen.rst. Current text: Union types --- Syntax:: UNION = { 'union': STRING, 'base': ( MEMBERS | STRING ), 'discriminator': STRING, 'data': BRANCHES, '*if': COND, '*features': FEATURES } BRANCHES = { BRANCH, ... } BRANCH = STRING : TYPE-REF | STRING : { 'type': TYPE-REF, '*if': COND } Member 'union' names the union type. The 'base' member defines the common members. If it is a MEMBERS_ object, it defines common members just like a struct type's 'data' member defines struct type members. If it is a STRING, it names a struct type whose members are the common members. Member 'discriminator' must name a non-optional enum-typed member of the base struct. That member's value selects a branch by its name. If no such branch exists, an empty branch is assumed. If I understand your commit message correctly, this paragraph is no longer true. Right. Like this: Member 'discriminator' must name either a non-optional enum-typed member, or an enum type name. (and more description follow, about user defined function and so on). 
Do you think that mixing member name and type name here is OK? Or should I instead add another field 'discriminator-type', so that exactly one of 'discriminator' and 'discriminator-type' should be in union definition? Each BRANCH of the 'data' object defines a branch of the union. A union must have at least one branch. The BRANCH's STRING name is the branch name. It must be a value of the discriminator enum type. The BRANCH's value defines the branch's properties, in particular its type. The type must a struct type. The form TYPE-REF_ is shorthand for :code:`{ 'type': TYPE-REF }`. In the Client JSON Protocol, a union is represented by an object with the common members (from the base type) and the selected branch's members. The two sets of member names must be disjoint. Example:: { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] } { 'union': 'BlockdevOptions', 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' }, 'discriminator': 'driver', 'data': { 'file': 'BlockdevOptionsFile', 'qcow2': 'BlockdevOptionsQcow2' } } Resulting in these JSON objects:: { "driver": "file", "read-only": true, "filename": "/some/place/my-image" } { "driver": "qcow2", "read-only": false, "backing": "/some/place/my-image", "lazy-refcounts": true } The order of branches need not match the order of the enum values. The branches need not cover all possible enum values. In the resulting generated C data types, a union is represented as a struct with the base members in QAPI schema order, and then a union of structures for each branch of the struct. The optional 'if' member specifies a conditional. See `Configuring the schema`_ below for more on this. The optional 'features' member specifies features. See Features_ below for more on this. -- Best regards, Vladimir
Re: [RFC 01/15] scripts/qapi: support type-based unions
On 28.03.24 12:40, Markus Armbruster wrote: Subject: all unions are type-based. Perhaps "support implicit union tags on the wire"? Yes, sounds good. Do you need this schema language feature for folding block jobs into the jobs abstraction, or is it just for making the wire protocol nicer in places? It's not necessary, we can proceed with job-* API, specifying both type and id. But I think, as we are not in a hurry, better to make new job-* API more effective from the beginning. -- Best regards, Vladimir
Re: [PATCH] qapi/block-core: improve Qcow2OverlapCheckFlags documentation
On 25.03.24 16:04, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Most of fields have no description at all. Let's fix that. Still, no reason to place here more detailed descriptions of what these structures are, as we have public Qcow2 format specification. Signed-off-by: Vladimir Sementsov-Ogievskiy --- qapi/block-core.json | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1874f880a8..b9fb994a66 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3403,14 +3403,31 @@ # @Qcow2OverlapCheckFlags: # # Structure of flags for each metadata structure. Setting a field to -# 'true' makes qemu guard that structure against unintended -# overwriting. The default value is chosen according to the template -# given. +# 'true' makes qemu guard that Qcow2 format structure against Mind if I use the occasion to correct the spelling of QEMU? No problem, thanks for fixing -- Best regards, Vladimir
[PATCH] qapi/block-core: improve Qcow2OverlapCheckFlags documentation
Most of fields have no description at all. Let's fix that. Still, no reason to place here more detailed descriptions of what these structures are, as we have public Qcow2 format specification. Signed-off-by: Vladimir Sementsov-Ogievskiy --- qapi/block-core.json | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1874f880a8..b9fb994a66 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3403,14 +3403,31 @@ # @Qcow2OverlapCheckFlags: # # Structure of flags for each metadata structure. Setting a field to -# 'true' makes qemu guard that structure against unintended -# overwriting. The default value is chosen according to the template -# given. +# 'true' makes qemu guard that Qcow2 format structure against +# unintended overwriting. See Qcow2 format specification for detailed +# information on these structures. The default value is chosen +# according to the template given. # # @template: Specifies a template mode which can be adjusted using the # other flags, defaults to 'cached' # -# @bitmap-directory: since 3.0 +# @main-header: Qcow2 format header +# +# @active-l1: Qcow2 active L1 table +# +# @active-l2: Qcow2 active L2 table +# +# @refcount-table: Qcow2 refcount table +# +# @refcount-block: Qcow2 refcount blocks +# +# @snapshot-table: Qcow2 snapshot table +# +# @inactive-l1: Qcow2 inactive L1 tables +# +# @inactive-l2: Qcow2 inactive L2 tables +# +# @bitmap-directory: Qcow2 bitmap directory (since 3.0) # # Since: 2.9 ## -- 2.34.1
Re: [PATCH v3] blockcommit: Reopen base image as RO after abort
On 15.03.24 12:55, Alexander Ivanov wrote: On 2/28/24 17:48, Vladimir Sementsov-Ogievskiy wrote: On 09.02.24 15:29, Alexander Ivanov wrote: Could you please review the patch? Sorry for long delay. Honestly, I don't like refcnt in block-driver. It violates encapsulation, refcnt is an internal thing of common block layer. And actually, you can't make any assumptions from value of refcnt, as you don't know which additional parents were created and why, and when they are going to unref their children. Hmmm... Maybe I can just exclude refcnt check from the condition, can't I? If BDS will be removed it doesn't matter if we make it RO. What do you think? Sounds good. I even don't see, why you need bdrv_chain_has_significant_parent() check. We just roll-back ro->rw transition on failure case, isn't just always correct thing to do? What was wrong with v2? My bad, it seems, I didn't send v2 before I decided to change the patch. Hmm, somehow, I don't have it in my mailbox, but here it is: https://patchew.org/QEMU/20240109093128.157460-1-alexander.iva...@virtuozzo.com/ === More: in commit message you say about failure case. And it seems OK to roll-back ro->rw transition on failure, if we did it. But mirror_exit_common() called on success path too. I think, on success path, we should do any additional reopenings? -- Best regards, Vladimir
Re: [PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state
On 15.03.24 15:51, Markus Armbruster wrote: Sorry for the late answer. Vladimir Sementsov-Ogievskiy writes: On 07.03.24 12:46, Markus Armbruster wrote: [...] I appreciate the attempt to curb the spread of DeviceNotFound errors. Two issues: * Copy-pasting find_device_state() with a false argument is an easy error to make. * Most uses of find_device_state() are via blk_by_qdev_id() and qmp_get_blk(). Any new uses of qemu_get_blk() will still produce DeviceNotFound. Hmm. Hmm. Right. Wait a bit, I can make the change stricter. Could you still explain (or give a link), why and when we decided to use only GenericError? I think, having different "error-codes" is a good thing, why we are trying to get rid of it? We actually got rid of most of them years ago :) But you deserve a more complete answer. QMP initially specified the following error response[1]: 2.4.2 error --- The error response is issued when the command execution could not be completed because of an error condition. The format is: { "error": { "class": json-string, "data": json-value }, "id": json-value } Where, - The "class" member contains the error class name (eg. "ServiceUnavailable") - The "data" member contains specific error data and is defined in a per-command basis, it will be an empty json-object if the error has no data - The "id" member contains the transaction identification associated with the command execution (if issued by the Client) Note the structure of @data depends on @class. We documented a command's possible error classes (well, we tried), but never bothered to document the @data it comes with. Documentation deficits aside, this is looks quite expressive. There are issues, though: 1. Formatting errors in human-readable form is bothersome, and creates a tight coupling between QMP server and client. Consider: {"class": "DeviceNotFound", "data": {"device": "ide1-cd0"}} To format this in human-readable form, you need to know the error. The server does. 
Fine print: it has a table mapping JSON templates to human-readable error message templates. The client needs to duplicate this somehow. If it receives an error it doesn't know, all it can do is barf (possibly dressed up) JSON at the human. To avoid that, clients need to track the server closely: tight coupling. 2. Errors have notational overhead, which leads to bad errors. To create a new error, you have to edit two source files (not counting clients). Strong incentive to reuse existing errors. Even when they don't quite fit. When a name provided by the user couldn't be resolved, reusing DeviceNotFound is easier than creating a new error that is more precise. 3. The human-readable error message is hidden from the programmer's view, which leads to bad error messages. At the point in the source where the error is created, we see something like QERR_DEVICE_NOT_FOUND, name. To see the human-readable message, we have to look up macro QERR_DEVICE_NOT_FOUND's error message template in the table, or actually test (*gasp*) the error. People generally do neither, and bad error messages proliferate. 4. Too little gain for the pain. Clients rarely want to handle different errors differently. More often than not, all they do with @class and @data is log them. Only occasionally do they switch on @class, and basically never on @data. It took me a considerable fight to get the protocol amended to include a human-readable message[2]. This addressed issue 1. Over the next two years or so, issues 2. to 4. slowly sank in. We eventually tired of the complexity, ripped out @data, and dumbed down all error classes to GenericError, except for the few clients actually cared for[3]. We also mandated that new errors avoid the QERR_ macros. Eliminating the existing QERR_ macros has been slow. We're down to 13 in master, with patches deleting 7 on the list. This has served us reasonably well. Questions? 
[1] Commit f544d174dfc QMP: Introduce specification Dec 2009 [2] Commit 77e595e7c61q QMP: add human-readable description to error response Dec 2009 [3] Commit de253f14912 qmp: switch to the new error format on the wire Aug 2012 Thanks for full-picture story! Now I'm all for it. -- Best regards, Vladimir
Re: [PATCH v4 4/5] qapi: blockdev-backup: add discard-source parameter
On 13.03.24 19:08, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Still we can't take it unconditionally, as it will break normal backup from RO source. So, we have to add a parameter and pass it thorough bdrv_open flags. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fiona Ebner Tested-by: Fiona Ebner [...] diff --git a/qapi/block-core.json b/qapi/block-core.json index 1874f880a8..2ef52ae9a7 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1610,6 +1610,9 @@ # node specified by @drive. If this option is not given, a node # name is autogenerated. (Since: 4.2) # +# @discard-source: Discard blocks on source which are already copied "have been copied"? Oh, right +# to the target. (Since 9.1) +# # @x-perf: Performance options. (Since 6.0) # # Features: @@ -1631,6 +1634,7 @@ '*on-target-error': 'BlockdevOnError', '*auto-finalize': 'bool', '*auto-dismiss': 'bool', '*filter-node-name': 'str', +'*discard-source': 'bool', '*x-perf': { 'type': 'BackupPerf', 'features': [ 'unstable' ] } } } QAPI schema Acked-by: Markus Armbruster Thanks! -- Best regards, Vladimir
[PATCH v4 0/5] backup: discard-source parameter
Hi all! The main patch is 04, please look at it for description and diagram. v4: add t-b by Fiona add r-b by Fiona to 02-05 (patch 01 still lack an r-b) 05: fix copyrights and subject in the test 04: since 9.0 --> since 9.1 (we missed a soft freeze for 9.0) Vladimir Sementsov-Ogievskiy (5): block/copy-before-write: fix permission block/copy-before-write: support unligned snapshot-discard block/copy-before-write: create block_copy bitmap in filter node qapi: blockdev-backup: add discard-source parameter iotests: add backup-discard-source block/backup.c| 5 +- block/block-copy.c| 12 +- block/copy-before-write.c | 39 - block/copy-before-write.h | 1 + block/replication.c | 4 +- blockdev.c| 2 +- include/block/block-common.h | 2 + include/block/block-copy.h| 2 + include/block/block_int-global-state.h| 2 +- qapi/block-core.json | 4 + tests/qemu-iotests/257.out| 112 ++--- .../qemu-iotests/tests/backup-discard-source | 152 ++ .../tests/backup-discard-source.out | 5 + 13 files changed, 272 insertions(+), 70 deletions(-) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out -- 2.34.1
[PATCH v4 4/5] qapi: blockdev-backup: add discard-source parameter
Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Still we can't take it unconditionally, as it will break normal backup from RO source. So, we have to add a parameter and pass it thorough bdrv_open flags. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fiona Ebner Tested-by: Fiona Ebner --- block/backup.c | 5 +++-- block/block-copy.c | 9 + block/copy-before-write.c | 15 +-- block/copy-before-write.h | 1 + block/replication.c| 4 ++-- blockdev.c | 2 +- include/block/block-common.h | 2 ++ include/block/block-copy.h | 1 + include/block/block_int-global-state.h | 2 +- qapi/block-core.json | 4 10 files changed, 37 insertions(+), 8 deletions(-) diff --git a/block/backup.c b/block/backup.c index ec29d6b810..3dd2e229d2 100644 --- a/block/backup.c +++ b/block/backup.c @@ -356,7 +356,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap, BitmapSyncMode bitmap_mode, - bool compress, + bool compress, bool discard_source, const char *filter_node_name, BackupPerf *perf, BlockdevOnError on_source_error, @@ -457,7 +457,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, goto error; } -cbw = bdrv_cbw_append(bs, target, filter_node_name, , errp); +cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source, + , errp); if (!cbw) { goto error; } diff --git 
a/block/block-copy.c b/block/block-copy.c index 8fca2c3698..7e3b378528 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -137,6 +137,7 @@ typedef struct BlockCopyState { CoMutex lock; int64_t in_flight_bytes; BlockCopyMethod method; +bool discard_source; BlockReqList reqs; QLIST_HEAD(, BlockCopyCallState) calls; /* @@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, + bool discard_source, Error **errp) { ERRP_GUARD(); @@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, cluster_size), }; +s->discard_source = discard_source; block_copy_set_copy_opts(s, false, false); ratelimit_init(>rate_limit); @@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task) co_put_to_shres(s->mem, t->req.bytes); block_copy_task_end(t, ret); +if (s->discard_source && ret == 0) { +int64_t nbytes = +MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset; +bdrv_co_pdiscard(s->source, t->req.offset, nbytes); +} + return ret; } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index ed2c228da7..cd65524e26 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState { BdrvChild *target; OnCbwError on_cbw_error; uint32_t cbw_timeout_ns; +bool discard_source; /* * @lock: protects access to @access_bitmap, @done_bitmap and @@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVCopyBeforeWriteState *s = bs->opaque; + if (!(role & BDRV_CHILD_FILTERED)) { /* * Target child @@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, * start */ *nperm = *nperm | BLK_PERM_CONSISTENT_READ; +
[PATCH v4 3/5] block/copy-before-write: create block_copy bitmap in filter node
Currently block_copy creates copy_bitmap in source node. But that is in bad relation with .independent_close=true of copy-before-write filter: source node may be detached and removed before .bdrv_close() handler called, which should call block_copy_state_free(), which in turn should remove copy_bitmap. That's all not ideal: it would be better if internal bitmap of block-copy object is not attached to any node. But that is not possible now. The simplest solution is just create copy_bitmap in filter node, where anyway two other bitmaps are created. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fiona Ebner Tested-by: Fiona Ebner --- block/block-copy.c | 3 +- block/copy-before-write.c | 2 +- include/block/block-copy.h | 1 + tests/qemu-iotests/257.out | 112 ++--- 4 files changed, 60 insertions(+), 58 deletions(-) diff --git a/block/block-copy.c b/block/block-copy.c index 9ee3dd7ef5..8fca2c3698 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, } BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp) { @@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, return NULL; } -copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL, +copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL, errp); if (!copy_bitmap) { return NULL; diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 6d89af0b29..ed2c228da7 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -468,7 +468,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); -s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); +s->bcs = block_copy_state_new(bs->file, s->target, 
bs, bitmap, errp); if (!s->bcs) { error_prepend(errp, "Cannot create block-copy-state: "); return -EINVAL; diff --git a/include/block/block-copy.h b/include/block/block-copy.h index 0700953ab8..8b41643bfa 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState; typedef struct BlockCopyCallState BlockCopyCallState; BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp); diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out index aa76131ca9..c33dd7f3a9 100644 --- a/tests/qemu-iotests/257.out +++ b/tests/qemu-iotests/257.out @@ -120,16 +120,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -596,16 +596,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -865,16 +865,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -1341,16 +1341,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": fals
[PATCH v4 5/5] iotests: add backup-discard-source
Add test for a new backup option: discard-source. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fiona Ebner Tested-by: Fiona Ebner --- .../qemu-iotests/tests/backup-discard-source | 152 ++ .../tests/backup-discard-source.out | 5 + 2 files changed, 157 insertions(+) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out diff --git a/tests/qemu-iotests/tests/backup-discard-source b/tests/qemu-iotests/tests/backup-discard-source new file mode 100755 index 00..2391b12acd --- /dev/null +++ b/tests/qemu-iotests/tests/backup-discard-source @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# +# Test backup discard-source parameter +# +# Copyright (c) Virtuozzo International GmbH. +# Copyright (c) Yandex +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# + +import os + +import iotests +from iotests import qemu_img_create, qemu_img_map, qemu_io + + +temp_img = os.path.join(iotests.test_dir, 'temp') +source_img = os.path.join(iotests.test_dir, 'source') +target_img = os.path.join(iotests.test_dir, 'target') +size = '1M' + + +def get_actual_size(vm, node_name): +nodes = vm.cmd('query-named-block-nodes', flat=True) +node = next(n for n in nodes if n['node-name'] == node_name) +return node['image']['actual-size'] + + +class TestBackup(iotests.QMPTestCase): +def setUp(self): +qemu_img_create('-f', iotests.imgfmt, source_img, size) +qemu_img_create('-f', iotests.imgfmt, temp_img, size) +qemu_img_create('-f', iotests.imgfmt, target_img, size) +qemu_io('-c', 'write 0 1M', source_img) + +self.vm = iotests.VM() +self.vm.launch() + +self.vm.cmd('blockdev-add', { +'node-name': 'cbw', +'driver': 'copy-before-write', +'file': { +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': source_img, +} +}, +'target': { +'driver': iotests.imgfmt, +'discard': 'unmap', +'node-name': 'temp', +'file': { +'driver': 'file', +'filename': temp_img +} +} +}) + +self.vm.cmd('blockdev-add', { +'node-name': 'access', +'discard': 'unmap', +'driver': 'snapshot-access', +'file': 'cbw' +}) + +self.vm.cmd('blockdev-add', { +'driver': iotests.imgfmt, +'node-name': 'target', +'file': { +'driver': 'file', +'filename': target_img +} +}) + +self.assertLess(get_actual_size(self.vm, 'temp'), 512 * 1024) + +def tearDown(self): +# That should fail, because region is discarded +self.vm.hmp_qemu_io('access', 'read 0 1M') + +self.vm.shutdown() + +self.assertTrue('read failed: Permission denied' in self.vm.get_log()) + +# Final check that temp image is empty +mapping = qemu_img_map(temp_img) +self.assertEqual(len(mapping), 1) +self.assertEqual(mapping[0]['start'], 0) +self.assertEqual(mapping[0]['length'], 1024 * 1024) +self.assertEqual(mapping[0]['data'], False) + +os.remove(temp_img) +os.remove(source_img) +os.remove(target_img) + +def 
do_backup(self): +self.vm.cmd('blockdev-backup', device='access', +sync='full', target='target', +job_id='backup0', +discard_source=True) + +self.vm.event_wait(name='BLOCK_JOB_COMPLETED') + +def test_discard_written(self): +""" +1. Guest writes +2. copy-before-write operation, data is stored to temp +3. start backup(discard_source=True), check that data is + removed from temp +""" +# Trigger copy-before-write operation +result = self.vm.hmp_qemu_io('cbw', 'write 0 1M') +self.assert_qmp(result, 'return', '') + +# Check that data is written to temporary image +self.assertGreater(get_actual_size(self.vm, 'temp'), 1024 * 1024) + +self.do_backu
[PATCH v4 1/5] block/copy-before-write: fix permission
In case when source node does not have any parents, the condition still works as required: backup job do create the parent by block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child Still, in this case checking @perm variable doesn't work, as backup job creates the root blk with empty permissions (as it rely on CBW filter to require correct permissions and don't want to create extra conflicts). So, we should not check @perm. The hack may be dropped entirely when transactional insertion of filter (when we don't try to recalculate permissions in intermediate state, when filter does conflict with original parent of the source node) merged (old big series "[PATCH v5 00/45] Transactional block-graph modifying API"[1] and it's current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2]) [1] https://patchew.org/QEMU/20220330212902.590099-1-vsement...@openvz.org/ [2] https://patchew.org/QEMU/2023101718.932733-1-vsement...@yandex-team.ru/ Signed-off-by: Vladimir Sementsov-Ogievskiy Tested-by: Fiona Ebner --- block/copy-before-write.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 8aba27a71d..3e3af30c08 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, perm, shared, nperm, nshared); if (!QLIST_EMPTY(>parents)) { -if (perm & BLK_PERM_WRITE) { -*nperm = *nperm | BLK_PERM_CONSISTENT_READ; -} +/* + * Note, that source child may be shared with backup job. Backup job + * does create own blk parent on copy-before-write node, so this + * works even if source node does not have any parents before backup + * start + */ +*nperm = *nperm | BLK_PERM_CONSISTENT_READ; *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); } } -- 2.34.1
[PATCH v4 2/5] block/copy-before-write: support unaligned snapshot-discard
First thing that crashes on unaligned access here is bdrv_reset_dirty_bitmap(). Correct way is to align-down the snapshot-discard request. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fiona Ebner Tested-by: Fiona Ebner --- block/copy-before-write.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 3e3af30c08..6d89af0b29 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes) { BDRVCopyBeforeWriteState *s = bs->opaque; +uint32_t cluster_size = block_copy_cluster_size(s->bcs); +int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size); +int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size); +int64_t aligned_bytes; + +if (aligned_end <= aligned_offset) { +return 0; +} +aligned_bytes = aligned_end - aligned_offset; WITH_QEMU_LOCK_GUARD(&s->lock) { -bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); +bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset, +aligned_bytes); } -block_copy_reset(s->bcs, offset, bytes); +block_copy_reset(s->bcs, aligned_offset, aligned_bytes); -return bdrv_co_pdiscard(s->target, offset, bytes); +return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes); } static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs) -- 2.34.1
[RFC 10/15] qapi: query-jobs: add information specific for job type
Duplicate the feature from query-block-jobs. It's a step to finally deprecate query-block-jobs command and move to query-jobs. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 14 -- include/qemu/job.h | 5 + job-qmp.c | 6 ++ qapi/job.json | 22 +++--- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index e95c54fbc6..96dcbbc3e8 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1294,7 +1294,7 @@ static bool mirror_change(Job *job, JobChangeOptions *opts, Error **errp) return true; } -static void mirror_query(BlockJob *job, BlockJobInfo *info) +static void mirror_query_old(BlockJob *job, BlockJobInfo *info) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); @@ -1303,6 +1303,15 @@ static void mirror_query(BlockJob *job, BlockJobInfo *info) }; } +static void mirror_query(Job *job, JobInfo *info) +{ +MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); + +info->u.mirror = (JobInfoMirror) { +.actively_synced = qatomic_read(>actively_synced), +}; +} + static const BlockJobDriver mirror_job_driver = { .job_driver = { .instance_size = sizeof(MirrorBlockJob), @@ -1316,9 +1325,10 @@ static const BlockJobDriver mirror_job_driver = { .complete = mirror_complete, .cancel = mirror_cancel, .change = mirror_change, +.query = mirror_query, }, .drained_poll = mirror_drained_poll, -.query = mirror_query, +.query = mirror_query_old, }; static bool commit_active_change(Job *job, JobChangeOptions *opts, Error **errp) diff --git a/include/qemu/job.h b/include/qemu/job.h index eee1d5b50f..8a238b8658 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -315,6 +315,11 @@ struct JobDriver { */ bool (*change)(Job *job, JobChangeOptions *opts, Error **errp); +/* + * Query information specific to this kind of block job. + */ +void (*query)(Job *job, JobInfo *info); + /** * Called when the job is freed. 
*/ diff --git a/job-qmp.c b/job-qmp.c index c048e03d9f..9643c5424d 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -177,6 +177,12 @@ static JobInfo *job_query_single_locked(Job *job, Error **errp) g_strdup(error_get_pretty(job->err)) : NULL, }; +if (job->driver->query) { +job_unlock(); +job->driver->query(job, info); +job_lock(); +} + return info; } diff --git a/qapi/job.json b/qapi/job.json index 2f1b839cfc..036fec1b57 100644 --- a/qapi/job.json +++ b/qapi/job.json @@ -251,6 +251,20 @@ ## { 'command': 'job-finalize', 'data': { 'id': 'str' } } +## +# @JobInfoMirror: +# +# Information specific to mirror block jobs. +# +# @actively-synced: Whether the source is actively synced to the +# target, i.e. same data and new writes are done synchronously to +# both. +# +# Since: 9.1 +## +{ 'struct': 'JobInfoMirror', + 'data': { 'actively-synced': 'bool' } } + ## # @JobInfo: # @@ -281,10 +295,12 @@ # # Since: 3.0 ## -{ 'struct': 'JobInfo', - 'data': { 'id': 'str', 'type': 'JobType', 'status': 'JobStatus', +{ 'union': 'JobInfo', + 'base': { 'id': 'str', 'type': 'JobType', 'status': 'JobStatus', 'current-progress': 'int', 'total-progress': 'int', -'*error': 'str' } } +'*error': 'str' }, + 'discriminator': 'type', + 'data': { 'mirror': 'JobInfoMirror' } } ## # @query-jobs: -- 2.34.1
[RFC 05/15] qapi: JobChangeOptions: make type member optional and deprecated
Now QEMU can understand type directly from the job itself, type is redundant. Signed-off-by: Vladimir Sementsov-Ogievskiy --- blockjob.c | 2 +- docs/about/deprecated.rst| 6 ++ job-qmp.c| 17 + qapi/block-core.json | 10 -- stubs/meson.build| 1 + stubs/qapi-jobchangeoptions-mapper.c | 8 6 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 stubs/qapi-jobchangeoptions-mapper.c diff --git a/blockjob.c b/blockjob.c index 788cb1e07d..33c40e7d25 100644 --- a/blockjob.c +++ b/blockjob.c @@ -319,7 +319,7 @@ void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, GLOBAL_STATE_CODE(); -if (job_type(>job) != opts->type) { +if (opts->has_type && job_type(>job) != opts->type) { error_setg(errp, "Job '%s' is '%s' job, not '%s'", job->job.id, job_type_str(>job), JobType_str(opts->type)); return; diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index dfd681cd02..64981e5e4a 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -142,6 +142,12 @@ accepted incorrect commands will return an error. Users should make sure that all arguments passed to ``device_add`` are consistent with the documented property types. + +``block-job-change`` argument ``type`` (since 9.1) +'' + +QEMU can get job type from the job itself (by @id), @type field is redundant. 
+ QEMU Machine Protocol (QMP) events -- diff --git a/job-qmp.c b/job-qmp.c index 9e26fa899f..c486df9579 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -26,6 +26,8 @@ #include "qemu/osdep.h" #include "qemu/job.h" #include "qapi/qapi-commands-job.h" +#include "qapi/qapi-types-block-core.h" +#include "qapi/qapi-visit-block-core.h" #include "qapi/error.h" #include "trace/trace-root.h" @@ -186,3 +188,18 @@ JobInfoList *qmp_query_jobs(Error **errp) return head; } + +bool JobChangeOptions_mapper(JobChangeOptions *opts, JobType *out, Error **errp) +{ +Job *job; + +JOB_LOCK_GUARD(); + +job = find_job_locked(opts->id, errp); +if (!job) { +return false; +} + +*out = job_type(job); +return true; +} diff --git a/qapi/block-core.json b/qapi/block-core.json index 6041e7bd8f..3d890dfcd0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3082,11 +3082,17 @@ # # @type: The job type # +# Features: +# +# @deprecated: Member @type is deprecated. QEMU can get type from +# the job itself, @type is redundant. 
+# # Since: 8.2 ## { 'union': 'JobChangeOptions', - 'base': { 'id': 'str', 'type': 'JobType' }, - 'discriminator': 'type', + 'base': { 'id': 'str', +'*type': { 'type': 'JobType', 'features': ['deprecated'] } }, + 'discriminator': 'JobType', 'data': { 'mirror': 'JobChangeOptionsMirror' } } ## diff --git a/stubs/meson.build b/stubs/meson.build index 0bf25e6ca5..e480400a6c 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -31,6 +31,7 @@ stub_ss.add(files('module-opts.c')) stub_ss.add(files('monitor.c')) stub_ss.add(files('monitor-core.c')) stub_ss.add(files('physmem.c')) +stub_ss.add(files('qapi-jobchangeoptions-mapper.c')) stub_ss.add(files('qemu-timer-notify-cb.c')) stub_ss.add(files('memory_device.c')) stub_ss.add(files('qmp-command-available.c')) diff --git a/stubs/qapi-jobchangeoptions-mapper.c b/stubs/qapi-jobchangeoptions-mapper.c new file mode 100644 index 00..e4acfd91b3 --- /dev/null +++ b/stubs/qapi-jobchangeoptions-mapper.c @@ -0,0 +1,8 @@ +#include "qemu/osdep.h" +#include "qapi/qapi-visit-block-core.h" +#include "qapi/qapi-types-job.h" + +bool JobChangeOptions_mapper(JobChangeOptions *opts, JobType *out, Error **errp) +{ +g_assert_not_reached(); +} -- 2.34.1
[RFC 01/15] scripts/qapi: support type-based unions
Look at block-job-change command: we have to specify both 'id' to chose the job to operate on and 'type' for QAPI union be parsed. But for user this looks redundant: when we specify 'id', QEMU should be able to get corresponding job's type. This commit brings such a possibility: just specify some Enum type as 'discriminator', and define a function somewhere with prototype bool YourUnionType_mapper(YourUnionType *, EnumType *out, Error **errp) Further commits will use this functionality to upgrade block-job-change interface and introduce some new interfaces. Signed-off-by: Vladimir Sementsov-Ogievskiy --- scripts/qapi/introspect.py | 5 +++- scripts/qapi/schema.py | 50 +++--- scripts/qapi/types.py | 3 ++- scripts/qapi/visit.py | 43 ++-- 4 files changed, 73 insertions(+), 28 deletions(-) diff --git a/scripts/qapi/introspect.py b/scripts/qapi/introspect.py index 67c7d89aae..04d8d424f5 100644 --- a/scripts/qapi/introspect.py +++ b/scripts/qapi/introspect.py @@ -336,7 +336,10 @@ def visit_object_type_flat(self, name: str, info: Optional[QAPISourceInfo], 'members': [self._gen_object_member(m) for m in members] } if variants: -obj['tag'] = variants.tag_member.name +if variants.tag_member: +obj['tag'] = variants.tag_member.name +else: +obj['discriminator-type'] = variants.tag_type.name obj['variants'] = [self._gen_variant(v) for v in variants.variants] self._gen_tree(name, 'object', obj, ifcond, features) diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py index 8ba5665bc6..0efe8d815c 100644 --- a/scripts/qapi/schema.py +++ b/scripts/qapi/schema.py @@ -585,16 +585,16 @@ def visit(self, visitor): class QAPISchemaVariants: -def __init__(self, tag_name, info, tag_member, variants): +def __init__(self, discriminator, info, tag_member, variants): # Unions pass tag_name but not tag_member. # Alternates pass tag_member but not tag_name. # After check(), tag_member is always set. 
-assert bool(tag_member) != bool(tag_name) -assert (isinstance(tag_name, str) or +assert bool(tag_member) != bool(discriminator) +assert (isinstance(discriminator, str) or isinstance(tag_member, QAPISchemaObjectTypeMember)) for v in variants: assert isinstance(v, QAPISchemaVariant) -self._tag_name = tag_name +self.discriminator = discriminator self.info = info self.tag_member = tag_member self.variants = variants @@ -604,16 +604,24 @@ def set_defined_in(self, name): v.set_defined_in(name) def check(self, schema, seen): -if self._tag_name: # union -self.tag_member = seen.get(c_name(self._tag_name)) +self.tag_type = None + +if self.discriminator: # assume union with type discriminator +self.tag_type = schema.lookup_type(self.discriminator) + +# union with discriminator field +if self.discriminator and not self.tag_type: +tag_name = self.discriminator +self.tag_member = seen.get(c_name(tag_name)) +self.tag_type = self.tag_member.type base = "'base'" # Pointing to the base type when not implicit would be # nice, but we don't know it here -if not self.tag_member or self._tag_name != self.tag_member.name: +if not self.tag_member or tag_name != self.tag_member.name: raise QAPISemError( self.info, "discriminator '%s' is not a member of %s" -% (self._tag_name, base)) +% (tag_name, base)) # Here we do: base_type = schema.lookup_type(self.tag_member.defined_in) assert base_type @@ -623,29 +631,33 @@ def check(self, schema, seen): raise QAPISemError( self.info, "discriminator member '%s' of %s must be of enum type" -% (self._tag_name, base)) +% (tag_name, base)) if self.tag_member.optional: raise QAPISemError( self.info, "discriminator member '%s' of %s must not be optional" -% (self._tag_name, base)) +% (tag_name, base)) if self.tag_member.ifcond.is_present(): raise QAPISemError( self.info, "discriminator member '%s' of %s must not be conditional" -% (self._tag_name, base)) -else: # alternate +% (tag_name, base)) +elif not self.tag_type: # alternate
[RFC 12/15] qapi: rename BlockDeviceIoStatus to IoStatus
This status is already shared between block-jobs and block-devices, so structure name is misleading a bit. We also will need to use the structure both in block-core.json and job.json. So give it more generic name first. The commit is made by commands: git grep -l BlockDeviceIoStatus | \ xargs sed -i 's/BlockDeviceIoStatus/IoStatus/g' git grep -l BLOCK_DEVICE_IO_STATUS | \ xargs sed -i 's/BLOCK_DEVICE_IO_STATUS/IO_STATUS/g' Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/block-backend.c | 14 +++--- block/mirror.c | 4 ++-- block/monitor/block-hmp-cmds.c | 4 ++-- blockjob.c | 10 +- include/block/blockjob.h| 2 +- include/sysemu/block-backend-global-state.h | 2 +- qapi/block-core.json| 10 +- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 9c4de79e6b..ec19c50e96 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -65,7 +65,7 @@ struct BlockBackend { BlockdevOnError on_read_error, on_write_error; bool iostatus_enabled; -BlockDeviceIoStatus iostatus; +IoStatus iostatus; uint64_t perm; uint64_t shared_perm; @@ -1198,7 +1198,7 @@ void blk_iostatus_enable(BlockBackend *blk) { GLOBAL_STATE_CODE(); blk->iostatus_enabled = true; -blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +blk->iostatus = IO_STATUS_OK; } /* The I/O status is only enabled if the drive explicitly @@ -1212,7 +1212,7 @@ bool blk_iostatus_is_enabled(const BlockBackend *blk) blk->on_read_error == BLOCKDEV_ON_ERROR_STOP)); } -BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk) +IoStatus blk_iostatus(const BlockBackend *blk) { GLOBAL_STATE_CODE(); return blk->iostatus; @@ -1228,7 +1228,7 @@ void blk_iostatus_reset(BlockBackend *blk) { GLOBAL_STATE_CODE(); if (blk_iostatus_is_enabled(blk)) { -blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +blk->iostatus = IO_STATUS_OK; } } @@ -1236,9 +1236,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error) { IO_CODE(); assert(blk_iostatus_is_enabled(blk)); -if (blk->iostatus == 
BLOCK_DEVICE_IO_STATUS_OK) { -blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : - BLOCK_DEVICE_IO_STATUS_FAILED; +if (blk->iostatus == IO_STATUS_OK) { +blk->iostatus = error == ENOSPC ? IO_STATUS_NOSPACE : + IO_STATUS_FAILED; } } diff --git a/block/mirror.c b/block/mirror.c index 96dcbbc3e8..8e672f3309 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -923,7 +923,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) MirrorBDSOpaque *mirror_top_opaque = s->mirror_top_bs->opaque; BlockDriverState *target_bs = blk_bs(s->target); bool need_drain = true; -BlockDeviceIoStatus iostatus; +IoStatus iostatus; int64_t length; int64_t target_length; BlockDriverInfo bdi; @@ -1060,7 +1060,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) iostatus = s->common.iostatus; } if (delta < BLOCK_JOB_SLICE_TIME && -iostatus == BLOCK_DEVICE_IO_STATUS_OK) { +iostatus == IO_STATUS_OK) { if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index d954bec6f1..64acb9cd6a 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -642,9 +642,9 @@ static void print_block_info(Monitor *mon, BlockInfo *info, if (info->qdev) { monitor_printf(mon, "Attached to: %s\n", info->qdev); } -if (info->has_io_status && info->io_status != BLOCK_DEVICE_IO_STATUS_OK) { +if (info->has_io_status && info->io_status != IO_STATUS_OK) { monitor_printf(mon, "I/O status: %s\n", - BlockDeviceIoStatus_str(info->io_status)); + IoStatus_str(info->io_status)); } if (info->removable) { diff --git a/blockjob.c b/blockjob.c index d3cd4f4fbf..de1dd03b2d 100644 --- a/blockjob.c +++ b/blockjob.c @@ -394,9 +394,9 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp) /* Called with job lock held */ static void block_job_iostatus_set_err_locked(BlockJob *job, int error) { 
-if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { -job->iostatus = error == ENOSPC ? BLOCK_D
[RFC 11/15] job-qmp: job_query_single_locked: add assertion on job ret
job->ret must always be set together with job->err. Let's assert this. Reporting no-error to the user, when job->err is unset and job->ret is somehow set would be a bug. Signed-off-by: Vladimir Sementsov-Ogievskiy --- job-qmp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/job-qmp.c b/job-qmp.c index 9643c5424d..3e2172c26a 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -163,6 +163,7 @@ static JobInfo *job_query_single_locked(Job *job, Error **errp) uint64_t progress_total; assert(!job_is_internal(job)); +assert(!job->err == !job->ret); progress_get_snapshot(&job->progress, &progress_current, &progress_total); -- 2.34.1
[RFC 07/15] qapi: add job-change
Add a new-style command job-change, doing the same thing as block-job-change. The aim is to finally deprecate block-job-* APIs and move to job-* APIs. We add a new command to qapi/block-core.json, not to qapi/job.json to avoid resolving json file including loops for now. This all would be a lot simpler to refactor when we finally drop deprecated block-job-* APIs. @type argument of the new command immediately becomes deprecated. Signed-off-by: Vladimir Sementsov-Ogievskiy --- docs/about/deprecated.rst | 4 ++-- job-qmp.c | 14 ++ qapi/block-core.json | 10 ++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 64981e5e4a..5ff98ef95f 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -143,8 +143,8 @@ all arguments passed to ``device_add`` are consistent with the documented property types. -``block-job-change`` argument ``type`` (since 9.1) -'' +``block-job-change`` and ``job-change`` argument ``type`` (since 9.1) +'' QEMU can get job type from the job itself (by @id), @type field is redundant. diff --git a/job-qmp.c b/job-qmp.c index abe9b59487..011a8736ea 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -141,6 +141,20 @@ void qmp_job_dismiss(const char *id, Error **errp) job_dismiss_locked(&job, errp); } +void qmp_job_change(JobChangeOptions *opts, Error **errp) +{ +Job *job; + +JOB_LOCK_GUARD(); +job = find_job_locked(opts->id, errp); + +if (!job) { +return; +} + +job_change_locked(job, opts, errp); +} + /* Called with job_mutex held. */ static JobInfo *job_query_single_locked(Job *job, Error **errp) { diff --git a/qapi/block-core.json b/qapi/block-core.json index 3d890dfcd0..f5cefa441b 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3105,6 +3105,16 @@ { 'command': 'block-job-change', 'data': 'JobChangeOptions', 'boxed': true } +## +# @job-change: +# +# Change the block job's options. 
+# +# Since: 9.1 +## +{ 'command': 'job-change', + 'data': 'JobChangeOptions', 'boxed': true } + ## # @BlockdevDiscardOptions: # -- 2.34.1
[RFC 14/15] qapi: query-job: add block-job specific information
Add io-status and speed, which make sense only for block-jobs. This allows us to finally deprecate old query-block-jobs API in the next commit. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/backup.c | 6 ++ block/commit.c | 6 ++ block/mirror.c | 8 block/stream.c | 6 ++ blockjob.c | 10 ++ include/block/blockjob.h | 6 ++ qapi/job.json| 21 - 7 files changed, 62 insertions(+), 1 deletion(-) diff --git a/block/backup.c b/block/backup.c index bf086dc5f9..55bbe85bf6 100644 --- a/block/backup.c +++ b/block/backup.c @@ -343,6 +343,11 @@ static bool backup_change(Job *job, JobChangeOptions *opts, Error **errp) return block_job_change(bjob, >u.backup, errp); } +static void backup_query(Job *job, JobInfo *info) +{ +block_job_query(job, >u.backup); +} + static const BlockJobDriver backup_job_driver = { .job_driver = { .instance_size = sizeof(BackupBlockJob), @@ -356,6 +361,7 @@ static const BlockJobDriver backup_job_driver = { .pause = backup_pause, .cancel = backup_cancel, .change = backup_change, +.query = backup_query, }, .set_speed = backup_set_speed, }; diff --git a/block/commit.c b/block/commit.c index ccb6097679..9199a6adc8 100644 --- a/block/commit.c +++ b/block/commit.c @@ -211,6 +211,11 @@ static bool commit_change(Job *job, JobChangeOptions *opts, Error **errp) return block_job_change(bjob, >u.commit, errp); } +static void commit_query(Job *job, JobInfo *info) +{ +block_job_query(job, >u.commit); +} + static const BlockJobDriver commit_job_driver = { .job_driver = { .instance_size = sizeof(CommitBlockJob), @@ -222,6 +227,7 @@ static const BlockJobDriver commit_job_driver = { .abort = commit_abort, .clean = commit_clean, .change= commit_change, +.query = commit_query, }, }; diff --git a/block/mirror.c b/block/mirror.c index 8e672f3309..e8092d56be 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1310,6 +1310,8 @@ static void mirror_query(Job *job, JobInfo *info) info->u.mirror = (JobInfoMirror) { .actively_synced = qatomic_read(>actively_synced), }; + 
+block_job_query(job, qapi_JobInfoMirror_base(>u.mirror)); } static const BlockJobDriver mirror_job_driver = { @@ -1338,6 +1340,11 @@ static bool commit_active_change(Job *job, JobChangeOptions *opts, Error **errp) return block_job_change(bjob, >u.commit, errp); } +static void commit_active_query(Job *job, JobInfo *info) +{ +block_job_query(job, >u.commit); +} + static const BlockJobDriver commit_active_job_driver = { .job_driver = { .instance_size = sizeof(MirrorBlockJob), @@ -1351,6 +1358,7 @@ static const BlockJobDriver commit_active_job_driver = { .complete = mirror_complete, .cancel = commit_active_cancel, .change = commit_active_change, +.query = commit_active_query, }, .drained_poll = mirror_drained_poll, }; diff --git a/block/stream.c b/block/stream.c index 34f4588537..e5e4d0bc77 100644 --- a/block/stream.c +++ b/block/stream.c @@ -246,6 +246,11 @@ static bool stream_change(Job *job, JobChangeOptions *opts, Error **errp) return block_job_change(bjob, >u.stream, errp); } +static void stream_query(Job *job, JobInfo *info) +{ +block_job_query(job, >u.stream); +} + static const BlockJobDriver stream_job_driver = { .job_driver = { .instance_size = sizeof(StreamBlockJob), @@ -256,6 +261,7 @@ static const BlockJobDriver stream_job_driver = { .clean = stream_clean, .user_resume = block_job_user_resume, .change= stream_change, +.query = stream_query, }, }; diff --git a/blockjob.c b/blockjob.c index de1dd03b2d..7dd1ed3ff2 100644 --- a/blockjob.c +++ b/blockjob.c @@ -306,6 +306,16 @@ bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp) return true; } +void block_job_query(Job *job, JobInfoBlockJob *info) +{ +BlockJob *bjob = container_of(job, BlockJob, job); + +JOB_LOCK_GUARD(); + +info->speed = bjob->speed; +info->io_status = bjob->iostatus; +} + static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { JOB_LOCK_GUARD(); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index fd7ba1a285..bc33c2ba77 100644 --- 
a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -231,4 +231,10 @@ const BlockJobDriver *block_job_driver(BlockJob *job); bool block_job_change(BlockJob *job, JobChangeOptionsBlockJob *opts, Error **errp); +/** + * Common pa
[RFC 06/15] blockjob: move change action implementation to job from block-job
Like for other block-job-* APIs we want have the actual functionality in job layer and make block-job-change to be a deprecated duplication of job-change in the following commit. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 7 +++ blockdev.c | 2 +- blockjob.c | 26 -- include/block/blockjob.h | 11 --- include/block/blockjob_int.h | 7 --- include/qemu/job.h | 12 job-qmp.c| 1 + job.c| 23 +++ 8 files changed, 40 insertions(+), 49 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 2d0cd22c06..e670d0dd4f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1251,10 +1251,9 @@ static bool commit_active_cancel(Job *job, bool force) return force || !job_is_ready(job); } -static void mirror_change(BlockJob *job, JobChangeOptions *opts, - Error **errp) +static void mirror_change(Job *job, JobChangeOptions *opts, Error **errp) { -MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); +MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); JobChangeOptionsMirror *change_opts = >u.mirror; MirrorCopyMode current; @@ -1310,9 +1309,9 @@ static const BlockJobDriver mirror_job_driver = { .pause = mirror_pause, .complete = mirror_complete, .cancel = mirror_cancel, +.change = mirror_change, }, .drained_poll = mirror_drained_poll, -.change = mirror_change, .query = mirror_query, }; diff --git a/blockdev.c b/blockdev.c index 7881f6e5a6..7e13213040 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3256,7 +3256,7 @@ void qmp_block_job_change(JobChangeOptions *opts, Error **errp) return; } -block_job_change_locked(job, opts, errp); +job_change_locked(>job, opts, errp); } void qmp_change_backing_file(const char *device, diff --git a/blockjob.c b/blockjob.c index 33c40e7d25..2769722b37 100644 --- a/blockjob.c +++ b/blockjob.c @@ -312,32 +312,6 @@ static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) return block_job_set_speed_locked(job, speed, errp); } -void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, - 
Error **errp) -{ -const BlockJobDriver *drv = block_job_driver(job); - -GLOBAL_STATE_CODE(); - -if (opts->has_type && job_type(>job) != opts->type) { -error_setg(errp, "Job '%s' is '%s' job, not '%s'", job->job.id, - job_type_str(>job), JobType_str(opts->type)); -return; -} - -if (job_apply_verb_locked(>job, JOB_VERB_CHANGE, errp)) { -return; -} - -if (drv->change) { -job_unlock(); -drv->change(job, opts, errp); -job_lock(); -} else { -error_setg(errp, "Job type does not support change"); -} -} - void block_job_ratelimit_processed_bytes(BlockJob *job, uint64_t n) { IO_CODE(); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 5dd1b08909..72e849a140 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -173,17 +173,6 @@ bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs); */ bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp); -/** - * block_job_change_locked: - * @job: The job to change. - * @opts: The new options. - * @errp: Error object. - * - * Change the job according to opts. - */ -void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, - Error **errp); - /** * block_job_query_locked: * @job: The job to get information about. diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index d9c3b911d0..58bc7a5cea 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -68,13 +68,6 @@ struct BlockJobDriver { void (*set_speed)(BlockJob *job, int64_t speed); -/* - * Change the @job's options according to @opts. - * - * Note that this can already be called before the job coroutine is running. - */ -void (*change)(BlockJob *job, JobChangeOptions *opts, Error **errp); - /* * Query information specific to this kind of block job. 
*/ diff --git a/include/qemu/job.h b/include/qemu/job.h index 9ea98b5927..d44cdb0cc8 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -27,6 +27,7 @@ #define JOB_H #include "qapi/qapi-types-job.h" +#include "qapi/qapi-types-block-core.h" #include "qemu/queue.h" #include "qemu/progress_meter.h" #include "qemu/coroutine.h" @@ -307,6 +308,12 @@ struct JobDriver { */ bool (*cancel)(Job *job, bool forc
[RFC 13/15] qapi: move IoStatus to common.json
We will need to use the structure both in block-core.json and job.json. So, move it to common.json, which could be then included to job.json. Signed-off-by: Vladimir Sementsov-Ogievskiy --- qapi/block-core.json | 16 qapi/common.json | 16 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1f00d4f031..75c02e1586 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -570,22 +570,6 @@ '*iops_size': 'int', '*group': 'str', 'cache': 'BlockdevCacheInfo', 'write_threshold': 'int', '*dirty-bitmaps': ['BlockDirtyInfo'] } } -## -# @IoStatus: -# -# An enumeration of block device I/O status. -# -# @ok: The last I/O operation has succeeded -# -# @failed: The last I/O operation has failed -# -# @nospace: The last I/O operation has failed due to a no-space -# condition -# -# Since: 1.0 -## -{ 'enum': 'IoStatus', 'data': [ 'ok', 'failed', 'nospace' ] } - ## # @BlockDirtyInfo: # diff --git a/qapi/common.json b/qapi/common.json index f1bb841951..3becca1300 100644 --- a/qapi/common.json +++ b/qapi/common.json @@ -19,6 +19,22 @@ { 'enum': 'IoOperationType', 'data': [ 'read', 'write' ] } +## +# @IoStatus: +# +# An enumeration of block device I/O status. +# +# @ok: The last I/O operation has succeeded +# +# @failed: The last I/O operation has failed +# +# @nospace: The last I/O operation has failed due to a no-space +# condition +# +# Since: 1.0 +## +{ 'enum': 'IoStatus', 'data': [ 'ok', 'failed', 'nospace' ] } + ## # @OnOffAuto: # -- 2.34.1
[RFC 09/15] qapi: job-complete: introduce no-block-replace option for mirror
That's an alternative to block-job-cancel(hard=false) behavior for mirror, which exactly is: complete the job, wait for in-flight requests, but don't do any block graph modifications. Why: 1. We move to deprecating block-job-* APIs and use job-* APIs instead. So we need to port somehow the functionality to job- API. 2. The specified behavior was also a kind of "quirk". It's strange to "cancel" something in a normal success path of user scenario. This block-job-cancel(hard=false) results in BLOCK_JOB_COMPLETE event, so definitely it's just another mode of "complete", not "cancel". Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 8 +--- include/qemu/job.h | 8 +++- job-qmp.c| 20 +++- job.c| 12 ++-- qapi/job.json| 28 +--- stubs/qapi-jobchangeoptions-mapper.c | 5 + tests/unit/test-bdrv-drain.c | 2 +- tests/unit/test-block-iothread.c | 2 +- tests/unit/test-blockjob.c | 2 +- 9 files changed, 70 insertions(+), 17 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index f474b87079..e95c54fbc6 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -69,6 +69,7 @@ typedef struct MirrorBlockJob { */ bool actively_synced; bool should_complete; +bool no_block_replace; int64_t granularity; size_t buf_size; int64_t bdev_length; @@ -740,7 +741,7 @@ static int mirror_exit_common(Job *job) } bdrv_graph_rdunlock_main_loop(); -if (s->should_complete && !abort) { +if (s->should_complete && !abort && !s->no_block_replace) { BlockDriverState *to_replace = s->to_replace ?: src; bool ro = bdrv_is_read_only(to_replace); @@ -1167,7 +1168,7 @@ immediate_exit: return ret; } -static void mirror_complete(Job *job, Error **errp) +static void mirror_complete(Job *job, JobComplete *opts, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); @@ -1178,7 +1179,7 @@ static void mirror_complete(Job *job, Error **errp) } /* block all operations on to_replace bs */ -if (s->replaces) { +if (s->replaces && !opts->u.mirror.no_block_replace) { s->to_replace = 
bdrv_find_node(s->replaces); if (!s->to_replace) { error_setg(errp, "Node name '%s' not found", s->replaces); @@ -1193,6 +1194,7 @@ static void mirror_complete(Job *job, Error **errp) } s->should_complete = true; +s->no_block_replace = opts->u.mirror.no_block_replace; /* If the job is paused, it will be re-entered when it is resumed */ WITH_JOB_LOCK_GUARD() { diff --git a/include/qemu/job.h b/include/qemu/job.h index 1c9da74a0c..eee1d5b50f 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -254,7 +254,7 @@ struct JobDriver { * Optional callback for job types whose completion must be triggered * manually. */ -void (*complete)(Job *job, Error **errp); +void (*complete)(Job *job, JobComplete *opts, Error **errp); /** * If the callback is not NULL, prepare will be invoked when all the jobs @@ -634,6 +634,12 @@ void job_early_fail(Job *job); */ void job_transition_to_ready(Job *job); +/** + * Asynchronously complete the specified @job. + * Called with job lock held, but might release it temporarily. + */ +void job_complete_opts_locked(Job *job, JobComplete *opts, Error **errp); + /** * Asynchronously complete the specified @job. * Called with job lock held, but might release it temporarily. 
diff --git a/job-qmp.c b/job-qmp.c index 011a8736ea..c048e03d9f 100644 --- a/job-qmp.c +++ b/job-qmp.c @@ -93,19 +93,19 @@ void qmp_job_resume(const char *id, Error **errp) job_user_resume_locked(job, errp); } -void qmp_job_complete(const char *id, Error **errp) +void qmp_job_complete(JobComplete *opts, Error **errp) { Job *job; JOB_LOCK_GUARD(); -job = find_job_locked(id, errp); +job = find_job_locked(opts->id, errp); if (!job) { return; } trace_qmp_job_complete(job); -job_complete_locked(job, errp); +job_complete_opts_locked(job, opts, errp); } void qmp_job_finalize(const char *id, Error **errp) @@ -204,13 +204,13 @@ JobInfoList *qmp_query_jobs(Error **errp) return head; } -bool JobChangeOptions_mapper(JobChangeOptions *opts, JobType *out, Error **errp) +static bool job_mapper(const char *id, JobType *out, Error **errp) { Job *job; JOB_LOCK_GUARD(); -job = find_job_locked(opts->id, errp); +job = find_job_locked(id, errp); if (!job) { return false; } @@ -218,3 +218,13 @@ bool JobChangeOptions_mapper(JobChangeOptions *opts, JobType *out, Error **errp)
[RFC 04/15] qapi: block-job-change: make copy-mode parameter optional
We are going to add more parameters to change. We want to make possible to change only one or any subset of available options. So all the options should be optional. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 5 + qapi/block-core.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index a177502e4f..2d0cd22c06 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1265,6 +1265,11 @@ static void mirror_change(BlockJob *job, JobChangeOptions *opts, GLOBAL_STATE_CODE(); +if (!change_opts->has_copy_mode) { +/* Nothing to do */ +return; +} + if (qatomic_read(>copy_mode) == change_opts->copy_mode) { return; } diff --git a/qapi/block-core.json b/qapi/block-core.json index 67dd0ef038..6041e7bd8f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3071,7 +3071,7 @@ # Since: 8.2 ## { 'struct': 'JobChangeOptionsMirror', - 'data': { 'copy-mode' : 'MirrorCopyMode' } } + 'data': { '*copy-mode' : 'MirrorCopyMode' } } ## # @JobChangeOptions: -- 2.34.1
[RFC 02/15] qapi: rename BlockJobChangeOptions to JobChangeOptions
We are going to move change action from block-job to job implementation, and then move to job-* extenral APIs, deprecating block-job-* APIs. This commit simplifies further transition. The commit is made by command git grep -l BlockJobChangeOptions | \ xargs sed -i 's/BlockJobChangeOptions/JobChangeOptions/g' Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/mirror.c | 4 ++-- blockdev.c | 2 +- blockjob.c | 2 +- include/block/blockjob.h | 2 +- include/block/blockjob_int.h | 2 +- qapi/block-core.json | 12 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 5145eb53e1..a177502e4f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1251,11 +1251,11 @@ static bool commit_active_cancel(Job *job, bool force) return force || !job_is_ready(job); } -static void mirror_change(BlockJob *job, BlockJobChangeOptions *opts, +static void mirror_change(BlockJob *job, JobChangeOptions *opts, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); -BlockJobChangeOptionsMirror *change_opts = >u.mirror; +JobChangeOptionsMirror *change_opts = >u.mirror; MirrorCopyMode current; /* diff --git a/blockdev.c b/blockdev.c index d8fb3399f5..7881f6e5a6 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3245,7 +3245,7 @@ void qmp_block_job_dismiss(const char *id, Error **errp) job_dismiss_locked(, errp); } -void qmp_block_job_change(BlockJobChangeOptions *opts, Error **errp) +void qmp_block_job_change(JobChangeOptions *opts, Error **errp) { BlockJob *job; diff --git a/blockjob.c b/blockjob.c index d5f29e14af..8cfbb15543 100644 --- a/blockjob.c +++ b/blockjob.c @@ -312,7 +312,7 @@ static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) return block_job_set_speed_locked(job, speed, errp); } -void block_job_change_locked(BlockJob *job, BlockJobChangeOptions *opts, +void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, Error **errp) { const BlockJobDriver *drv = block_job_driver(job); diff 
--git a/include/block/blockjob.h b/include/block/blockjob.h index 7061ab7201..5dd1b08909 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -181,7 +181,7 @@ bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp); * * Change the job according to opts. */ -void block_job_change_locked(BlockJob *job, BlockJobChangeOptions *opts, +void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, Error **errp); /** diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index 4c3d2e25a2..d9c3b911d0 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -73,7 +73,7 @@ struct BlockJobDriver { * * Note that this can already be called before the job coroutine is running. */ -void (*change)(BlockJob *job, BlockJobChangeOptions *opts, Error **errp); +void (*change)(BlockJob *job, JobChangeOptions *opts, Error **errp); /* * Query information specific to this kind of block job. diff --git a/qapi/block-core.json b/qapi/block-core.json index 1874f880a8..67dd0ef038 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3063,18 +3063,18 @@ 'allow-preconfig': true } ## -# @BlockJobChangeOptionsMirror: +# @JobChangeOptionsMirror: # # @copy-mode: Switch to this copy mode. Currently, only the switch # from 'background' to 'write-blocking' is implemented. # # Since: 8.2 ## -{ 'struct': 'BlockJobChangeOptionsMirror', +{ 'struct': 'JobChangeOptionsMirror', 'data': { 'copy-mode' : 'MirrorCopyMode' } } ## -# @BlockJobChangeOptions: +# @JobChangeOptions: # # Block job options that can be changed after job creation. 
# @@ -3084,10 +3084,10 @@ # # Since: 8.2 ## -{ 'union': 'BlockJobChangeOptions', +{ 'union': 'JobChangeOptions', 'base': { 'id': 'str', 'type': 'JobType' }, 'discriminator': 'type', - 'data': { 'mirror': 'BlockJobChangeOptionsMirror' } } + 'data': { 'mirror': 'JobChangeOptionsMirror' } } ## # @block-job-change: @@ -3097,7 +3097,7 @@ # Since: 8.2 ## { 'command': 'block-job-change', - 'data': 'BlockJobChangeOptions', 'boxed': true } + 'data': 'JobChangeOptions', 'boxed': true } ## # @BlockdevDiscardOptions: -- 2.34.1
[RFC 08/15] qapi: job-change: support speed parameter
Allow change job speed through job-change command. Old block-job-set-speed would be deprecated soon. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/backup.c | 8 ++ block/commit.c | 10 ++- block/mirror.c | 60 block/stream.c | 8 ++ blockjob.c | 13 + include/block/blockjob.h | 7 + include/qemu/job.h | 2 +- qapi/block-core.json | 23 +-- 8 files changed, 103 insertions(+), 28 deletions(-) diff --git a/block/backup.c b/block/backup.c index ec29d6b810..bf086dc5f9 100644 --- a/block/backup.c +++ b/block/backup.c @@ -336,6 +336,13 @@ static bool backup_cancel(Job *job, bool force) return true; } +static bool backup_change(Job *job, JobChangeOptions *opts, Error **errp) +{ +BlockJob *bjob = container_of(job, BlockJob, job); + +return block_job_change(bjob, >u.backup, errp); +} + static const BlockJobDriver backup_job_driver = { .job_driver = { .instance_size = sizeof(BackupBlockJob), @@ -348,6 +355,7 @@ static const BlockJobDriver backup_job_driver = { .clean = backup_clean, .pause = backup_pause, .cancel = backup_cancel, +.change = backup_change, }, .set_speed = backup_set_speed, }; diff --git a/block/commit.c b/block/commit.c index 7c3fdcb0ca..ccb6097679 100644 --- a/block/commit.c +++ b/block/commit.c @@ -204,6 +204,13 @@ static int coroutine_fn commit_run(Job *job, Error **errp) return 0; } +static bool commit_change(Job *job, JobChangeOptions *opts, Error **errp) +{ +BlockJob *bjob = container_of(job, BlockJob, job); + +return block_job_change(bjob, >u.commit, errp); +} + static const BlockJobDriver commit_job_driver = { .job_driver = { .instance_size = sizeof(CommitBlockJob), @@ -213,7 +220,8 @@ static const BlockJobDriver commit_job_driver = { .run = commit_run, .prepare = commit_prepare, .abort = commit_abort, -.clean = commit_clean +.clean = commit_clean, +.change= commit_change, }, }; diff --git a/block/mirror.c b/block/mirror.c index e670d0dd4f..f474b87079 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1251,41 +1251,45 @@ static bool 
commit_active_cancel(Job *job, bool force) return force || !job_is_ready(job); } -static void mirror_change(Job *job, JobChangeOptions *opts, Error **errp) +static bool mirror_change(Job *job, JobChangeOptions *opts, Error **errp) { +BlockJob *bjob = container_of(job, BlockJob, job); MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); JobChangeOptionsMirror *change_opts = >u.mirror; -MirrorCopyMode current; - -/* - * The implementation relies on the fact that copy_mode is only written - * under the BQL. Otherwise, further synchronization would be required. - */ +MirrorCopyMode old_mode; GLOBAL_STATE_CODE(); -if (!change_opts->has_copy_mode) { -/* Nothing to do */ -return; -} +if (change_opts->has_copy_mode) { +old_mode = qatomic_read(>copy_mode); -if (qatomic_read(>copy_mode) == change_opts->copy_mode) { -return; -} +if (old_mode != change_opts->copy_mode) { +if (change_opts->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING) { +error_setg(errp, "Change to copy mode '%s' is not implemented", + MirrorCopyMode_str(change_opts->copy_mode)); +return false; +} -if (change_opts->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING) { -error_setg(errp, "Change to copy mode '%s' is not implemented", - MirrorCopyMode_str(change_opts->copy_mode)); -return; +if (old_mode != MIRROR_COPY_MODE_BACKGROUND) { +error_setg(errp, "Expected current copy mode '%s', got '%s'", + MirrorCopyMode_str(MIRROR_COPY_MODE_BACKGROUND), + MirrorCopyMode_str(old_mode)); +return false; +} +} } -current = qatomic_cmpxchg(>copy_mode, MIRROR_COPY_MODE_BACKGROUND, - change_opts->copy_mode); -if (current != MIRROR_COPY_MODE_BACKGROUND) { -error_setg(errp, "Expected current copy mode '%s', got '%s'", - MirrorCopyMode_str(MIRROR_COPY_MODE_BACKGROUND), - MirrorCopyMode_str(current)); +if (!block_job_change(bjob, qapi_JobChangeOptionsMirror_base(change_opts), + errp)) +{ +return false; } + +old_mode = qatomic_cmpxchg(>copy_mode, MIRROR_COPY_MODE_BACKGROUND, + chan
[RFC 15/15] qapi/block-core: deprecate block-job- APIs
For change, pause, resume, complete, dismiss and finalize actions corresponding job- and block-job commands are almost equal. The difference is in find_block_job_locked() vs find_job_locked() functions. What's different? 1. find_block_job_locked() do check, is found job a block-job. This OK when moving to more generic API, no needs to document this change. 2. find_block_job_locked() reports DeviceNotActive on failure, when find_job_locked() reports GenericError. So, lets document this difference in deprecated.txt. Still, for dismiss and finalize errors are not documented at all, so be silent in deprecated.txt as well. For cancel, we have a new solution about soft-cancelling mirror: job-complete(no-block-replace=true) For set-speed, the action is supported by job-change. For query, query-jobs is not equal to query-block-jobs, but now it has enough information (see documentation changes for details). Signed-off-by: Vladimir Sementsov-Ogievskiy --- docs/about/deprecated.rst | 73 +-- qapi/block-core.json | 59 ++- 2 files changed, 129 insertions(+), 3 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 5ff98ef95f..7db3ba983b 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -128,6 +128,75 @@ options are removed in favor of using explicit ``blockdev-create`` and ``blockdev-add`` calls. See :doc:`/interop/live-block-operations` for details. +``block-job-pause`` (since 9.1) +''' + +Use ``job-pause`` instead. The only difference is that ``job-pause`` +always reports GenericError on failure when ``block-job-pause`` reports +DeviceNotActive when block-job is not found. + +``block-job-resume`` (since 9.1) + + +Use ``job-resume`` instead. The only difference is that ``job-resume`` +always reports GenericError on failure when ``block-job-resume`` reports +DeviceNotActive when block-job is not found. + +``block-job-complete`` (since 9.1) +'' + +Use ``job-complete`` instead. 
The only difference is that ``job-complete`` +always reports GenericError on failure when ``block-job-complete`` reports +DeviceNotActive when block-job is not found. + +``block-job-dismiss`` (since 9.1) +' + +Use ``job-dismiss`` instead. + +``block-job-finalize`` (since 9.1) +'' + +Use ``job-finalize`` instead. + +``block-job-set-speed`` (since 9.1) +''' + +Use ``job-change`` instead. + +``block-job-change`` (since 9.1) + + +Use ``job-change`` instead. + +``block-job-cancel`` (since 9.1) + + +Use ``job-cancel`` instead which corresponds to +``block-job-cancel`` (``force`` = true ). For special case of +soft-cancelling mirror in ready state, use ``job-complete`` +(``no-block-replace`` = true ). + +``query-block-jobs`` (since 9.1) + +Use ``query-jobs`` instead. Per-field recommendations: + +``query-block-jobs`` ``device`` field: use ``query-jobs`` ``id`` field. + +``query-block-jobs`` ``len`` and ``offset`` fields: use ``query-jobs`` +``current-progress`` and ``total-progress`` fields. + +``query-block-jobs`` ``paused``, ``ready``: +use ``query-jobs`` ``status``. + +``auto-finalize``, ``auto-dismiss``: these are parameters set by user +on job creation and never changed. So, no reason to query them. No +support in ``query-jobs``. + +``busy``: it actually means nothing for user, it's a mistake to rely on +it. No support in ``query-jobs``. + + Incorrectly typed ``device_add`` arguments (since 6.2) '' @@ -143,8 +212,8 @@ all arguments passed to ``device_add`` are consistent with the documented property types. -``block-job-change`` and ``job-change`` argument ``type`` (since 9.1) -'' +``job-change`` argument ``type`` (since 9.1) + QEMU can get job type from the job itself (by @id), @type field is redundant.
diff --git a/qapi/block-core.json b/qapi/block-core.json index 75c02e1586..793155f174 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1422,9 +1422,15 @@ # # Returns: a list of @BlockJobInfo for each active block job # +# Features: +# +# @deprecated: This command is deprecated. Use @query-jobs +# instead. +# # Since: 1.1 ## { 'command': 'query-block-jobs', 'returns': ['BlockJobInfo'], + 'features': ['deprecated'], 'allow-preconfig': true } ## @@ -2875,6 +2881,11 @@ # @speed: the maximum speed, in bytes per second, or 0 for unlimited. # Defaults to 0. # +# Features: +# +# @deprecated: This command is deprecated. Use @job-change +# instead. +# # Errors: # - If no background operation is active on this device
[RFC 03/15] blockjob: block_job_change_locked(): check job type
User may specify wrong type for the job id. Let's check it. Signed-off-by: Vladimir Sementsov-Ogievskiy --- blockjob.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/blockjob.c b/blockjob.c index 8cfbb15543..788cb1e07d 100644 --- a/blockjob.c +++ b/blockjob.c @@ -319,6 +319,12 @@ void block_job_change_locked(BlockJob *job, JobChangeOptions *opts, GLOBAL_STATE_CODE(); +if (job_type(&job->job) != opts->type) {+error_setg(errp, "Job '%s' is '%s' job, not '%s'", job->job.id, + job_type_str(&job->job), JobType_str(opts->type)); +return; +} + if (job_apply_verb_locked(&job->job, JOB_VERB_CHANGE, errp)) { return; } -- 2.34.1
[RFC 00/15] block job API
Hi all! The series aims to reach feature parity between job-* and block-job-* APIs and finally deprecate block-job-* APIs. 01: new type-based unions for QAPI 02-07: rework block-job-change and add similar job-change command 08: support set-speed in new API (as an option to job-change) 09: support soft-cancelling ready mirror job in new API (as an option to job-complete) 10-14: prepare query-jobs to substitute query-block-jobs 15: finally, deprecate old APIs RFC: the series are untested. I plan to move tests to using new APIs instead of deprecated ones in v2. Vladimir Sementsov-Ogievskiy (15): scripts/qapi: support type-based unions qapi: rename BlockJobChangeOptions to JobChangeOptions blockjob: block_job_change_locked(): check job type qapi: block-job-change: make copy-mode parameter optional qapi: JobChangeOptions: make type member optional and deprecated blockjob: move change action implementation to job from block-job qapi: add job-change qapi: job-change: support speed parameter qapi: job-complete: introduce no-block-replace option for mirror qapi: query-jobs: add information specific for job type job-qmp: job_query_single_locked: add assertion on job ret qapi: rename BlockDeviceIoStatus to IoStatus qapi: move IoStatus to common.json qapi: query-job: add block-job specific information qapi/block-core: derpecate block-job- APIs block/backup.c | 14 ++ block/block-backend.c | 14 +- block/commit.c | 16 ++- block/mirror.c | 98 +- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 14 ++ blockdev.c | 4 +- blockjob.c | 39 +++--- docs/about/deprecated.rst | 75 +++ include/block/blockjob.h| 26 ++-- include/block/blockjob_int.h| 7 - include/qemu/job.h | 25 +++- include/sysemu/block-backend-global-state.h | 2 +- job-qmp.c | 55 +++- job.c | 35 - qapi/block-core.json| 136 +++- qapi/common.json| 16 +++ qapi/job.json | 69 +- scripts/qapi/introspect.py | 5 +- scripts/qapi/schema.py | 50 --- scripts/qapi/types.py | 3 +- scripts/qapi/visit.py | 43 ++- stubs/meson.build | 1 + 
stubs/qapi-jobchangeoptions-mapper.c| 13 ++ tests/unit/test-bdrv-drain.c| 2 +- tests/unit/test-block-iothread.c| 2 +- tests/unit/test-blockjob.c | 2 +- 27 files changed, 616 insertions(+), 154 deletions(-) create mode 100644 stubs/qapi-jobchangeoptions-mapper.c -- 2.34.1
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 12.03.24 18:49, Kevin Wolf wrote: Am 12.03.2024 um 14:44 hat Vladimir Sementsov-Ogievskiy geschrieben: On 11.03.24 18:15, Vladimir Sementsov-Ogievskiy wrote: On 08.03.24 11:52, Kevin Wolf wrote: Am 07.03.2024 um 20:42 hat Vladimir Sementsov-Ogievskiy geschrieben: On 04.03.24 14:09, Peter Krempa wrote: On Mon, Mar 04, 2024 at 11:48:54 +0100, Kevin Wolf wrote: Am 28.02.2024 um 19:07 hat Vladimir Sementsov-Ogievskiy geschrieben: On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: [...] Is the job abstraction a failure? We have block-job- command since job- command since - block-job-set-speed 1.1 block-job-cancel 1.1 job-cancel 3.0 block-job-pause 1.3 job-pause 3.0 block-job-resume 1.3 job-resume 3.0 block-job-complete 1.3 job-complete 3.0 block-job-dismiss 2.12 job-dismiss 3.0 block-job-finalize 2.12 job-finalize 3.0 block-job-change 8.2 query-block-jobs 1.1 query-jobs [...] I consider these strictly optional. We don't really have strong reasons to deprecate these commands (they are just thin wrappers), and I think libvirt still uses block-job-* in some places. Libvirt uses 'block-job-cancel' because it has different semantics from 'job-cancel' which libvirt documented as the behaviour of the API that uses it. (Semantics regarding the expectation of what is written to the destination node at the point when the job is cancelled). That's the following semantics: # Note that if you issue 'block-job-cancel' after 'drive-mirror' has # indicated (via the event BLOCK_JOB_READY) that the source and # destination are synchronized, then the event triggered by this # command changes to BLOCK_JOB_COMPLETED, to indicate that the # mirroring has ended and the destination now has a point-in-time copy # tied to the time of the cancellation. Hmm. Looking at this, it looks for me, that should probably a 'block-job-complete" command (as leading to BLOCK_JOB_COMPLETED). Yes, it's just a different completion mode. 
Actually, what is the difference between block-job-complete and block-job-cancel(force=false) for mirror in ready state? I only see the following differencies: 1. block-job-complete documents that it completes the job synchronously.. But looking at mirror code I see it just set s->should_complete = true, which will be then handled asynchronously.. So I doubt that documentation is correct. 2. block-job-complete will trigger final graph changes. block-job-cancel will not. Is [2] really useful? Seems yes: in case of some failure before starting migration target, we'd like to continue executing source. So, no reason to break block-graph in source, better keep it unchanged. But I think, such behavior better be setup by mirror-job start parameter, rather then by special option for cancel (or even compelete) command, useful only for mirror. I'm not sure, having the option on the complete command makes more sense to me than having it in blockdev-mirror. I do see the challenge of representing this meaningfully in QAPI, though. Semantically it should be a union with job-specific options and only mirror adds the graph-changes option. But the union variant can't be directly selected from another option - instead we have a job ID, and the variant is the job type of the job with this ID. We already have such command: block-job-change. Which has id and type parameters, so user have to pass both, to identify the job itself and pick corresponding variant of the union type. That would be good to somehow teach QAPI to get the type automatically from the job itself... Seems, that's easy enough to implement such a possibility, I'll try. 
At least now I have a prototype, which compiles diff --git a/qapi/block-core.json b/qapi/block-core.json index 0ae8ae62dc..332de67e52 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3116,13 +3116,11 @@ # # @id: The job identifier # -# @type: The job type -# # Since: 8.2 ## { 'union': 'BlockJobChangeOptions', - 'base': { 'id': 'str', 'type': 'JobType' }, - 'discriminator': 'type', + 'base': { 'id': 'str' }, + 'discriminator': 'JobType', 'data': { 'mirror': 'BlockJobChangeOptionsMirror' } } We probably need some different syntax because I think in theory we could get conflicts between option names and type names. I think we shouldn't, as enum types in QAPI are CamelCase and members are lowercase. Still, it's probably a bad way to rely on text case (I just imagine documenting this:)) It could be "'discriminator-type': 'JobType'" instead. But I'll leave this discussion to Markus. I hope you can figure out something that he is willing to accept, getting the variant from a callback
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 11.03.24 18:15, Vladimir Sementsov-Ogievskiy wrote: On 08.03.24 11:52, Kevin Wolf wrote: Am 07.03.2024 um 20:42 hat Vladimir Sementsov-Ogievskiy geschrieben: On 04.03.24 14:09, Peter Krempa wrote: On Mon, Mar 04, 2024 at 11:48:54 +0100, Kevin Wolf wrote: Am 28.02.2024 um 19:07 hat Vladimir Sementsov-Ogievskiy geschrieben: On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: [...] Is the job abstraction a failure? We have block-job- command since job- command since - block-job-set-speed 1.1 block-job-cancel 1.1 job-cancel 3.0 block-job-pause 1.3 job-pause 3.0 block-job-resume 1.3 job-resume 3.0 block-job-complete 1.3 job-complete 3.0 block-job-dismiss 2.12 job-dismiss 3.0 block-job-finalize 2.12 job-finalize 3.0 block-job-change 8.2 query-block-jobs 1.1 query-jobs [...] I consider these strictly optional. We don't really have strong reasons to deprecate these commands (they are just thin wrappers), and I think libvirt still uses block-job-* in some places. Libvirt uses 'block-job-cancel' because it has different semantics from 'job-cancel' which libvirt documented as the behaviour of the API that uses it. (Semantics regarding the expectation of what is written to the destination node at the point when the job is cancelled). That's the following semantics: # Note that if you issue 'block-job-cancel' after 'drive-mirror' has # indicated (via the event BLOCK_JOB_READY) that the source and # destination are synchronized, then the event triggered by this # command changes to BLOCK_JOB_COMPLETED, to indicate that the # mirroring has ended and the destination now has a point-in-time copy # tied to the time of the cancellation. Hmm. Looking at this, it looks for me, that should probably a 'block-job-complete" command (as leading to BLOCK_JOB_COMPLETED). Yes, it's just a different completion mode. Actually, what is the difference between block-job-complete and block-job-cancel(force=false) for mirror in ready state? I only see the following differencies: 1. 
block-job-complete documents that it completes the job synchronously.. But looking at mirror code I see it just set s->should_complete = true, which will be then handled asynchronously.. So I doubt that documentation is correct. 2. block-job-complete will trigger final graph changes. block-job-cancel will not. Is [2] really useful? Seems yes: in case of some failure before starting migration target, we'd like to continue executing source. So, no reason to break block-graph in source, better keep it unchanged. But I think, such behavior better be setup by mirror-job start parameter, rather then by special option for cancel (or even compelete) command, useful only for mirror. I'm not sure, having the option on the complete command makes more sense to me than having it in blockdev-mirror. I do see the challenge of representing this meaningfully in QAPI, though. Semantically it should be a union with job-specific options and only mirror adds the graph-changes option. But the union variant can't be directly selected from another option - instead we have a job ID, and the variant is the job type of the job with this ID. We already have such command: block-job-change. Which has id and type parameters, so user have to pass both, to identify the job itself and pick corresponding variant of the union type. That would be good to somehow teach QAPI to get the type automatically from the job itself... Seems, that's easy enough to implement such a possibility, I'll try. 
At least now I have a prototype, which compiles diff --git a/qapi/block-core.json b/qapi/block-core.json index 0ae8ae62dc..332de67e52 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3116,13 +3116,11 @@ # # @id: The job identifier # -# @type: The job type -# # Since: 8.2 ## { 'union': 'BlockJobChangeOptions', - 'base': { 'id': 'str', 'type': 'JobType' }, - 'discriminator': 'type', + 'base': { 'id': 'str' }, + 'discriminator': 'JobType', 'data': { 'mirror': 'BlockJobChangeOptionsMirror' } } to bool visit_type_BlockJobChangeOptions_members(Visitor *v, BlockJobChangeOptions *obj, Error **errp) { JobType tag; if (!visit_type_q_obj_BlockJobChangeOptions_base_members(v, (q_obj_BlockJobChangeOptions_base *)obj, errp)) { return false; } if (!BlockJobChangeOptions_mapper(obj, , errp)) { return false; } switch (tag) { case JOB_TYPE_MIRROR: return visit_type_BlockJobChangeOptionsMirror_members(v, >u.mirror, errp); case JOB_TYPE_COMMIT: break; ... (specifying member as descriminator works too, and I'm not going to change the interface of block-job-change, it's just
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 11.03.24 00:07, Peter Krempa wrote: On Thu, Mar 07, 2024 at 22:42:56 +0300, Vladimir Sementsov-Ogievskiy wrote: On 04.03.24 14:09, Peter Krempa wrote: On Mon, Mar 04, 2024 at 11:48:54 +0100, Kevin Wolf wrote: Am 28.02.2024 um 19:07 hat Vladimir Sementsov-Ogievskiy geschrieben: On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: [...] I only see the following differencies: 1. block-job-complete documents that it completes the job synchronously.. But looking at mirror code I see it just set s->should_complete = true, which will be then handled asynchronously.. So I doubt that documentation is correct. 2. block-job-complete will trigger final graph changes. block-job-cancel will not. Is [2] really useful? Seems yes: in case of some failure before starting migration target, we'd like to continue executing source. So, no reason to break block-graph in source, better keep it unchanged. But I think, such behavior better be setup by mirror-job start parameter, rather then by special option for cancel (or even compelete) command, useful only for mirror. Libvirt's API design was based on the qemu quirk and thus we allow users to do the decision after starting the job, thus anything you pick needs to allow us to do this at "completion" time. OK, this means that simply switch to an option at job-start is not enough. Libvirt can adapt to any option that will give us the above semantics (extra parameter at completion time, different completion command or extra command to switch job properties right before completion), but to be honest all of these feel like they would be more hassle than keeping 'block-job-cancel' around from qemu's side. I understand. But still, it would be good to finally resolve the duplication between job-* and block-job-* APIs. We can keep old quirk working for a long time even after making a new consistent API. -- Best regards, Vladimir
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 08.03.24 11:52, Kevin Wolf wrote: Am 07.03.2024 um 20:42 hat Vladimir Sementsov-Ogievskiy geschrieben: On 04.03.24 14:09, Peter Krempa wrote: On Mon, Mar 04, 2024 at 11:48:54 +0100, Kevin Wolf wrote: Am 28.02.2024 um 19:07 hat Vladimir Sementsov-Ogievskiy geschrieben: On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: [...] Is the job abstraction a failure? We have block-job- command since job- commandsince - block-job-set-speed 1.1 block-job-cancel1.1 job-cancel 3.0 block-job-pause 1.3 job-pause 3.0 block-job-resume1.3 job-resume 3.0 block-job-complete 1.3 job-complete3.0 block-job-dismiss 2.12job-dismiss 3.0 block-job-finalize 2.12job-finalize3.0 block-job-change8.2 query-block-jobs1.1 query-jobs [...] I consider these strictly optional. We don't really have strong reasons to deprecate these commands (they are just thin wrappers), and I think libvirt still uses block-job-* in some places. Libvirt uses 'block-job-cancel' because it has different semantics from 'job-cancel' which libvirt documented as the behaviour of the API that uses it. (Semantics regarding the expectation of what is written to the destination node at the point when the job is cancelled). That's the following semantics: # Note that if you issue 'block-job-cancel' after 'drive-mirror' has # indicated (via the event BLOCK_JOB_READY) that the source and # destination are synchronized, then the event triggered by this # command changes to BLOCK_JOB_COMPLETED, to indicate that the # mirroring has ended and the destination now has a point-in-time copy # tied to the time of the cancellation. Hmm. Looking at this, it looks for me, that should probably a 'block-job-complete" command (as leading to BLOCK_JOB_COMPLETED). Yes, it's just a different completion mode. Actually, what is the difference between block-job-complete and block-job-cancel(force=false) for mirror in ready state? I only see the following differencies: 1. 
block-job-complete documents that it completes the job synchronously.. But looking at mirror code I see it just set s->should_complete = true, which will be then handled asynchronously.. So I doubt that documentation is correct. 2. block-job-complete will trigger final graph changes. block-job-cancel will not. Is [2] really useful? Seems yes: in case of some failure before starting migration target, we'd like to continue executing source. So, no reason to break block-graph in source, better keep it unchanged. But I think, such behavior better be setup by mirror-job start parameter, rather then by special option for cancel (or even compelete) command, useful only for mirror. I'm not sure, having the option on the complete command makes more sense to me than having it in blockdev-mirror. I do see the challenge of representing this meaningfully in QAPI, though. Semantically it should be a union with job-specific options and only mirror adds the graph-changes option. But the union variant can't be directly selected from another option - instead we have a job ID, and the variant is the job type of the job with this ID. We already have such command: block-job-change. Which has id and type parameters, so user have to pass both, to identify the job itself and pick corresponding variant of the union type. That would be good to somehow teach QAPI to get the type automatically from the job itself... Probably, best way is to utilize the new "change" command? So, to avoid final graph change, user should either set graph-change=false in blockdev-mirror, or just call job-change(graph-change=false) before job-complete? Still having the option for job-complete looks nicer. But right, we either have to add type parameter like in block-job-change, or add a common option, which would be irrelevant to some jobs. Practically speaking, we would probably indeed end up with an optional field in the existing completion command. 
So, what about the following substitution for block-job-cancel: block-job-cancel(force=true) --> use job-cancel block-job-cancel(force=false) for backup, stream, commit --> use job-cancel block-job-cancel(force=false) for mirror in ready mode --> instead, use block-job-complete. If you don't need final graph modification which mirror job normally does, use graph-change=false parameter for blockdev-mirror command. Apart from the open question where to put the option, agreed. (I can hardly remember, that we've already discussed something like this long time ago, but I don't remember the results) I think everyone agreed that this is how things should be, and nobody did anything to achieve it. I also a bit unsure about ac
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 04.03.24 14:09, Peter Krempa wrote: On Mon, Mar 04, 2024 at 11:48:54 +0100, Kevin Wolf wrote: Am 28.02.2024 um 19:07 hat Vladimir Sementsov-Ogievskiy geschrieben: On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: [...] Is the job abstraction a failure? We have block-job- command since job- commandsince - block-job-set-speed 1.1 block-job-cancel1.1 job-cancel 3.0 block-job-pause 1.3 job-pause 3.0 block-job-resume1.3 job-resume 3.0 block-job-complete 1.3 job-complete3.0 block-job-dismiss 2.12job-dismiss 3.0 block-job-finalize 2.12job-finalize3.0 block-job-change8.2 query-block-jobs1.1 query-jobs [...] I consider these strictly optional. We don't really have strong reasons to deprecate these commands (they are just thin wrappers), and I think libvirt still uses block-job-* in some places. Libvirt uses 'block-job-cancel' because it has different semantics from 'job-cancel' which libvirt documented as the behaviour of the API that uses it. (Semantics regarding the expectation of what is written to the destination node at the point when the job is cancelled). That's the following semantics: # Note that if you issue 'block-job-cancel' after 'drive-mirror' has # indicated (via the event BLOCK_JOB_READY) that the source and # destination are synchronized, then the event triggered by this # command changes to BLOCK_JOB_COMPLETED, to indicate that the # mirroring has ended and the destination now has a point-in-time copy # tied to the time of the cancellation. Hmm. Looking at this, it looks for me, that should probably a 'block-job-complete" command (as leading to BLOCK_JOB_COMPLETED). Actually, what is the difference between block-job-complete and block-job-cancel(force=false) for mirror in ready state? I only see the following differencies: 1. block-job-complete documents that it completes the job synchronously.. But looking at mirror code I see it just set s->should_complete = true, which will be then handled asynchronously.. 
So I doubt that documentation is correct. 2. block-job-complete will trigger final graph changes. block-job-cancel will not. Is [2] really useful? Seems yes: in case of some failure before starting migration target, we'd like to continue executing source. So, no reason to break block-graph in source, better keep it unchanged. But I think, such behavior better be set up by a mirror-job start parameter, rather than by a special option for cancel (or even complete) command, useful only for mirror. So, what about the following substitution for block-job-cancel: block-job-cancel(force=true) --> use job-cancel block-job-cancel(force=false) for backup, stream, commit --> use job-cancel block-job-cancel(force=false) for mirror in ready mode --> instead, use block-job-complete. If you don't need final graph modification which mirror job normally does, use graph-change=false parameter for blockdev-mirror command. (I can hardly remember, that we've already discussed something like this long time ago, but I don't remember the results) I am also a bit unsure about active commit soft-cancelling semantics. Is it actually useful? If yes, block-commit command will need similar option. -- Best regards, Vladimir
Re: [PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state
On 07.03.24 12:46, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Signed-off-by: Vladimir Sementsov-Ogievskiy --- system/qdev-monitor.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 9febb743f1..cf7481e416 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -877,13 +877,20 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) object_unref(OBJECT(dev)); } -static DeviceState *find_device_state(const char *id, Error **errp) +/* + * Note that creating new APIs using error classes other than GenericError is + * not recommended. Set use_generic_error=true for new interfaces. + */ +static DeviceState *find_device_state(const char *id, bool use_generic_error, + Error **errp) { Object *obj = object_resolve_path_at(qdev_get_peripheral(), id); DeviceState *dev; if (!obj) { -error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, +error_set(errp, + (use_generic_error ? + ERROR_CLASS_GENERIC_ERROR : ERROR_CLASS_DEVICE_NOT_FOUND), "Device '%s' not found", id); return NULL; } @@ -947,7 +954,7 @@ void qdev_unplug(DeviceState *dev, Error **errp) void qmp_device_del(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, errp); +DeviceState *dev = find_device_state(id, false, errp); if (dev != NULL) { if (dev->pending_deleted_event && (dev->pending_deleted_expires_ms == 0 || @@ -1067,7 +1074,7 @@ BlockBackend *blk_by_qdev_id(const char *id, Error **errp) GLOBAL_STATE_CODE(); -dev = find_device_state(id, errp); +dev = find_device_state(id, false, errp); if (dev == NULL) { return NULL; } I appreciate the attempt to curb the spread of DeviceNotFound errors. Two issues: * Copy-pasting find_device_state() with a false argument is an easy error to make. * Most uses of find_device_state() are via blk_by_qdev_id() and qmp_get_blk(). Any new uses of qemu_get_blk() will still produce DeviceNotFound. Hm. But what to do? 
- rename all three functions to FOO_base(), and add a boolean parameter - add FOO() wrappers, passing true (use generic error) - add FOO_deprecated() wrappers, passing false (use device not found error) - change existing callers to use FOO_deprecated() Still, new generic-error-based blk_by_qdev_id() and qmp_get_blk() will be unused.. That seems too ugly for me. And it doesn't make it 100% certain that nobody will simply duplicate a call to the _deprecated() function... Maybe it's better not to care for now (and continue using device-not-found for new APIs that reuse find_device_state()), and just declare the deprecation of ERROR_CLASS_DEVICE_NOT_FOUND? And drop its usage everywhere after a 3-release deprecation cycle. Or do we want to deprecate everything except for generic-error, and deprecate the error/class field in the qmp return value altogether? -- Best regards, Vladimir
Re: [PATCH v2 6/6] qapi: introduce CONFIG_READ event
On 07.03.24 13:01, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Send a new event when guest reads virtio-pci config after virtio_notify_config() call. That's useful to check that guest fetched modified config, for example after resizing disk backend. Signed-off-by: Vladimir Sementsov-Ogievskiy [...] diff --git a/qapi/qdev.json b/qapi/qdev.json index 6ece164172..ffc5e3be18 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -179,3 +179,29 @@ { 'command': 'device-sync-config', 'features': [ 'unstable' ], 'data': {'id': 'str'} } + +## +# @VIRTIO_CONFIG_READ: +# +# Emitted whenever guest reads virtio device config after config change. Let's not abbreviate "configuration" to "config". Oops, me again. +# +# @device: device name +# +# @path: device path +# +# Features: +# +# @unstable: The event is experimental. +# +# Since: 9.0 +# +# Example: +# +# <- { "event": "VIRTIO_CONFIG_READ", +# "data": { "device": "virtio-net-pci-0", +#"path": "/machine/peripheral/virtio-net-pci-0" }, +# "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +## As for PATCH 4, I'd like to see some guidance on intended use. Will do +{ 'event': 'VIRTIO_CONFIG_READ', + 'features': [ 'unstable' ], + 'data': { '*device': 'str', 'path': 'str' } } [...] Thanks for reviewing my patches! -- Best regards, Vladimir
Re: [PATCH v2 5/6] qapi: device-sync-config: check runstate
On 07.03.24 12:57, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Command result is racy if allow it during migration. Let's allow the sync only in RUNNING state. Signed-off-by: Vladimir Sementsov-Ogievskiy If I understand this correctly, the previous commit introduces a race, and this one fixes it. Sure, I will merge it. I recently argued against such things, bad for me to break the rule myself) We normally avoid such temporary bugs. When avoidance is impractical, we point them out in commit message and FIXME comment. >> --- include/sysemu/runstate.h | 1 + system/qdev-monitor.c | 27 ++- system/runstate.c | 5 + 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 0117d243c4..296af52322 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -5,6 +5,7 @@ #include "qemu/notify.h" bool runstate_check(RunState state); +const char *current_run_state_str(void); void runstate_set(RunState new_state); RunState runstate_get(void); bool runstate_is_running(void); diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index e3107a12d7..b83b5d23c9 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -23,6 +23,7 @@ #include "monitor/monitor.h" #include "monitor/qdev.h" #include "sysemu/arch_init.h" +#include "sysemu/runstate.h" #include "qapi/error.h" #include "qapi/qapi-commands-qdev.h" #include "qapi/qmp/dispatch.h" @@ -983,7 +984,31 @@ int qdev_sync_config(DeviceState *dev, Error **errp) void qmp_device_sync_config(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, true, errp); +MigrationState *s = migrate_get_current(); +DeviceState *dev; + +/* + * During migration there is a race between syncing`config and migrating it, + * so let's just not allow it. + * + * Moreover, let's not rely on setting up interrupts in paused state, which + * may be a part of migration process. Wrap your comment lines around column 70, please. OK. 
I never remember this recommendation.. Probably we'd better have a check in checkpatch for it + */ + +if (migration_is_running(s->state)) { +error_setg(errp, "Config synchronization is not allowed " + "during migration."); +return; +} + +if (!runstate_is_running()) { +error_setg(errp, "Config synchronization allowed only in '%s' state, " + "current state is '%s'", RunState_str(RUN_STATE_RUNNING), + current_run_state_str()); +return; +} + +dev = find_device_state(id, true, errp); if (!dev) { return; } diff --git a/system/runstate.c b/system/runstate.c index d6ab860eca..8fd89172ae 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -189,6 +189,11 @@ bool runstate_check(RunState state) return current_run_state == state; } +const char *current_run_state_str(void) +{ +return RunState_str(current_run_state); +} + static void runstate_init(void) { const RunStateTransition *p; -- Best regards, Vladimir
Re: [PATCH v2 4/6] qapi: introduce device-sync-config
On 07.03.24 12:54, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Add command to sync config from vhost-user backend to the device. It may be helpful when VHOST_USER_SLAVE_CONFIG_CHANGE_MSG failed or not triggered interrupt to the guest or just not available (not supported by vhost-user server). Signed-off-by: Vladimir Sementsov-Ogievskiy --- hw/block/vhost-user-blk.c | 27 --- hw/virtio/virtio-pci.c| 9 + include/hw/qdev-core.h| 3 +++ qapi/qdev.json| 17 + system/qdev-monitor.c | 23 +++ 5 files changed, 72 insertions(+), 7 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 9e6bbc6950..2f301f380c 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -88,27 +88,39 @@ static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config) s->blkcfg.wce = blkcfg->wce; } +static int vhost_user_blk_sync_config(DeviceState *dev, Error **errp) +{ +int ret; +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserBlk *s = VHOST_USER_BLK(vdev); + +ret = vhost_dev_get_config(>dev, (uint8_t *)>blkcfg, + vdev->config_len, errp); +if (ret < 0) { +return ret; +} + +memcpy(vdev->config, >blkcfg, vdev->config_len); +virtio_notify_config(vdev); + +return 0; +} + static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) { int ret; -VirtIODevice *vdev = dev->vdev; -VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); Error *local_err = NULL; if (!dev->started) { return 0; } -ret = vhost_dev_get_config(dev, (uint8_t *)>blkcfg, - vdev->config_len, _err); +ret = vhost_user_blk_sync_config(DEVICE(dev->vdev), _err); if (ret < 0) { error_report_err(local_err); return ret; } -memcpy(dev->vdev->config, >blkcfg, vdev->config_len); -virtio_notify_config(dev->vdev); - return 0; } @@ -576,6 +588,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vhost_user_blk_properties); dc->vmsd = _vhost_user_blk; +dc->sync_config = vhost_user_blk_sync_config; 
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); vdc->realize = vhost_user_blk_device_realize; vdc->unrealize = vhost_user_blk_device_unrealize; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1a7039fb0c..1197341a3c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2318,6 +2318,14 @@ static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) vpciklass->parent_dc_realize(qdev, errp); } +static int virtio_pci_sync_config(DeviceState *dev, Error **errp) +{ +VirtIOPCIProxy *proxy = VIRTIO_PCI(dev); +VirtIODevice *vdev = virtio_bus_get_device(>bus); + +return qdev_sync_config(DEVICE(vdev), errp); +} + static void virtio_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -2334,6 +2342,7 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data) device_class_set_parent_realize(dc, virtio_pci_dc_realize, >parent_dc_realize); rc->phases.hold = virtio_pci_bus_reset_hold; +dc->sync_config = virtio_pci_sync_config; } static const TypeInfo virtio_pci_info = { diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 9228e96c87..87135bdcdf 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -95,6 +95,7 @@ typedef void (*DeviceUnrealize)(DeviceState *dev); typedef void (*DeviceReset)(DeviceState *dev); typedef void (*BusRealize)(BusState *bus, Error **errp); typedef void (*BusUnrealize)(BusState *bus); +typedef int (*DeviceSyncConfig)(DeviceState *dev, Error **errp); /** * struct DeviceClass - The base class for all devices. 
@@ -162,6 +163,7 @@ struct DeviceClass { DeviceReset reset; DeviceRealize realize; DeviceUnrealize unrealize; +DeviceSyncConfig sync_config; /** * @vmsd: device state serialisation description for @@ -546,6 +548,7 @@ bool qdev_hotplug_allowed(DeviceState *dev, Error **errp); */ HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev); void qdev_unplug(DeviceState *dev, Error **errp); +int qdev_sync_config(DeviceState *dev, Error **errp); void qdev_simple_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void qdev_machine_creation_done(void); diff --git a/qapi/qdev.json b/qapi/qdev.json index 32ffaee644..6ece164172 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -162,3 +162,20 @@ ## { 'event': 'DEVICE_UNPLUG_GUEST_ERROR', 'data': { '*
Re: [PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state
On 07.03.24 12:46, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Signed-off-by: Vladimir Sementsov-Ogievskiy --- system/qdev-monitor.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 9febb743f1..cf7481e416 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -877,13 +877,20 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) object_unref(OBJECT(dev)); } -static DeviceState *find_device_state(const char *id, Error **errp) +/* + * Note that creating new APIs using error classes other than GenericError is + * not recommended. Set use_generic_error=true for new interfaces. + */ +static DeviceState *find_device_state(const char *id, bool use_generic_error, + Error **errp) { Object *obj = object_resolve_path_at(qdev_get_peripheral(), id); DeviceState *dev; if (!obj) { -error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, +error_set(errp, + (use_generic_error ? + ERROR_CLASS_GENERIC_ERROR : ERROR_CLASS_DEVICE_NOT_FOUND), "Device '%s' not found", id); return NULL; } @@ -947,7 +954,7 @@ void qdev_unplug(DeviceState *dev, Error **errp) void qmp_device_del(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, errp); +DeviceState *dev = find_device_state(id, false, errp); if (dev != NULL) { if (dev->pending_deleted_event && (dev->pending_deleted_expires_ms == 0 || @@ -1067,7 +1074,7 @@ BlockBackend *blk_by_qdev_id(const char *id, Error **errp) GLOBAL_STATE_CODE(); -dev = find_device_state(id, errp); +dev = find_device_state(id, false, errp); if (dev == NULL) { return NULL; } I appreciate the attempt to curb the spread of DeviceNotFound errors. Two issues: * Copy-pasting find_device_state() with a false argument is an easy error to make. * Most uses of find_device_state() are via blk_by_qdev_id() and qmp_get_blk(). Any new uses of qemu_get_blk() will still produce DeviceNotFound. Hmm. Hmm. Right. Wait a bit, I can make the change stricter. 
Could you still explain (or give a link) why and when we decided to use only GenericError? I think having different "error-codes" is a good thing; why are we trying to get rid of it? -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 06.03.24 16:44, Fiona Ebner wrote: Am 29.02.24 um 13:47 schrieb Fiona Ebner: Am 29.02.24 um 12:48 schrieb Vladimir Sementsov-Ogievskiy: On 29.02.24 13:11, Fiona Ebner wrote: The iotest creates a new target image for each incremental sync which only records the diff relative to the previous mirror and those diff images are later rebased onto each other to get the full picture. Thus, it can be that a previous mirror job (not just background process or previous write) already copied a cluster, and in particular, copied it to a different target! Aha understand. For simplicity, let's consider case, when source "cluster size" = "job cluster size" = "bitmap granularity" = "target cluster size". Which types of clusters we should consider, when we want to handle guest write? 1. Clusters, that should be copied by background process These are dirty clusters from user-given bitmap, or if we do a full-disk mirror, all clusters, not yet copied by background process. For such clusters we simply ignore the unaligned write. We can even ignore the aligned write too: less disturbing the guest by delays. Since do_sync_target_write() currently doesn't ignore aligned writes, I wouldn't change it. Of course they can count towards the "done_bitmap" you propose below. 2. Clusters, already copied by background process during this mirror job and not dirtied by guest since this time. For such clusters we are safe to do unaligned write, as target cluster must be allocated. Right. 3. Clusters, not marked initially by dirty bitmap. What to do with them? We can't do unaligned write. I see two variants: - do additional read from source, to fill the whole cluster, which seems a bit too heavy Yes, I'd rather only do that as a last resort. - just mark the cluster as dirty for background job. So we behave like in "background" mode. But why not? 
The maximum count of such "hacks" is limited to number of "clear" clusters at start of mirror job, which means that we don't seriously affect the convergence. Mirror is guaranteed to converge anyway. And the whole sense of "write-blocking" mode is to have a guaranteed convergence. What do you think? It could lead to a lot of flips between job->actively_synced == true and == false. AFAIU, currently, we only switch back from true to false when an error happens. While I don't see a concrete issue with it, at least it might be unexpected to users, so it better be documented. I'll try going with this approach, thanks! These flips are actually a problem. When using live-migration with disk mirroring, it's good that an actively synced image stays actively synced. Otherwise, migration could finish at an inconvenient time and try to inactivate the block device while mirror still got something to do which would lead to an assertion failure [0]. Hmm right. So, when mirror is actively-synced, we have to read the whole cluster from source to make an aligned write on target. The IO test added by this series is what uses the possibility to sync to "diff images" which contain only the delta. In production, we are only syncing to a previously mirrored target image. Non-aligned writes are not an issue later like with a diff image. (Even if the initial mirroring happened via ZFS replication outside of QEMU). So copy-mode=write-blocking would work fine for our use case, but if I go with the "mark clusters for unaligned writes dirty"-approach, it would not work fine anymore. Should I rather just document the limitation for the combination "target is a diff image" and copy-mode=write-blocking? Of course, simply documenting the limitation is better than implementing a new feature, if you don't need the feature for production) I'd still add the check for the granularity and target cluster size. Check is good too. 
While also only needed for diff images, it would allow using background mode safely for those. Best Regards, Fiona [0]: https://lore.kernel.org/qemu-devel/1db7f571-cb7f-c293-04cc-cd856e060...@proxmox.com/ -- Best regards, Vladimir
[PATCH v2 0/6] vhost-user-blk: live resize additional APIs
v2: - now based on master - drop x- prefixes, still keep new APIs "unstable" Also: 01: add a-b by Raphael add note about removed comment "valid for resize only" 02-03: new patches, following review of old "02", which is now 04 04: use GenericError wording keep short name device-sync-config still 06: rename event to VIRTIO_CONFIG_READ === In vhost-user protocol we have VHOST_USER_BACKEND_CONFIG_CHANGE_MSG, which backend may send to notify Qemu, that we should re-read the config, and notify the guest. Still that's not always convenient: backend may not support this message. Also, having QMP command to force config sync is more reliable than waiting for notification from external program. It also may be helpful for debug/restore: if we have changed disk size, but guest doesn't see that, it's good to have a separate QMP command to trigger resync of the config. So, the series proposes two experimental APIs: 1. device-sync-config command, to trigger config synchronization 2. CONFIG_READ event, which notify management tool that guest read the updated config. Of course, that can't guarantee that the guest correctly handled the updated config, but it's still better than nothing: for sure guest will not show new disk size until it read the updated config. So, management tool may wait for this event to report success to the user. 
Vladimir Sementsov-Ogievskiy (6): vhost-user-blk: simplify and fix vhost_user_blk_handle_config_change qdev-monitor: fix error message in find_device_state() qdev-monitor: add option to report GenericError from find_device_state qapi: introduce device-sync-config qapi: device-sync-config: check runstate qapi: introduce CONFIG_READ event hw/block/vhost-user-blk.c | 32 +++--- hw/virtio/virtio-pci.c| 18 ++ include/hw/qdev-core.h| 3 ++ include/monitor/qdev.h| 2 ++ include/sysemu/runstate.h | 1 + monitor/monitor.c | 1 + qapi/qdev.json| 43 stubs/qdev.c | 6 system/qdev-monitor.c | 71 --- system/runstate.c | 5 +++ 10 files changed, 165 insertions(+), 17 deletions(-) -- 2.34.1
[PATCH v2 5/6] qapi: device-sync-config: check runstate
Command result is racy if we allow it during migration. Let's allow the sync only in RUNNING state. Signed-off-by: Vladimir Sementsov-Ogievskiy --- include/sysemu/runstate.h | 1 + system/qdev-monitor.c | 27 ++- system/runstate.c | 5 + 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 0117d243c4..296af52322 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -5,6 +5,7 @@ #include "qemu/notify.h" bool runstate_check(RunState state); +const char *current_run_state_str(void); void runstate_set(RunState new_state); RunState runstate_get(void); bool runstate_is_running(void); diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index e3107a12d7..b83b5d23c9 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -23,6 +23,7 @@ #include "monitor/monitor.h" #include "monitor/qdev.h" #include "sysemu/arch_init.h" +#include "sysemu/runstate.h" #include "qapi/error.h" #include "qapi/qapi-commands-qdev.h" #include "qapi/qmp/dispatch.h" @@ -983,7 +984,31 @@ int qdev_sync_config(DeviceState *dev, Error **errp) void qmp_device_sync_config(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, true, errp); +MigrationState *s = migrate_get_current(); +DeviceState *dev; + +/* + * During migration there is a race between syncing config and migrating it, + * so let's just not allow it. + * + * Moreover, let's not rely on setting up interrupts in paused state, which + * may be a part of migration process. 
+ */ + +if (migration_is_running(s->state)) { +error_setg(errp, "Config synchronization is not allowed " + "during migration."); +return; +} + +if (!runstate_is_running()) { +error_setg(errp, "Config synchronization allowed only in '%s' state, " + "current state is '%s'", RunState_str(RUN_STATE_RUNNING), + current_run_state_str()); +return; +} + +dev = find_device_state(id, true, errp); if (!dev) { return; } diff --git a/system/runstate.c b/system/runstate.c index d6ab860eca..8fd89172ae 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -189,6 +189,11 @@ bool runstate_check(RunState state) return current_run_state == state; } +const char *current_run_state_str(void) +{ +return RunState_str(current_run_state); +} + static void runstate_init(void) { const RunStateTransition *p; -- 2.34.1
[PATCH v2 4/6] qapi: introduce device-sync-config
Add command to sync config from vhost-user backend to the device. It may be helpful when VHOST_USER_SLAVE_CONFIG_CHANGE_MSG failed or not triggered interrupt to the guest or just not available (not supported by vhost-user server). Signed-off-by: Vladimir Sementsov-Ogievskiy --- hw/block/vhost-user-blk.c | 27 --- hw/virtio/virtio-pci.c| 9 + include/hw/qdev-core.h| 3 +++ qapi/qdev.json| 17 + system/qdev-monitor.c | 23 +++ 5 files changed, 72 insertions(+), 7 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 9e6bbc6950..2f301f380c 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -88,27 +88,39 @@ static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config) s->blkcfg.wce = blkcfg->wce; } +static int vhost_user_blk_sync_config(DeviceState *dev, Error **errp) +{ +int ret; +VirtIODevice *vdev = VIRTIO_DEVICE(dev); +VHostUserBlk *s = VHOST_USER_BLK(vdev); + +ret = vhost_dev_get_config(>dev, (uint8_t *)>blkcfg, + vdev->config_len, errp); +if (ret < 0) { +return ret; +} + +memcpy(vdev->config, >blkcfg, vdev->config_len); +virtio_notify_config(vdev); + +return 0; +} + static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) { int ret; -VirtIODevice *vdev = dev->vdev; -VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); Error *local_err = NULL; if (!dev->started) { return 0; } -ret = vhost_dev_get_config(dev, (uint8_t *)>blkcfg, - vdev->config_len, _err); +ret = vhost_user_blk_sync_config(DEVICE(dev->vdev), _err); if (ret < 0) { error_report_err(local_err); return ret; } -memcpy(dev->vdev->config, >blkcfg, vdev->config_len); -virtio_notify_config(dev->vdev); - return 0; } @@ -576,6 +588,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vhost_user_blk_properties); dc->vmsd = _vhost_user_blk; +dc->sync_config = vhost_user_blk_sync_config; set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); vdc->realize = vhost_user_blk_device_realize; 
vdc->unrealize = vhost_user_blk_device_unrealize; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1a7039fb0c..1197341a3c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2318,6 +2318,14 @@ static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) vpciklass->parent_dc_realize(qdev, errp); } +static int virtio_pci_sync_config(DeviceState *dev, Error **errp) +{ +VirtIOPCIProxy *proxy = VIRTIO_PCI(dev); +VirtIODevice *vdev = virtio_bus_get_device(>bus); + +return qdev_sync_config(DEVICE(vdev), errp); +} + static void virtio_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -2334,6 +2342,7 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data) device_class_set_parent_realize(dc, virtio_pci_dc_realize, >parent_dc_realize); rc->phases.hold = virtio_pci_bus_reset_hold; +dc->sync_config = virtio_pci_sync_config; } static const TypeInfo virtio_pci_info = { diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 9228e96c87..87135bdcdf 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -95,6 +95,7 @@ typedef void (*DeviceUnrealize)(DeviceState *dev); typedef void (*DeviceReset)(DeviceState *dev); typedef void (*BusRealize)(BusState *bus, Error **errp); typedef void (*BusUnrealize)(BusState *bus); +typedef int (*DeviceSyncConfig)(DeviceState *dev, Error **errp); /** * struct DeviceClass - The base class for all devices. 
@@ -162,6 +163,7 @@ struct DeviceClass { DeviceReset reset; DeviceRealize realize; DeviceUnrealize unrealize; +DeviceSyncConfig sync_config; /** * @vmsd: device state serialisation description for @@ -546,6 +548,7 @@ bool qdev_hotplug_allowed(DeviceState *dev, Error **errp); */ HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev); void qdev_unplug(DeviceState *dev, Error **errp); +int qdev_sync_config(DeviceState *dev, Error **errp); void qdev_simple_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void qdev_machine_creation_done(void); diff --git a/qapi/qdev.json b/qapi/qdev.json index 32ffaee644..6ece164172 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -162,3 +162,20 @@ ## { 'event': 'DEVICE_UNPLUG_GUEST_ERROR', 'data': { '*device': 'str', 'path': 'str' } } + +## +# @device-sync-config: +# +# Synchronize config from backend to the guest. +# +# @id: the device's ID or QOM path +
[PATCH v2 6/6] qapi: introduce CONFIG_READ event
Send a new event when guest reads virtio-pci config after virtio_notify_config() call. That's useful to check that guest fetched modified config, for example after resizing disk backend. Signed-off-by: Vladimir Sementsov-Ogievskiy --- hw/virtio/virtio-pci.c | 9 + include/monitor/qdev.h | 2 ++ monitor/monitor.c | 1 + qapi/qdev.json | 26 ++ stubs/qdev.c | 6 ++ system/qdev-monitor.c | 6 ++ 6 files changed, 50 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1197341a3c..fab49a2410 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -23,6 +23,7 @@ #include "hw/boards.h" #include "hw/virtio/virtio.h" #include "migration/qemu-file-types.h" +#include "monitor/qdev.h" #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" @@ -530,6 +531,10 @@ static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr, } addr -= config; +if (vdev->generation > 0) { +qdev_virtio_config_read_event(DEVICE(proxy)); +} + switch (size) { case 1: val = virtio_config_readb(vdev, addr); @@ -1735,6 +1740,10 @@ static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr, return UINT64_MAX; } +if (vdev->generation > 0) { +qdev_virtio_config_read_event(DEVICE(proxy)); +} + switch (size) { case 1: val = virtio_config_modern_readb(vdev, addr); diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h index 1d57bf6577..fc9a834dca 100644 --- a/include/monitor/qdev.h +++ b/include/monitor/qdev.h @@ -36,4 +36,6 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, */ const char *qdev_set_id(DeviceState *dev, char *id, Error **errp); +void qdev_virtio_config_read_event(DeviceState *dev); + #endif diff --git a/monitor/monitor.c b/monitor/monitor.c index 01ede1babd..5b06146503 100644 --- a/monitor/monitor.c +++ b/monitor/monitor.c @@ -316,6 +316,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = { [QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS }, [QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 
1000 * SCALE_MS }, [QAPI_EVENT_HV_BALLOON_STATUS_REPORT] = { 1000 * SCALE_MS }, +[QAPI_EVENT_VIRTIO_CONFIG_READ] = { 300 * SCALE_MS }, }; /* diff --git a/qapi/qdev.json b/qapi/qdev.json index 6ece164172..ffc5e3be18 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -179,3 +179,29 @@ { 'command': 'device-sync-config', 'features': [ 'unstable' ], 'data': {'id': 'str'} } + +## +# @VIRTIO_CONFIG_READ: +# +# Emitted whenever guest reads virtio device config after config change. +# +# @device: device name +# +# @path: device path +# +# Features: +# +# @unstable: The event is experimental. +# +# Since: 9.0 +# +# Example: +# +# <- { "event": "VIRTIO_CONFIG_READ", +# "data": { "device": "virtio-net-pci-0", +#"path": "/machine/peripheral/virtio-net-pci-0" }, +# "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +## +{ 'event': 'VIRTIO_CONFIG_READ', + 'features': [ 'unstable' ], + 'data': { '*device': 'str', 'path': 'str' } } diff --git a/stubs/qdev.c b/stubs/qdev.c index 6869f6f90a..ab6c4afe0b 100644 --- a/stubs/qdev.c +++ b/stubs/qdev.c @@ -26,3 +26,9 @@ void qapi_event_send_device_unplug_guest_error(const char *device, { /* Nothing to do. */ } + +void qapi_event_send_virtio_config_read(const char *device, +const char *path) +{ +/* Nothing to do. */ +} diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index b83b5d23c9..29b8242a2d 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -26,6 +26,7 @@ #include "sysemu/runstate.h" #include "qapi/error.h" #include "qapi/qapi-commands-qdev.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/qmp/dispatch.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qerror.h" @@ -1206,3 +1207,8 @@ bool qmp_command_available(const QmpCommand *cmd, Error **errp) } return true; } + +void qdev_virtio_config_read_event(DeviceState *dev) +{ +qapi_event_send_virtio_config_read(dev->id, dev->canonical_path); +} -- 2.34.1
[PATCH v2 1/6] vhost-user-blk: simplify and fix vhost_user_blk_handle_config_change
Let's not care about what was changed and update the whole config, reasons: 1. config->geometry should be updated together with capacity, so we fix a bug. 2. Vhost-user protocol doesn't say anything about config change limitation. Silent ignore of changes doesn't seem to be correct. 3. vhost-user-vsock reads the whole config 4. on realize we don't do any checks on retrieved config, so no reason to care here Comment "valid for resize only" exists since the introduction of the whole hw/block/vhost-user-blk.c in commit 00343e4b54ba0685e9ebe928ec5713b0cf7f1d1c "vhost-user-blk: introduce a new vhost-user-blk host device", seems it was just an extra limitation. Also, let's notify guest unconditionally: 1. So does vhost-user-vsock 2. We are going to reuse the functionality in new cases when we do want to notify the guest unconditionally. So, no reason to create extra branches in the logic. Signed-off-by: Vladimir Sementsov-Ogievskiy Acked-by: Raphael Norwitz --- hw/block/vhost-user-blk.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 6a856ad51a..9e6bbc6950 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -91,7 +91,6 @@ static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config) static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) { int ret; -struct virtio_blk_config blkcfg; VirtIODevice *vdev = dev->vdev; VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); Error *local_err = NULL; @@ -100,19 +99,15 @@ static int vhost_user_blk_handle_config_change(struct vhost_dev *dev) return 0; } -ret = vhost_dev_get_config(dev, (uint8_t *)&blkcfg, +ret = vhost_dev_get_config(dev, (uint8_t *)&s->blkcfg, vdev->config_len, &local_err); if (ret < 0) { error_report_err(local_err); return ret; } -/* valid for resize only */ -if (blkcfg.capacity != s->blkcfg.capacity) { -s->blkcfg.capacity = blkcfg.capacity; -memcpy(dev->vdev->config, &s->blkcfg, vdev->config_len); 
-virtio_notify_config(dev->vdev); -} +memcpy(dev->vdev->config, &s->blkcfg, vdev->config_len); +virtio_notify_config(dev->vdev); return 0; } -- 2.34.1
[PATCH v2 3/6] qdev-monitor: add option to report GenericError from find_device_state
Signed-off-by: Vladimir Sementsov-Ogievskiy --- system/qdev-monitor.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index 9febb743f1..cf7481e416 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -877,13 +877,20 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error **errp) object_unref(OBJECT(dev)); } -static DeviceState *find_device_state(const char *id, Error **errp) +/* + * Note that creating new APIs using error classes other than GenericError is + * not recommended. Set use_generic_error=true for new interfaces. + */ +static DeviceState *find_device_state(const char *id, bool use_generic_error, + Error **errp) { Object *obj = object_resolve_path_at(qdev_get_peripheral(), id); DeviceState *dev; if (!obj) { -error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, +error_set(errp, + (use_generic_error ? + ERROR_CLASS_GENERIC_ERROR : ERROR_CLASS_DEVICE_NOT_FOUND), "Device '%s' not found", id); return NULL; } @@ -947,7 +954,7 @@ void qdev_unplug(DeviceState *dev, Error **errp) void qmp_device_del(const char *id, Error **errp) { -DeviceState *dev = find_device_state(id, errp); +DeviceState *dev = find_device_state(id, false, errp); if (dev != NULL) { if (dev->pending_deleted_event && (dev->pending_deleted_expires_ms == 0 || @@ -1067,7 +1074,7 @@ BlockBackend *blk_by_qdev_id(const char *id, Error **errp) GLOBAL_STATE_CODE(); -dev = find_device_state(id, errp); +dev = find_device_state(id, false, errp); if (dev == NULL) { return NULL; } -- 2.34.1
[PATCH v2 2/6] qdev-monitor: fix error message in find_device_state()
This "hotpluggable" here is misleading. Actually we check whether the object is a device or not. Let's drop the word. Suggested-by: Markus Armbruster Signed-off-by: Vladimir Sementsov-Ogievskiy --- system/qdev-monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index a13db763e5..9febb743f1 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -890,7 +890,7 @@ static DeviceState *find_device_state(const char *id, Error **errp) dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE); if (!dev) { -error_setg(errp, "%s is not a hotpluggable device", id); +error_setg(errp, "%s is not a device", id); return NULL; } -- 2.34.1
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 01.03.24 18:14, Fiona Ebner wrote: Am 01.03.24 um 16:02 schrieb Vladimir Sementsov-Ogievskiy: On 01.03.24 17:52, Fiona Ebner wrote: Am 01.03.24 um 15:14 schrieb Vladimir Sementsov-Ogievskiy: As we already understood, (block-)job-api needs some spring-cleaning. Unfortunately I don't have much time on it, but still I decided to start from finally depreacting block-job-* API and moving to job-*.. Probably bitmap/bitmap-mode/sync APIs also need some optimization, keeping in mind new block-dirty-bitmap-merge api. So, what I could advice in this situation for newc interfaces: 1. be minimalistic 2. add `x-` prefix when unsure So, following these two rules, what about x-bitmap field, which may be combined only with 'full' mode, and do what you need? AFAIU, it should rather be marked as @unstable in QAPI [0]? Then it doesn't need to be renamed if it becomes stable later. Right, unstable feature is needed, using "x-" is optional. Recent discussion about it was in my "vhost-user-blk: live resize additional APIs" series: https://patchew.org/QEMU/20231006202045.1161543-1-vsement...@yandex-team.ru/20231006202045.1161543-5-vsement...@yandex-team.ru/ Following it, I think it's OK to not care anymore with "x-" prefixes, and rely on unstable feature. Thanks for the confirmation! I'll go without the prefix in the name then. About documentation: actually, I never liked that we use for backup job "MirrorSyncMode". Now it looks more like "BackupSyncMode", having two values supported only by backup. I'm also unsure how mode=full=some_bitmap differs from mode=bitmap=some_bitmap.. With the current patches, it was an error to specify @bitmap for other modes than 'incremental' and 'bitmap'. Current documentation says: # @bitmap: The name of a dirty bitmap to use. Must be present if sync # is "bitmap" or "incremental". Can be present if sync is "full" # or "top". Must not be present otherwise. # (Since 2.4 (drive-backup), 3.1 (blockdev-backup)) This is for backup. 
The documentation (and behavior) for @bitmap added by these patches for mirror is different ;) I meant backup in "I'm also unsure", just as one more point not consider backup-bitmap-API as a prototype for mirror-bitmap-API. Best Regards, Fiona -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 01.03.24 17:52, Fiona Ebner wrote: Am 01.03.24 um 15:14 schrieb Vladimir Sementsov-Ogievskiy: As we already understood, (block-)job-api needs some spring-cleaning. Unfortunately I don't have much time on it, but still I decided to start from finally depreacting block-job-* API and moving to job-*.. Probably bitmap/bitmap-mode/sync APIs also need some optimization, keeping in mind new block-dirty-bitmap-merge api. So, what I could advice in this situation for newc interfaces: 1. be minimalistic 2. add `x-` prefix when unsure So, following these two rules, what about x-bitmap field, which may be combined only with 'full' mode, and do what you need? AFAIU, it should rather be marked as @unstable in QAPI [0]? Then it doesn't need to be renamed if it becomes stable later. Right, unstable feature is needed, using "x-" is optional. Recent discussion about it was in my "vhost-user-blk: live resize additional APIs" series: https://patchew.org/QEMU/20231006202045.1161543-1-vsement...@yandex-team.ru/20231006202045.1161543-5-vsement...@yandex-team.ru/ Following it, I think it's OK to not care anymore with "x-" prefixes, and rely on unstable feature. About documentation: actually, I never liked that we use for backup job "MirrorSyncMode". Now it looks more like "BackupSyncMode", having two values supported only by backup. I'm also unsure how mode=full=some_bitmap differs from mode=bitmap=some_bitmap.. With the current patches, it was an error to specify @bitmap for other modes than 'incremental' and 'bitmap'. Current documentation says: # @bitmap: The name of a dirty bitmap to use. Must be present if sync # is "bitmap" or "incremental". Can be present if sync is "full" # or "top". Must not be present otherwise. # (Since 2.4 (drive-backup), 3.1 (blockdev-backup)) So, I'd suggest simply rename MirrorSyncMode to BackupSyncMode, and add separate MirrorSyncMode with only "full", "top" and "none" values. Sounds good to me! 
[0]: https://gitlab.com/qemu-project/qemu/-/commit/a3c45b3e62962f99338716b1347cfb0d427cea44 Best Regards, Fiona -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 29.02.24 17:50, Fiona Ebner wrote: Am 29.02.24 um 13:00 schrieb Vladimir Sementsov-Ogievskiy: But anyway, this all could be simply achieved with bitmap-copying/merging API, if we allow to pass user-given bitmap to the mirror as working bitmap. I see, I'll drop the 'bitmap-mode' in the next version if nobody complains :) Good. It's a golden rule: never make public interfaces which you don't actually need for production. I myself sometimes violate it and spend extra time on developing features, which we later have to just drop as "not needed downstream, no sense in upstreaming". Just wondering which new mode I should allow for the @MirrorSyncMode then? The documentation states: # @incremental: only copy data described by the dirty bitmap. # (since: 2.4) # # @bitmap: only copy data described by the dirty bitmap. (since: 4.2) # Behavior on completion is determined by the BitmapSyncMode. For backup, do_backup_common() just maps @incremental to @bitmap + @bitmap-mode == @on-success. Using @bitmap for mirror would lead to being at odds with the documentation, because it mentions the BitmapSyncMode, which mirror won't have. Using @incremental for mirror would be consistent with the documentation, but behave a bit differently from backup. Opinions? Good question. As we already understood, (block-)job-api needs some spring-cleaning. Unfortunately I don't have much time on it, but still I decided to start from finally depreacting block-job-* API and moving to job-*.. Probably bitmap/bitmap-mode/sync APIs also need some optimization, keeping in mind new block-dirty-bitmap-merge api. So, what I could advice in this situation for newc interfaces: 1. be minimalistic 2. add `x-` prefix when unsure So, following these two rules, what about x-bitmap field, which may be combined only with 'full' mode, and do what you need? About documentation: actually, I never liked that we use for backup job "MirrorSyncMode". 
Now it looks more like "BackupSyncMode", having two values supported only by backup. I'm also unsure how mode=full=some_bitmap differs from mode=bitmap=some_bitmap.. So, I'd suggest simply rename MirrorSyncMode to BackupSyncMode, and add separate MirrorSyncMode with only "full", "top" and "none" values. -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 29.02.24 13:41, Fiona Ebner wrote: Am 28.02.24 um 17:24 schrieb Vladimir Sementsov-Ogievskiy: On 16.02.24 13:55, Fiona Ebner wrote: Previous discussion from when this was sent upstream [0] (it's been a while). I rebased the patches and re-ordered and squashed like suggested back then [1]. This implements two new mirror modes: - bitmap mirror mode with always/on-success/never bitmap sync mode - incremental mirror mode as sugar for bitmap + on-success Use cases: * Possibility to resume a failed mirror later. * Possibility to only mirror deltas to a previously mirrored volume. * Possibility to (efficiently) mirror an drive that was previously mirrored via some external mechanism (e.g. ZFS replication). We are using the last one in production without any issues since about 4 years now. In particular, like mentioned in [2]: - create bitmap(s) - (incrementally) replicate storage volume(s) out of band (using ZFS) - incrementally drive mirror as part of a live migration of VM - drop bitmap(s) Actually which mode you use, "never", "always" or "conditional"? Or in downstream you have different approach? We are using "conditional", but I think we don't really require any specific mode, because we drop the bitmaps after mirroring (even in failure case). Fabian, please correct me if I'm wrong. Why am I asking: These modes (for backup) were developed prior to block-dirty-bitmap-merge command, which allowed to copy bitmaps as you want. With that API, we actually don't need all these modes, instead it's enough to pass a bitmap, which would be _actually_ used by mirror. So, if you need "never" mode, you just copy your bitmap by block-dirty-bitmap-add + block-dirty-bitmap-merge, and pass a copy to mirror job. Or, you pass your bitmap to mirror-job, and have a "always" mode. And I don't see, why we need a "conditional" mode, which actually just drops away the progress we actually made. (OK, we failed, but why to drop the progress of successfully copied clusters?) 
I'm not sure actually. Maybe John remembers? Ah, I understand. Conditional just make sense if you don't support "partial success", and you want to delete target image in case of failure. And create a new one, to restart incremental job. But anyway, this all could be simply achieved with bitmap-copying/merging API, if we allow to pass user-given bitmap to the mirror as working bitmap. I see, I'll drop the 'bitmap-mode' in the next version if nobody complains :) Good. It's a golden rule: never make public interfaces which you don't actually need for production. I myself sometimes violate it and spend extra time on developing features, which we later have to just drop as "not needed downstream, no sense in upstreaming". Using user-given bitmap in the mirror job has also an additional advantage of live progress: up to visualization of disk copying by visualization of the dirty bitmap contents. Best Regards, Fiona -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 29.02.24 13:11, Fiona Ebner wrote: Am 28.02.24 um 17:06 schrieb Vladimir Sementsov-Ogievskiy: On 28.02.24 19:00, Vladimir Sementsov-Ogievskiy wrote: On 16.02.24 13:55, Fiona Ebner wrote: Now, the IO test added in patch 4/4 actually contains yet another use case, namely doing incremental mirrors to stand-alone qcow2 "diff" images, that only contain the delta and can be rebased later. I had to adapt the IO test, because its output expected the mirror bitmap to still be dirty, but nowadays the mirror is apparently already done when the bitmaps are queried. So I thought, I'll just use 'write-blocking' mode to avoid any potential timing issues. But this exposed an issue with the diff image approach. If a write is not aligned to the granularity of the mirror target, then rebasing the diff image onto a backing image will not yield the desired result, because the full cluster is considered to be allocated and will "hide" some part of the base/backing image. The failure can be seen by either using 'write-blocking' mode in the IO test or setting the (bitmap) granularity to 32 KiB rather than the current 64 KiB. The question is how to deal with these edge cases? Some possibilities that would make sense to me: For 'background' mode: * prohibit if target's cluster size is larger than the bitmap granularity * document the limitation For 'write-blocking' mode: * disallow in combination with bitmap mode (would not be happy about it, because I'd like to use this without diff images) why not just require the same: bitmap granularity must be >= target granularity For the iotest's use-case, that only works for background mode. I'll explain below. 
* for writes that are not aligned to the target's cluster size, read the relevant/missing parts from the source image to be able to write whole target clusters (seems rather complex) There is another approach: consider an unaligned part of the request that fits in one cluster (we can always split any request into an "aligned" middle part, and at most two small "unaligned" parts, each fitting into one cluster). We have two possibilities: 1. the cluster is dirty (marked dirty in the bitmap used by background process) We can simply ignore this part and rely on background process. This will not affect the convergence of the mirror job. Agreed. 2. the cluster is clear (i.e. background process, or some previous write already copied it) The iotest creates a new target image for each incremental sync which only records the diff relative to the previous mirror and those diff images are later rebased onto each other to get the full picture. Thus, it can be that a previous mirror job (not just background process or previous write) already copied a cluster, and in particular, copied it to a different target! Aha understand. For simplicity, let's consider case, when source "cluster size" = "job cluster size" = "bitmap granularity" = "target cluster size". Which types of clusters we should consider, when we want to handle guest write? 1. Clusters, that should be copied by background process These are dirty clusters from user-given bitmap, or if we do a full-disk mirror, all clusters, not yet copied by background process. For such clusters we simply ignore the unaligned write. We can even ignore the aligned write too: less disturbing the guest by delays. 2. Clusters, already copied by background process during this mirror job and not dirtied by guest since this time. For such clusters we are safe to do unaligned write, as target cluster must be allocated. 3. Clusters, not marked initially by dirty bitmap. What to do with them? We can't do unaligned write. 
I see two variants: - do additional read from source, to fill the whole cluster, which seems a bit too heavy - just mark the cluster as dirty for background job. So we behave like in "background" mode. But why not? The maximum count of such "hacks" is limited to number of "clear" clusters at start of mirror job, which means that we don't seriously affect the convergence. Mirror is guaranteed to converge anyway. And the whole sense of "write-blocking" mode is to have a guaranteed convergence. What do you think? Of course, we can't distinguish 3 types by on dirty bitmap, so we need the second one. For example "done_bitmap", where we can mark clusters that were successfully copied. That would be a kind of block-status of target image. But using bitmap is a lot better than querying block-status from target. In this case, we are safe to do unaligned write, as target cluster must be allocated. Because the diff image is new, the target's cluster is not necessarily allocated. When using write-blocking and a write of, e.g., 9 bytes to a clear source cluster comes in, only those 9 bytes are written to the target. Now the target's cluster is allocated
Re: [PATCH v2 00/10] mirror: allow switching from background to active mode
On 03.11.23 18:56, Markus Armbruster wrote: Kevin Wolf writes: Am 03.11.2023 um 10:36 hat Markus Armbruster geschrieben: Vladimir Sementsov-Ogievskiy writes: On 11.10.23 13:18, Fiona Ebner wrote: Am 10.10.23 um 19:55 schrieb Vladimir Sementsov-Ogievskiy: On 09.10.23 12:46, Fiona Ebner wrote: Initially, I tried to go for a more general 'job-change' command, but I couldn't figure out a way to avoid mutual inclusion between block-core.json and job.json. What is the problem with it? I still think that job-change would be better. If going for job-change in job.json, the dependencies would be job-change -> JobChangeOptions -> JobChangeOptionsMirror -> MirrorCopyMode query-jobs -> JobInfo -> JobInfoMirror and we can't include block-core.json in job.json, because an inclusion loop gives a build error. Let me try to understand this. Command job-change needs its argument type JobChangeOptions. JobChangeOptions is a union, and JobChangeOptionsMirror is one of its branches. JobChangeOptionsMirror needs MirrorCopyMode from block-core.json. block-core.json needs job.json for JobType and JobStatus. Could be made to work by moving MirrorCopyMode (and JobChangeOptionsMirror, JobInfoMirror) to job.json or some place that can be included by both job.json and block-core.json. Moving the type-specific definitions to the general job.json didn't feel right to me. Including another file with type-specific definitions in job.json feels slightly less wrong, but still not quite right and I didn't want to create a new file just for MirrorCopyMode (and JobChangeOptionsMirror, JobInfoMirror). And going further and moving all mirror-related things to a separate file would require moving along things like NewImageMode with it or create yet another file for such general things used by multiple block-jobs. If preferred, I can try and go with some version of the above. OK, I see the problem. Seems, that all requires some good refactoring. 
But that's a preexisting big work, and should not hold up your series. I'm OK to proceed with block-job-change. Saving ourselves some internal refactoring is a poor excuse for undesirable external interfaces. I'm not sure how undesirable it is. We have block-job-* commands for pretty much every other operation, so it's only consistent to have block-job-change, too. Is the job abstraction a failure? We have block-job- command since job- commandsince - block-job-set-speed 1.1 block-job-cancel1.1 job-cancel 3.0 block-job-pause 1.3 job-pause 3.0 block-job-resume1.3 job-resume 3.0 block-job-complete 1.3 job-complete3.0 block-job-dismiss 2.12job-dismiss 3.0 block-job-finalize 2.12job-finalize3.0 block-job-change8.2 query-block-jobs1.1 query-jobs I was under the impression that we added the (more general) job- commands to replace the (less general) block-job commands, and we're keeping the latter just for compatibility. Am I mistaken? Which one should be used? Why not deprecate the one that shouldn't be used? The addition of block-job-change without even trying to do job-change makes me wonder: have we given up on the job- interface? I'm okay with giving up on failures. All I want is clarity. Right now, I feel thoroughly confused about the status block-jobs and jobs, and how they're related. Hi! I didn't notice, that the series was finally merged. About the APIs, I think, of course we should deprecate block-job-* API, because we already have jobs which are not block-jobs, so we can't deprecate job-* API. So I suggest a plan: 1. Add job-change command simply in block-core.json, as a simple copy of block-job-change, to not care with resolving inclusion loops. (ha we could simply name our block-job-change to be job-change and place it in block-core.json, but now is too late) 2. Support changing speed in a new job-chage command. (or both in block-job-change and job-change, keeping them equal) 3. Deprecate block-job-* APIs 4. Wait 3 releases 5. Drop block-job-* APIs 6. 
Move all job-related stuff to job.json, drop `{ 'include': 'job.json' }` from block-core.json, and instead include block-core.json into job.json If it's OK, I can go through the steps. -- Best regards, Vladimir
Re: [PATCH v2] block-backend: per-device throttling of BLOCK_IO_ERROR reports
On 09.01.24 16:13, Vladimir Sementsov-Ogievskiy wrote: From: Leonid Kaplan BLOCK_IO_ERROR events comes from guest, so we must throttle them. We still want per-device throttling, so let's use device id as a key. Signed-off-by: Leonid Kaplan Signed-off-by: Vladimir Sementsov-Ogievskiy ping) -- Best regards, Vladimir
Re: [PATCH v3] blockcommit: Reopen base image as RO after abort
On 09.02.24 15:29, Alexander Ivanov wrote: Could you please review the patch? Sorry for long delay. Honestly, I don't like refcnt in block-driver. It violates encapsulation; refcnt is an internal thing of the common block layer. And actually, you can't make any assumptions from the value of refcnt, as you don't know which additional parents were created and why, and when they are going to unref their children. What was wrong with v2? On 1/30/24 10:14, Alexander Ivanov wrote: If a blockcommit is aborted the base image remains in RW mode, that leads to a fail of subsequent live migration. How to reproduce: $ virsh snapshot-create-as vm snp1 --disk-only *** write something to the disk inside the guest *** $ virsh blockcommit vm vda --active --shallow && virsh blockjob vm vda --abort $ lsof /vzt/vm.qcow2 COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME qemu-syst 433203 root 45u REG 253,0 1724776448 133 /vzt/vm.qcow2 $ cat /proc/433203/fdinfo/45 pos: 0 flags: 02140002 < The last 2 means RW mode If the base image is in RW mode at the end of blockcommit and was in RO mode before blockcommit, check if src BDS has refcnt > 1. If so, the BDS will not be removed after blockcommit, and we should make the base image RO. Otherwise check recursively if there is a parent BDS of src BDS and reopen the base BDS in RO in this case. Signed-off-by: Alexander Ivanov --- block/mirror.c | 38 -- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 5145eb53e1..52a7fee75e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -93,6 +93,7 @@ typedef struct MirrorBlockJob { int64_t active_write_bytes_in_flight; bool prepared; bool in_drain; + bool base_ro; } MirrorBlockJob; typedef struct MirrorBDSOpaque { @@ -652,6 +653,32 @@ static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) } } +/* + * Check recursively if there is a parent BDS referenced more than + * min_refcnt times. 
This argument is needed because at the first + * call there is a bds referenced in blockcommit. + */ +static bool bdrv_chain_has_significant_parent(BlockDriverState *bs) +{ + BdrvChild *parent; + BlockDriverState *parent_bs; + + QLIST_FOREACH(parent, >parents, next) { + if (!(parent->klass->parent_is_bds)) { + continue; + } + parent_bs = parent->opaque; + if (parent_bs->drv && !parent_bs->drv->is_filter) { + return true; + } + if (bdrv_chain_has_significant_parent(parent_bs)) { + return true; + } + } + + return false; +} + /** * mirror_exit_common: handle both abort() and prepare() cases. * for .prepare, returns 0 on success and -errno on failure. @@ -793,6 +820,11 @@ static int mirror_exit_common(Job *job) bdrv_drained_end(target_bs); bdrv_unref(target_bs); + if (s->base_ro && !bdrv_is_read_only(target_bs) && + (src->refcnt > 1 || bdrv_chain_has_significant_parent(src))) { + bdrv_reopen_set_read_only(target_bs, true, NULL); + } + bs_opaque->job = NULL; bdrv_drained_end(src); @@ -1715,6 +1747,7 @@ static BlockJob *mirror_start_job( bool is_none_mode, BlockDriverState *base, bool auto_complete, const char *filter_node_name, bool is_mirror, MirrorCopyMode copy_mode, + bool base_ro, Error **errp) { MirrorBlockJob *s; @@ -1798,6 +1831,7 @@ static BlockJob *mirror_start_job( bdrv_unref(mirror_top_bs); s->mirror_top_bs = mirror_top_bs; + s->base_ro = base_ro; /* No resize for the target either; while the mirror is still running, a * consistent read isn't necessarily possible. 
We could possibly allow @@ -2027,7 +2061,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs, speed, granularity, buf_size, backing_mode, zero_target, on_source_error, on_target_error, unmap, NULL, NULL, _job_driver, is_none_mode, base, false, - filter_node_name, true, copy_mode, errp); + filter_node_name, true, copy_mode, false, errp); } BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, @@ -2056,7 +2090,7 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs, on_error, on_error, true, cb, opaque, _active_job_driver, false, base, auto_complete, filter_node_name, false, MIRROR_COPY_MODE_BACKGROUND, - errp); + base_read_only, errp); if (!job) { goto error_restore_flags; } -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 16.02.24 13:55, Fiona Ebner wrote: Previous discussion from when this was sent upstream [0] (it's been a while). I rebased the patches and re-ordered and squashed like suggested back then [1]. This implements two new mirror modes: - bitmap mirror mode with always/on-success/never bitmap sync mode - incremental mirror mode as sugar for bitmap + on-success Use cases: * Possibility to resume a failed mirror later. * Possibility to only mirror deltas to a previously mirrored volume. * Possibility to (efficiently) mirror an drive that was previously mirrored via some external mechanism (e.g. ZFS replication). We are using the last one in production without any issues since about 4 years now. In particular, like mentioned in [2]: - create bitmap(s) - (incrementally) replicate storage volume(s) out of band (using ZFS) - incrementally drive mirror as part of a live migration of VM - drop bitmap(s) Actually which mode you use, "never", "always" or "conditional"? Or in downstream you have different approach? Why am I asking: These modes (for backup) were developed prior to block-dirty-bitmap-merge command, which allowed to copy bitmaps as you want. With that API, we actually don't need all these modes, instead it's enough to pass a bitmap, which would be _actually_ used by mirror. So, if you need "never" mode, you just copy your bitmap by block-dirty-bitmap-add + block-dirty-bitmap-merge, and pass a copy to mirror job. Or, you pass your bitmap to mirror-job, and have a "always" mode. And I don't see, why we need a "conditional" mode, which actually just drops away the progress we actually made. (OK, we failed, but why to drop the progress of successfully copied clusters?) Using user-given bitmap in the mirror job has also an additional advantage of live progress: up to visualization of disk copying by visualization of the dirty bitmap contents. -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 28.02.24 19:00, Vladimir Sementsov-Ogievskiy wrote: On 16.02.24 13:55, Fiona Ebner wrote: Previous discussion from when this was sent upstream [0] (it's been a while). I rebased the patches and re-ordered and squashed like suggested back then [1]. This implements two new mirror modes: - bitmap mirror mode with always/on-success/never bitmap sync mode - incremental mirror mode as sugar for bitmap + on-success Use cases: * Possibility to resume a failed mirror later. * Possibility to only mirror deltas to a previously mirrored volume. * Possibility to (efficiently) mirror an drive that was previously mirrored via some external mechanism (e.g. ZFS replication). We are using the last one in production without any issues since about 4 years now. In particular, like mentioned in [2]: - create bitmap(s) - (incrementally) replicate storage volume(s) out of band (using ZFS) - incrementally drive mirror as part of a live migration of VM - drop bitmap(s) Now, the IO test added in patch 4/4 actually contains yet another use case, namely doing incremental mirrors to stand-alone qcow2 "diff" images, that only contain the delta and can be rebased later. I had to adapt the IO test, because its output expected the mirror bitmap to still be dirty, but nowadays the mirror is apparently already done when the bitmaps are queried. So I thought, I'll just use 'write-blocking' mode to avoid any potential timing issues. But this exposed an issue with the diff image approach. If a write is not aligned to the granularity of the mirror target, then rebasing the diff image onto a backing image will not yield the desired result, because the full cluster is considered to be allocated and will "hide" some part of the base/backing image. The failure can be seen by either using 'write-blocking' mode in the IO test or setting the (bitmap) granularity to 32 KiB rather than the current 64 KiB. The question is how to deal with these edge cases? 
Some possibilities that would make sense to me: For 'background' mode: * prohibit if target's cluster size is larger than the bitmap granularity * document the limitation For 'write-blocking' mode: * disallow in combination with bitmap mode (would not be happy about it, because I'd like to use this without diff images) why not just require the same: bitmap granularity must be >= target granularity * for writes that are not aligned to the target's cluster size, read the relevant/missing parts from the source image to be able to write whole target clusters (seems rather complex) There is another approach: consider an unaligned part of the request that fits in one cluster (we can always split any request into an "aligned" middle part, and at most two small "unaligned" parts, each fitting into one cluster). We have two possibilities: 1. the cluster is dirty (marked dirty in the bitmap used by background process) We can simply ignore this part and rely on background process. This will not affect the convergence of the mirror job. 2. the cluster is clear (i.e. background process, or some previous write already copied it) In this case, we are safe to do unaligned write, as target cluster must be allocated. (for bitmap-mode, I don't consider here clusters that are clear from the start, which we shouldn't copy in any case) Hmm, right, and that's exactly the logic we already have in do_sync_target_write(). So that's enough just to require that bitmap_granularity >= target_granularity -- Best regards, Vladimir
Re: [RFC 0/4] mirror: implement incremental and bitmap modes
On 16.02.24 13:55, Fiona Ebner wrote: Previous discussion from when this was sent upstream [0] (it's been a while). I rebased the patches and re-ordered and squashed like suggested back then [1]. This implements two new mirror modes: - bitmap mirror mode with always/on-success/never bitmap sync mode - incremental mirror mode as sugar for bitmap + on-success Use cases: * Possibility to resume a failed mirror later. * Possibility to only mirror deltas to a previously mirrored volume. * Possibility to (efficiently) mirror an drive that was previously mirrored via some external mechanism (e.g. ZFS replication). We are using the last one in production without any issues since about 4 years now. In particular, like mentioned in [2]: - create bitmap(s) - (incrementally) replicate storage volume(s) out of band (using ZFS) - incrementally drive mirror as part of a live migration of VM - drop bitmap(s) Now, the IO test added in patch 4/4 actually contains yet another use case, namely doing incremental mirrors to stand-alone qcow2 "diff" images, that only contain the delta and can be rebased later. I had to adapt the IO test, because its output expected the mirror bitmap to still be dirty, but nowadays the mirror is apparently already done when the bitmaps are queried. So I thought, I'll just use 'write-blocking' mode to avoid any potential timing issues. But this exposed an issue with the diff image approach. If a write is not aligned to the granularity of the mirror target, then rebasing the diff image onto a backing image will not yield the desired result, because the full cluster is considered to be allocated and will "hide" some part of the base/backing image. The failure can be seen by either using 'write-blocking' mode in the IO test or setting the (bitmap) granularity to 32 KiB rather than the current 64 KiB. The question is how to deal with these edge cases? 
Some possibilities that would make sense to me: For 'background' mode: * prohibit if target's cluster size is larger than the bitmap granularity * document the limitation For 'write-blocking' mode: * disallow in combination with bitmap mode (would not be happy about it, because I'd like to use this without diff images) why not just require the same: bitmap granularity must be >= target granularity * for writes that are not aligned to the target's cluster size, read the relevant/missing parts from the source image to be able to write whole target clusters (seems rather complex) There is another approach: consider an unaligned part of the request that fits in one cluster (we can always split any request into an "aligned" middle part, and at most two small "unaligned" parts, each fitting into one cluster). We have two possibilities: 1. the cluster is dirty (marked dirty in the bitmap used by background process) We can simply ignore this part and rely on background process. This will not affect the convergence of the mirror job. 2. the cluster is clear (i.e. background process, or some previous write already copied it) In this case, we are safe to do unaligned write, as target cluster must be allocated. 
(for bitmap-mode, I don't consider here clusters that are clear from the start, which we shouldn't copy in any case) * document the limitation [0]: https://lore.kernel.org/qemu-devel/20200218100740.2228521-1-f.gruenbich...@proxmox.com/ [1]: https://lore.kernel.org/qemu-devel/d35a76de-78d5-af56-0b34-f7bd2bbd3...@redhat.com/ [2]: https://lore.kernel.org/qemu-devel/1599127031.9uxdp5h9o2.astr...@nora.none/ Fabian Grünbichler (2): mirror: move some checks to qmp iotests: add test for bitmap mirror John Snow (2): drive-mirror: add support for sync=bitmap mode=never drive-mirror: add support for conditional and always bitmap sync modes block/mirror.c| 94 +- blockdev.c| 70 +- include/block/block_int-global-state.h|4 +- qapi/block-core.json | 25 +- tests/qemu-iotests/tests/bitmap-sync-mirror | 550 .../qemu-iotests/tests/bitmap-sync-mirror.out | 2810 + tests/unit/test-block-iothread.c |4 +- 7 files changed, 3527 insertions(+), 30 deletions(-) create mode 100755 tests/qemu-iotests/tests/bitmap-sync-mirror create mode 100644 tests/qemu-iotests/tests/bitmap-sync-mirror.out -- Best regards, Vladimir
[PATCH v3 0/5] backup: discard-source parameter
Hi all! The main patch is 04, please look at it for description and diagram. v3: 02: new patch 04: take WRITE permission only when discard_source is required Vladimir Sementsov-Ogievskiy (5): block/copy-before-write: fix permission block/copy-before-write: support unaligned snapshot-discard block/copy-before-write: create block_copy bitmap in filter node qapi: blockdev-backup: add discard-source parameter iotests: add backup-discard-source block/backup.c| 5 +- block/block-copy.c| 12 +- block/copy-before-write.c | 39 - block/copy-before-write.h | 1 + block/replication.c | 4 +- blockdev.c| 2 +- include/block/block-common.h | 2 + include/block/block-copy.h| 2 + include/block/block_int-global-state.h| 2 +- qapi/block-core.json | 4 + tests/qemu-iotests/257.out| 112 ++--- .../qemu-iotests/tests/backup-discard-source | 151 ++ .../tests/backup-discard-source.out | 5 + 13 files changed, 271 insertions(+), 70 deletions(-) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out -- 2.34.1
[PATCH v3 1/5] block/copy-before-write: fix permission
In case when source node does not have any parents, the condition still works as required: backup job do create the parent by block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child Still, in this case checking @perm variable doesn't work, as backup job creates the root blk with empty permissions (as it rely on CBW filter to require correct permissions and don't want to create extra conflicts). So, we should not check @perm. The hack may be dropped entirely when transactional insertion of filter (when we don't try to recalculate permissions in intermediate state, when filter does conflict with original parent of the source node) merged (old big series "[PATCH v5 00/45] Transactional block-graph modifying API"[1] and it's current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2]) [1] https://patchew.org/QEMU/20220330212902.590099-1-vsement...@openvz.org/ [2] https://patchew.org/QEMU/2023101718.932733-1-vsement...@yandex-team.ru/ Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/copy-before-write.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 0842a1a6df..3919d495d7 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, perm, shared, nperm, nshared); if (!QLIST_EMPTY(>parents)) { -if (perm & BLK_PERM_WRITE) { -*nperm = *nperm | BLK_PERM_CONSISTENT_READ; -} +/* + * Note, that source child may be shared with backup job. Backup job + * does create own blk parent on copy-before-write node, so this + * works even if source node does not have any parents before backup + * start + */ +*nperm = *nperm | BLK_PERM_CONSISTENT_READ; *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); } } -- 2.34.1
[PATCH v3 3/5] block/copy-before-write: create block_copy bitmap in filter node
Currently block_copy creates copy_bitmap in source node. But that is in bad relation with .independent_close=true of copy-before-write filter: source node may be detached and removed before .bdrv_close() handler called, which should call block_copy_state_free(), which in turn should remove copy_bitmap. That's all not ideal: it would be better if internal bitmap of block-copy object is not attached to any node. But that is not possible now. The simplest solution is just create copy_bitmap in filter node, where anyway two other bitmaps are created. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/block-copy.c | 3 +- block/copy-before-write.c | 2 +- include/block/block-copy.h | 1 + tests/qemu-iotests/257.out | 112 ++--- 4 files changed, 60 insertions(+), 58 deletions(-) diff --git a/block/block-copy.c b/block/block-copy.c index 9ee3dd7ef5..8fca2c3698 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, } BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp) { @@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, return NULL; } -copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL, +copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL, errp); if (!copy_bitmap) { return NULL; diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 6547eda707..44a9385aa7 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -467,7 +467,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); -s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); +s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp); if (!s->bcs) { 
error_prepend(errp, "Cannot create block-copy-state: "); return -EINVAL; diff --git a/include/block/block-copy.h b/include/block/block-copy.h index 0700953ab8..8b41643bfa 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState; typedef struct BlockCopyCallState BlockCopyCallState; BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp); diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out index aa76131ca9..c33dd7f3a9 100644 --- a/tests/qemu-iotests/257.out +++ b/tests/qemu-iotests/257.out @@ -120,16 +120,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -596,16 +596,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -865,16 +865,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -1341,16 +1341,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": fa
[PATCH v3 4/5] qapi: blockdev-backup: add discard-source parameter
Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Still we can't take it unconditionally, as it will break normal backup from RO source. So, we have to add a parameter and pass it thorough bdrv_open flags. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/backup.c | 5 +++-- block/block-copy.c | 9 + block/copy-before-write.c | 15 +-- block/copy-before-write.h | 1 + block/replication.c| 4 ++-- blockdev.c | 2 +- include/block/block-common.h | 2 ++ include/block/block-copy.h | 1 + include/block/block_int-global-state.h | 2 +- qapi/block-core.json | 4 10 files changed, 37 insertions(+), 8 deletions(-) diff --git a/block/backup.c b/block/backup.c index ec29d6b810..3dd2e229d2 100644 --- a/block/backup.c +++ b/block/backup.c @@ -356,7 +356,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap, BitmapSyncMode bitmap_mode, - bool compress, + bool compress, bool discard_source, const char *filter_node_name, BackupPerf *perf, BlockdevOnError on_source_error, @@ -457,7 +457,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, goto error; } -cbw = bdrv_cbw_append(bs, target, filter_node_name, , errp); +cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source, + , errp); if (!cbw) { goto error; } diff --git a/block/block-copy.c b/block/block-copy.c index 
8fca2c3698..7e3b378528 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -137,6 +137,7 @@ typedef struct BlockCopyState { CoMutex lock; int64_t in_flight_bytes; BlockCopyMethod method; +bool discard_source; BlockReqList reqs; QLIST_HEAD(, BlockCopyCallState) calls; /* @@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, + bool discard_source, Error **errp) { ERRP_GUARD(); @@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, cluster_size), }; +s->discard_source = discard_source; block_copy_set_copy_opts(s, false, false); ratelimit_init(>rate_limit); @@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task) co_put_to_shres(s->mem, t->req.bytes); block_copy_task_end(t, ret); +if (s->discard_source && ret == 0) { +int64_t nbytes = +MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset; +bdrv_co_pdiscard(s->source, t->req.offset, nbytes); +} + return ret; } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 44a9385aa7..dac57481c5 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState { BdrvChild *target; OnCbwError on_cbw_error; uint32_t cbw_timeout_ns; +bool discard_source; /* * @lock: protects access to @access_bitmap, @done_bitmap and @@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVCopyBeforeWriteState *s = bs->opaque; + if (!(role & BDRV_CHILD_FILTERED)) { /* * Target child @@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, * start */ *nperm = *nperm | BLK_PERM_CONSISTENT_READ; +if (s->dis
[PATCH v3 2/5] block/copy-before-write: support unaligned snapshot-discard
First thing that crashes on unligned access here is bdrv_reset_dirty_bitmap(). Correct way is to align-down the snapshot-discard request. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/copy-before-write.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 3919d495d7..6547eda707 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes) { BDRVCopyBeforeWriteState *s = bs->opaque; +uint32_t cluster_size = block_copy_cluster_size(s->bcs); +int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size); +int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size); +int64_t aligned_bytes; + +if (aligned_end <= aligned_offset) { +return 0; +} +aligned_bytes = aligned_end - aligned_offset; WITH_QEMU_LOCK_GUARD(>lock) { -bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); +bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset, +aligned_bytes); } -block_copy_reset(s->bcs, offset, bytes); +block_copy_reset(s->bcs, aligned_offset, aligned_bytes); -return bdrv_co_pdiscard(s->target, offset, bytes); +return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes); } static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs) -- 2.34.1
[PATCH v3 5/5] iotests: add backup-discard-source
Add test for a new backup option: discard-source. Signed-off-by: Vladimir Sementsov-Ogievskiy --- .../qemu-iotests/tests/backup-discard-source | 151 ++ .../tests/backup-discard-source.out | 5 + 2 files changed, 156 insertions(+) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out diff --git a/tests/qemu-iotests/tests/backup-discard-source b/tests/qemu-iotests/tests/backup-discard-source new file mode 100755 index 00..8a88b0f6c4 --- /dev/null +++ b/tests/qemu-iotests/tests/backup-discard-source @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# +# Test removing persistent bitmap from backing +# +# Copyright (c) 2022 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# + +import os + +import iotests +from iotests import qemu_img_create, qemu_img_map, qemu_io + + +temp_img = os.path.join(iotests.test_dir, 'temp') +source_img = os.path.join(iotests.test_dir, 'source') +target_img = os.path.join(iotests.test_dir, 'target') +size = '1M' + + +def get_actual_size(vm, node_name): +nodes = vm.cmd('query-named-block-nodes', flat=True) +node = next(n for n in nodes if n['node-name'] == node_name) +return node['image']['actual-size'] + + +class TestBackup(iotests.QMPTestCase): +def setUp(self): +qemu_img_create('-f', iotests.imgfmt, source_img, size) +qemu_img_create('-f', iotests.imgfmt, temp_img, size) +qemu_img_create('-f', iotests.imgfmt, target_img, size) +qemu_io('-c', 'write 0 1M', source_img) + +self.vm = iotests.VM() +self.vm.launch() + +self.vm.cmd('blockdev-add', { +'node-name': 'cbw', +'driver': 'copy-before-write', +'file': { +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': source_img, +} +}, +'target': { +'driver': iotests.imgfmt, +'discard': 'unmap', +'node-name': 'temp', +'file': { +'driver': 'file', +'filename': temp_img +} +} +}) + +self.vm.cmd('blockdev-add', { +'node-name': 'access', +'discard': 'unmap', +'driver': 'snapshot-access', +'file': 'cbw' +}) + +self.vm.cmd('blockdev-add', { +'driver': iotests.imgfmt, +'node-name': 'target', +'file': { +'driver': 'file', +'filename': target_img +} +}) + +self.assertLess(get_actual_size(self.vm, 'temp'), 512 * 1024) + +def tearDown(self): +# That should fail, because region is discarded +self.vm.hmp_qemu_io('access', 'read 0 1M') + +self.vm.shutdown() + +self.assertTrue('read failed: Permission denied' in self.vm.get_log()) + +# Final check that temp image is empty +mapping = qemu_img_map(temp_img) +self.assertEqual(len(mapping), 1) +self.assertEqual(mapping[0]['start'], 0) +self.assertEqual(mapping[0]['length'], 1024 * 1024) +self.assertEqual(mapping[0]['data'], False) + +os.remove(temp_img) +os.remove(source_img) +os.remove(target_img) + +def 
do_backup(self): +self.vm.cmd('blockdev-backup', device='access', +sync='full', target='target', +job_id='backup0', +discard_source=True) + +self.vm.event_wait(name='BLOCK_JOB_COMPLETED') + +def test_discard_written(self): +""" +1. Guest writes +2. copy-before-write operation, data is stored to temp +3. start backup(discard_source=True), check that data is + removed from temp +""" +# Trigger copy-before-write operation +result = self.vm.hmp_qemu_io('cbw', 'write 0 1M') +self.assert_qmp(result, 'return', '') + +# Check that data is written to temporary image +self.assertGreater(get_actual_size(self.vm, 'temp'), 1024 * 1024) + +self.do_backup() + +def test_discard_cbw(self): +""" +1.
Re: [PATCH v2 3/4] qapi: blockdev-backup: add discard-source parameter
On 19.01.24 17:46, Fiona Ebner wrote: Am 17.01.24 um 17:07 schrieb Vladimir Sementsov-Ogievskiy: Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Signed-off-by: Vladimir Sementsov-Ogievskiy Unfortunately, setting BLK_PERM_WRITE unconditionally breaks blockdev-backup for a read-only node (even when not using discard-source): Ohh, right. So, that's the place when we have to somehow pass through discrard-souce option to CBW filter, so that it know that WRITE permission is needed. Will try in v3. Thanks again for testing! #!/bin/bash ./qemu-img create /tmp/disk.raw -f raw 1G ./qemu-img create /tmp/backup.raw -f raw 1G ./qemu-system-x86_64 --qmp stdio \ --blockdev raw,node-name=node0,file.driver=file,file.filename=/tmp/disk.raw,read-only=true \ --blockdev raw,node-name=node1,file.driver=file,file.filename=/tmp/backup.raw \ < The above works before applying this patch, but leads to an error afterwards: {"error": {"class": "GenericError", "desc": "Block node is read-only"}} Best Regards, Fiona -- Best regards, Vladimir
Re: [PATCH v2 3/4] qapi: blockdev-backup: add discard-source parameter
On 25.01.24 15:47, Fiona Ebner wrote: Am 24.01.24 um 16:03 schrieb Fiona Ebner: Am 17.01.24 um 17:07 schrieb Vladimir Sementsov-Ogievskiy: Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Signed-off-by: Vladimir Sementsov-Ogievskiy Ran into another issue when the cluster_size of the fleecing image is larger than for the backup target, e.g. #!/bin/bash rm /tmp/fleecing.qcow2 ./qemu-img create /tmp/disk.qcow2 -f qcow2 1G ./qemu-img create /tmp/fleecing.qcow2 -o cluster_size=2M -f qcow2 1G ./qemu-img create /tmp/backup.qcow2 -f qcow2 1G ./qemu-system-x86_64 --qmp stdio \ --blockdev qcow2,node-name=node0,file.driver=file,file.filename=/tmp/disk.qcow2 \ --blockdev qcow2,node-name=node1,file.driver=file,file.filename=/tmp/fleecing.qcow2,discard=unmap \ --blockdev qcow2,node-name=node2,file.driver=file,file.filename=/tmp/backup.qcow2 \ < will fail with qemu-system-x86_64: ../util/hbitmap.c:570: hbitmap_reset: Assertion `QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size)' failed. 
Backtrace shows the assert happens while discarding, when resetting the BDRVCopyBeforeWriteState access_bitmap > #6 0x56142a2a in hbitmap_reset (hb=0x57e01b80, start=0, count=1048576) at ../util/hbitmap.c:570 #7 0x55f80764 in bdrv_reset_dirty_bitmap_locked (bitmap=0x5850a660, offset=0, bytes=1048576) at ../block/dirty-bitmap.c:563 #8 0x55f807ab in bdrv_reset_dirty_bitmap (bitmap=0x5850a660, offset=0, bytes=1048576) at ../block/dirty-bitmap.c:570 #9 0x55f7bb16 in cbw_co_pdiscard_snapshot (bs=0x581a7f60, offset=0, bytes=1048576) at ../block/copy-before-write.c:330 #10 0x55f8d00a in bdrv_co_pdiscard_snapshot (bs=0x581a7f60, offset=0, bytes=1048576) at ../block/io.c:3734 #11 0x55fd2380 in snapshot_access_co_pdiscard (bs=0x582b4f60, offset=0, bytes=1048576) at ../block/snapshot-access.c:55 #12 0x55f8b65d in bdrv_co_pdiscard (child=0x584fe790, offset=0, bytes=1048576) at ../block/io.c:3144 #13 0x55f78650 in block_copy_task_entry (task=0x57f588f0) at ../block/block-copy.c:597 My guess for the cause is that in block_copy_calculate_cluster_size() we only look at the target. But now that we need to discard the source, we'll also need to consider that for the calculation? Just querying the source and picking the maximum won't work either, because snapshot-access does not currently implement .bdrv_co_get_info and because copy-before-write (doesn't implement .bdrv_co_get_info and is a filter) will just return the info of its file child. But the discard will go to the target child. If I do 1. .bdrv_co_get_info in snapshot-access: return info from file child 2. .bdrv_co_get_info in copy-before-write: return maximum cluster_size from file child and target child 3. block_copy_calculate_cluster_size: return maximum from source and target then the issue does go away, but I don't know if that's not violating any assumptions and probably there's a better way to avoid the issue? Hmm. 
Taking the maximum is not optimal for the usual case without discard-source: the user may want to work in smaller granularity than the source, to save disk space. In the case with discarding we have two possibilities: - either take larger granularity for the whole process like you propose (but this will need an option for CBW?) - or, fix the discarding bitmap in CBW to work like normal discard: it should be aligned down. This will actually lead to the discard-source option doing nothing.. == But why do you want a fleecing image with larger granularity? Is that a real case or just experimenting? Still, we should fix the assertion anyway. I think: 1. fix the discarding bitmap to align down (will do that for v3) 2. if we need another logic for block_copy_calculate_cluster_size() it should be an option. Maybe an explicit "copy-cluster-size" or "granularity" option for the CBW driver and for the backup job. And we'll just check that the given cluster-size is a power of two >= target_size. -- Best regards, Vladimir
Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'
On 11.12.23 13:55, Andrey Drobyshev wrote: In case we're truncating an image opened with O_DIRECT, we might get -EINVAL on write with unaligned buffer. In particular, when running iotests/298 with '-nocache' we get: qemu-io: Failed to resize underlying file: Could not write zeros for preallocation: Invalid argument Let's just allocate the buffer using qemu_blockalign0() instead. Signed-off-by: Andrey Drobyshev Reviewed-by: Vladimir Sementsov-Ogievskiy I also suggest to use QEMU_AUTO_VFREE (keep my r-b if you do). --- block/file-posix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index b862406c71..cee8de510b 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque) goto out; } -buf = g_malloc0(65536); +buf = qemu_blockalign0(aiocb->bs, 65536); seek_result = lseek(fd, current_length, SEEK_SET); if (seek_result < 0) { @@ -2413,7 +2413,7 @@ out: } } -g_free(buf); +qemu_vfree(buf); return result; } -- Best regards, Vladimir
[PATCH v2 4/4] iotests: add backup-discard-source
Add test for a new backup option: discard-source. Signed-off-by: Vladimir Sementsov-Ogievskiy --- .../qemu-iotests/tests/backup-discard-source | 151 ++ .../tests/backup-discard-source.out | 5 + 2 files changed, 156 insertions(+) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out diff --git a/tests/qemu-iotests/tests/backup-discard-source b/tests/qemu-iotests/tests/backup-discard-source new file mode 100755 index 00..8a88b0f6c4 --- /dev/null +++ b/tests/qemu-iotests/tests/backup-discard-source @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# +# Test removing persistent bitmap from backing +# +# Copyright (c) 2022 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# + +import os + +import iotests +from iotests import qemu_img_create, qemu_img_map, qemu_io + + +temp_img = os.path.join(iotests.test_dir, 'temp') +source_img = os.path.join(iotests.test_dir, 'source') +target_img = os.path.join(iotests.test_dir, 'target') +size = '1M' + + +def get_actual_size(vm, node_name): +nodes = vm.cmd('query-named-block-nodes', flat=True) +node = next(n for n in nodes if n['node-name'] == node_name) +return node['image']['actual-size'] + + +class TestBackup(iotests.QMPTestCase): +def setUp(self): +qemu_img_create('-f', iotests.imgfmt, source_img, size) +qemu_img_create('-f', iotests.imgfmt, temp_img, size) +qemu_img_create('-f', iotests.imgfmt, target_img, size) +qemu_io('-c', 'write 0 1M', source_img) + +self.vm = iotests.VM() +self.vm.launch() + +self.vm.cmd('blockdev-add', { +'node-name': 'cbw', +'driver': 'copy-before-write', +'file': { +'driver': iotests.imgfmt, +'file': { +'driver': 'file', +'filename': source_img, +} +}, +'target': { +'driver': iotests.imgfmt, +'discard': 'unmap', +'node-name': 'temp', +'file': { +'driver': 'file', +'filename': temp_img +} +} +}) + +self.vm.cmd('blockdev-add', { +'node-name': 'access', +'discard': 'unmap', +'driver': 'snapshot-access', +'file': 'cbw' +}) + +self.vm.cmd('blockdev-add', { +'driver': iotests.imgfmt, +'node-name': 'target', +'file': { +'driver': 'file', +'filename': target_img +} +}) + +self.assertLess(get_actual_size(self.vm, 'temp'), 512 * 1024) + +def tearDown(self): +# That should fail, because region is discarded +self.vm.hmp_qemu_io('access', 'read 0 1M') + +self.vm.shutdown() + +self.assertTrue('read failed: Permission denied' in self.vm.get_log()) + +# Final check that temp image is empty +mapping = qemu_img_map(temp_img) +self.assertEqual(len(mapping), 1) +self.assertEqual(mapping[0]['start'], 0) +self.assertEqual(mapping[0]['length'], 1024 * 1024) +self.assertEqual(mapping[0]['data'], False) + +os.remove(temp_img) +os.remove(source_img) +os.remove(target_img) + +def 
do_backup(self): +self.vm.cmd('blockdev-backup', device='access', +sync='full', target='target', +job_id='backup0', +discard_source=True) + +self.vm.event_wait(name='BLOCK_JOB_COMPLETED') + +def test_discard_written(self): +""" +1. Guest writes +2. copy-before-write operation, data is stored to temp +3. start backup(discard_source=True), check that data is + removed from temp +""" +# Trigger copy-before-write operation +result = self.vm.hmp_qemu_io('cbw', 'write 0 1M') +self.assert_qmp(result, 'return', '') + +# Check that data is written to temporary image +self.assertGreater(get_actual_size(self.vm, 'temp'), 1024 * 1024) + +self.do_backup() + +def test_discard_cbw(self): +""" +1.
[PATCH v2 0/4] backup: discard-source parameter
v2: - now, based on master - CBW permissions a bit reworked - clamp down length to discard (thanks to Fiona) - use modern vm.cmd() in test instead of vm.qmp() Vladimir Sementsov-Ogievskiy (4): block/copy-before-write: fix permission block/copy-before-write: create block_copy bitmap in filter node qapi: blockdev-backup: add discard-source parameter iotests: add backup-discard-source block/backup.c| 5 +- block/block-copy.c| 15 +- block/copy-before-write.c | 12 +- block/replication.c | 4 +- blockdev.c| 2 +- include/block/block-copy.h| 3 +- include/block/block_int-global-state.h| 2 +- qapi/block-core.json | 4 + tests/qemu-iotests/257.out| 112 ++--- .../qemu-iotests/tests/backup-discard-source | 151 ++ .../tests/backup-discard-source.out | 5 + 11 files changed, 245 insertions(+), 70 deletions(-) create mode 100755 tests/qemu-iotests/tests/backup-discard-source create mode 100644 tests/qemu-iotests/tests/backup-discard-source.out -- 2.34.1
[PATCH v2 2/4] block/copy-before-write: create block_copy bitmap in filter node
Currently block_copy creates copy_bitmap in source node. But that is in bad relation with .independent_close=true of copy-before-write filter: source node may be detached and removed before .bdrv_close() handler called, which should call block_copy_state_free(), which in turn should remove copy_bitmap. That's all not ideal: it would be better if internal bitmap of block-copy object is not attached to any node. But that is not possible now. The simplest solution is just create copy_bitmap in filter node, where anyway two other bitmaps are created. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/block-copy.c | 3 +- block/copy-before-write.c | 2 +- include/block/block-copy.h | 1 + tests/qemu-iotests/257.out | 112 ++--- 4 files changed, 60 insertions(+), 58 deletions(-) diff --git a/block/block-copy.c b/block/block-copy.c index 9ee3dd7ef5..8fca2c3698 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target, } BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp) { @@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, return NULL; } -copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL, +copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL, errp); if (!copy_bitmap) { return NULL; diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 3919d495d7..4cd9b7d91e 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -457,7 +457,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); -s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp); +s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp); if (!s->bcs) { 
error_prepend(errp, "Cannot create block-copy-state: "); return -EINVAL; diff --git a/include/block/block-copy.h b/include/block/block-copy.h index 0700953ab8..8b41643bfa 100644 --- a/include/block/block-copy.h +++ b/include/block/block-copy.h @@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState; typedef struct BlockCopyCallState BlockCopyCallState; BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, + BlockDriverState *copy_bitmap_bs, const BdrvDirtyBitmap *bitmap, Error **errp); diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out index aa76131ca9..c33dd7f3a9 100644 --- a/tests/qemu-iotests/257.out +++ b/tests/qemu-iotests/257.out @@ -120,16 +120,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -596,16 +596,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -865,16 +865,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": false - } -], -"drive0": [ + }, { "busy": false, "count": 0, "granularity": 65536, "persistent": false, "recording": false - }, + } +], +"drive0": [ { "busy": false, "count": 458752, @@ -1341,16 +1341,16 @@ write -P0x67 0x3fe 0x2 "granularity": 65536, "persistent": false, "recording": fa
[PATCH v2 3/4] qapi: blockdev-backup: add discard-source parameter
Add a parameter that enables discard-after-copy. That is mostly useful in "push backup with fleecing" scheme, when source is snapshot-access format driver node, based on copy-before-write filter snapshot-access API: [guest] [snapshot-access] ~~ blockdev-backup ~~> [backup target] || | root | file vv [copy-before-write] | | | file| target v v [active disk] [temp.img] In this case discard-after-copy does two things: - discard data in temp.img to save disk space - avoid further copy-before-write operation in discarded area Note that we have to declare WRITE permission on source in copy-before-write filter, for discard to work. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/backup.c | 5 +++-- block/block-copy.c | 12 ++-- block/copy-before-write.c | 2 +- block/replication.c| 4 ++-- blockdev.c | 2 +- include/block/block-copy.h | 2 +- include/block/block_int-global-state.h | 2 +- qapi/block-core.json | 4 8 files changed, 23 insertions(+), 10 deletions(-) diff --git a/block/backup.c b/block/backup.c index ec29d6b810..0c86b5be55 100644 --- a/block/backup.c +++ b/block/backup.c @@ -356,7 +356,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap, BitmapSyncMode bitmap_mode, - bool compress, + bool compress, bool discard_source, const char *filter_node_name, BackupPerf *perf, BlockdevOnError on_source_error, @@ -491,7 +491,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, job->len = len; job->perf = *perf; -block_copy_set_copy_opts(bcs, perf->use_copy_range, compress); +block_copy_set_copy_opts(bcs, perf->use_copy_range, compress, + discard_source); block_copy_set_progress_meter(bcs, >common.job.progress); block_copy_set_speed(bcs, speed); diff --git a/block/block-copy.c b/block/block-copy.c index 8fca2c3698..eebf643e86 100644 --- a/block/block-copy.c +++ b/block/block-copy.c @@ -137,6 +137,7 @@ typedef struct BlockCopyState { 
CoMutex lock; int64_t in_flight_bytes; BlockCopyMethod method; +bool discard_source; BlockReqList reqs; QLIST_HEAD(, BlockCopyCallState) calls; /* @@ -282,11 +283,12 @@ static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target) } void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range, - bool compress) + bool compress, bool discard_source) { /* Keep BDRV_REQ_SERIALISING set (or not set) in block_copy_state_new() */ s->write_flags = (s->write_flags & BDRV_REQ_SERIALISING) | (compress ? BDRV_REQ_WRITE_COMPRESSED : 0); +s->discard_source = discard_source; if (s->max_transfer < s->cluster_size) { /* @@ -418,7 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target, cluster_size), }; -block_copy_set_copy_opts(s, false, false); +block_copy_set_copy_opts(s, false, false, false); ratelimit_init(>rate_limit); qemu_co_mutex_init(>lock); @@ -589,6 +591,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task) co_put_to_shres(s->mem, t->req.bytes); block_copy_task_end(t, ret); +if (s->discard_source && ret == 0) { +int64_t nbytes = +MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset; +bdrv_co_pdiscard(s->source, t->req.offset, nbytes); +} + return ret; } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 4cd9b7d91e..b208a80ade 100644 --- a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -370,7 +370,7 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role, * works even if source node does not have any parents before backup * start */ -*nperm = *nperm | BLK_PERM_CONSISTENT_READ; +*nperm = *nperm | BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); } } diff --git a/block/replication.c b/block/replication.c index ca6bd0a720..0415a5e8b7 100644 --- a/block/replication.c +++ b/block/replication.c @@ -582,8 +582,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, s->backup_job = 
backup_job_create( NULL, s->secondary_dis