[PATCH v2 09/18] modules: add ui module annotations

2021-06-09 Thread Gerd Hoffmann
Signed-off-by: Gerd Hoffmann 
---
 ui/egl-headless.c | 4 
 ui/gtk.c  | 4 
 ui/sdl2.c | 4 
 ui/spice-app.c| 3 +++
 ui/spice-core.c   | 5 +
 5 files changed, 20 insertions(+)

diff --git a/ui/egl-headless.c b/ui/egl-headless.c
index da377a74af69..75404e0e8700 100644
--- a/ui/egl-headless.c
+++ b/ui/egl-headless.c
@@ -213,3 +213,7 @@ static void register_egl(void)
 }
 
 type_init(register_egl);
+
+#ifdef CONFIG_OPENGL
+module_dep("ui-opengl");
+#endif
diff --git a/ui/gtk.c b/ui/gtk.c
index 98046f577b9d..376b4d528daa 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -2333,3 +2333,7 @@ static void register_gtk(void)
 }
 
 type_init(register_gtk);
+
+#ifdef CONFIG_OPENGL
+module_dep("ui-opengl");
+#endif
diff --git a/ui/sdl2.c b/ui/sdl2.c
index a203cb0239e1..36d9010cb6c1 100644
--- a/ui/sdl2.c
+++ b/ui/sdl2.c
@@ -918,3 +918,7 @@ static void register_sdl1(void)
 }
 
 type_init(register_sdl1);
+
+#ifdef CONFIG_OPENGL
+module_dep("ui-opengl");
+#endif
diff --git a/ui/spice-app.c b/ui/spice-app.c
index 4325ac2d9c54..641f4a9d53e3 100644
--- a/ui/spice-app.c
+++ b/ui/spice-app.c
@@ -221,3 +221,6 @@ static void register_spice_app(void)
 }
 
 type_init(register_spice_app);
+
+module_dep("ui-spice-core");
+module_dep("chardev-spice");
diff --git a/ui/spice-core.c b/ui/spice-core.c
index 272d19b0c152..86d43783acac 100644
--- a/ui/spice-core.c
+++ b/ui/spice-core.c
@@ -1037,3 +1037,8 @@ static void spice_register_config(void)
 qemu_add_opts(_spice_opts);
 }
 opts_init(spice_register_config);
+module_opts("spice");
+
+#ifdef CONFIG_OPENGL
+module_dep("ui-opengl");
+#endif
-- 
2.31.1




[PATCH v2 07/18] modules: add usb-redir module annotations

2021-06-09 Thread Gerd Hoffmann
Signed-off-by: Gerd Hoffmann 
---
 hw/usb/redirect.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c
index 6a75b0dc4ab2..4ec9326e0582 100644
--- a/hw/usb/redirect.c
+++ b/hw/usb/redirect.c
@@ -2608,6 +2608,7 @@ static const TypeInfo usbredir_dev_info = {
 .class_init= usbredir_class_initfn,
 .instance_init = usbredir_instance_init,
 };
+module_obj(TYPE_USB_REDIR);
 
 static void usbredir_register_types(void)
 {
-- 
2.31.1




[PATCH v2 02/18] qapi: add ModuleInfo schema

2021-06-09 Thread Gerd Hoffmann
Add QAPI schema for the module info database.

Signed-off-by: Gerd Hoffmann 
---
 qapi/meson.build  |  1 +
 qapi/modules.json | 36 
 qapi/qapi-schema.json |  1 +
 3 files changed, 38 insertions(+)
 create mode 100644 qapi/modules.json

diff --git a/qapi/meson.build b/qapi/meson.build
index 376f4ceafe74..596aa5d71168 100644
--- a/qapi/meson.build
+++ b/qapi/meson.build
@@ -36,6 +36,7 @@ qapi_all_modules = [
   'migration',
   'misc',
   'misc-target',
+  'modules',
   'net',
   'pragma',
   'qom',
diff --git a/qapi/modules.json b/qapi/modules.json
new file mode 100644
index ..5420977d8765
--- /dev/null
+++ b/qapi/modules.json
@@ -0,0 +1,36 @@
+# -*- Mode: Python -*-
+# vim: filetype=python
+
+##
+# @ModuleInfo:
+#
+# qemu module metadata
+#
+# @name: module name
+#
+# @objs: list of qom objects implemented by the module.
+#
+# @deps: list of other modules this module depends on.
+#
+# @arch: module architecture.
+#
+# @opts: qemu opts implemented by module.
+#
+# Since: 6.1
+##
+{ 'struct': 'ModuleInfo',
+  'data': { 'name'  : 'str',
+'*objs' : ['str'],
+'*deps' : ['str'],
+'*arch' : 'str',
+'*opts' : 'str'}}
+
+##
+# @Modules:
+#
+# qemu module list
+#
+# Since: 6.1
+##
+{ 'struct': 'Modules',
+  'data': { 'list' : ['ModuleInfo']}}
diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
index 4912b9744e69..5baa511c2ff5 100644
--- a/qapi/qapi-schema.json
+++ b/qapi/qapi-schema.json
@@ -93,3 +93,4 @@
 { 'include': 'audio.json' }
 { 'include': 'acpi.json' }
 { 'include': 'pci.json' }
+{ 'include': 'modules.json' }
-- 
2.31.1




[PATCH v2 01/18] modules: add metadata macros, add qxl module annotations

2021-06-09 Thread Gerd Hoffmann
Stealing an idea from the linux kernel:  Place module metadata
in an .modinfo elf section.  This patch adds macros and qxl module
annotations as example.

Signed-off-by: Gerd Hoffmann 
---
 include/qemu/module.h | 22 ++
 hw/display/qxl.c  |  4 
 2 files changed, 26 insertions(+)

diff --git a/include/qemu/module.h b/include/qemu/module.h
index 944d403cbd15..d3cab3c25a2f 100644
--- a/include/qemu/module.h
+++ b/include/qemu/module.h
@@ -73,4 +73,26 @@ bool module_load_one(const char *prefix, const char 
*lib_name, bool mayfail);
 void module_load_qom_one(const char *type);
 void module_load_qom_all(void);
 
+/*
+ * macros to store module metadata in a .modinfo section.
+ * qemu-modinfo utility will collect the metadata.
+ *
+ * Use "objdump -t -s -j .modinfo ${module}.so" to inspect.
+ */
+
+#define ___PASTE(a, b) a##b
+#define __PASTE(a, b) ___PASTE(a, b)
+
+#define modinfo(kind, value) \
+static const char __PASTE(kind, __LINE__)[]  \
+__attribute__((__used__))\
+__attribute__((section(".modinfo"))) \
+__attribute__((aligned(1)))  \
+= stringify(kind) "=" value
+
+#define module_obj(name) modinfo(obj, name)
+#define module_dep(name) modinfo(dep, name)
+#define module_arch(name) modinfo(arch, name)
+#define module_opts(name) modinfo(opts, name)
+
 #endif
diff --git a/hw/display/qxl.c b/hw/display/qxl.c
index 6e1f8ff1b2a7..84f99088e0a0 100644
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -2522,6 +2522,7 @@ static const TypeInfo qxl_primary_info = {
 .parent= TYPE_PCI_QXL,
 .class_init= qxl_primary_class_init,
 };
+module_obj("qxl-vga");
 
 static void qxl_secondary_class_init(ObjectClass *klass, void *data)
 {
@@ -2538,6 +2539,7 @@ static const TypeInfo qxl_secondary_info = {
 .parent= TYPE_PCI_QXL,
 .class_init= qxl_secondary_class_init,
 };
+module_obj("qxl");
 
 static void qxl_register_types(void)
 {
@@ -2547,3 +2549,5 @@ static void qxl_register_types(void)
 }
 
 type_init(qxl_register_types)
+
+module_dep("ui-spice-core");
-- 
2.31.1




Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context

2021-06-09 Thread Nir Soffer
On Wed, Jun 9, 2021 at 9:01 PM Eric Blake  wrote:
>
> When trying to reconstruct a qcow2 chain using information provided
> over NBD, ovirt had been relying on an unsafe assumption that any
> portion of the qcow2 file advertised as sparse would defer to the
> backing image; this worked with what qemu 5.2 reports for a qcow2 BSD
> loaded with "backing":null.  However, in 6.0, commit 0da9856851 (nbd:
> server: Report holes for raw images) also had a side-effect of
> reporting unallocated zero clusters in qcow2 files as sparse.  This
> change is correct from the NBD spec perspective (advertising bits has
> always been optional based on how much information the server has
> available, and should only be used to optimize behavior when a bit is
> set, while not assuming semantics merely because a bit is clear), but
> means that a qcow2 file that uses an unallocated zero cluster to
> override a backing file now shows up as sparse over NBD, and causes
> ovirt to fail to reproduce that cluster (ie. ovirt was assuming it
> only had to write clusters where the bit was clear, and the 6.0
> behavior change shows the flaw in that assumption).
>
> The correct fix is for ovirt to additionally use the
> qemu:allocation-depth metadata context added in 5.2: after all, the
> actual determination for what is needed to recreate a qcow2 file is
> not whether a cluster is sparse, but whether the allocation-depth
> shows the cluster to be local.  But reproducing an image is more
> efficient when handling known-zero clusters, which means that ovirt
> has to track both base:allocation and qemu:allocation-depth metadata
> contexts simultaneously.  While NBD_CMD_BLOCK_STATUS is just fine
> sending back information for two contexts in parallel, it comes with
> some bookkeeping overhead at the client side: the two contexts need
> not report the same length of replies, and it involves more network
> traffic.
>
> So, as a convenience, we can provide yet another metadata context,
> "qemu:joint-allocation", which provides the bulk of the same
> information already available from using "base:allocation" and
> "qemu:allocation-depth" in parallel; the only difference is that an
> allocation depth larger than one is collapsed to a single bit, rather
> than remaining an integer representing actual depth.  By connecting to
> just this context, a client has less work to perform while still
> getting at all pieces of information needed to recreate a qcow2
> backing chain.

Providing extended allocation is awesome, and makes client life much
easier. But I'm not sure about the name, that comes from "joining"
"base:allocation" and "qemu:allocation-depth". This is correct when
thinking about qemu internals, but this is not really getting both, since
"qemu:allocation-depth" is reduced to local and backing.

From a client point of view, I think this is best described as
"qemu:allocation"
which is an extension to NBD protocol, providing the same HOLE and ZERO
bits, and qemu specific info LOCAL, BACKING. Using different "namespace"
("qemu" vs "base") makes it clear that this is not the same.

We discussed in the past the option to expose also the dirty status of every
block in the response. Again this info is available using
"qemu:dirty-bitmap:xxx"
but just like allocation depth and base allocation, merging the results is hard
and if we could expose also the dirty bit, this can make clients life
even better.
In this case I'm not sure "qemu:allocation" is the best name, maybe something
more generic like "qemu:extents" or "qemu:block-status" is even better.

> With regards to exposing this new feature from qemu as NBD server, it
> is sufficient to reuse the existing 'qemu-nbd -A': since that already
> exposes allocation depth, it does not hurt to advertise two separate
> qemu:XXX metadata contexts at once for two different views of
> allocation depth.  And just because the server supports multiple
> contexts does not mean a client will want or need to connect to
> everything available.  On the other hand, the existing hack of using
> the qemu NBD client option of x-dirty-bitmap to select an alternative
> context from the client does NOT make it possible to read the extra
> information exposed by the new metadata context.  For now, you MUST
> use something like libnbd's 'nbdinfo --map=qemu:joint-allocation' in
> order to properly see all four bits in action:

Makes sense.

> # Create a qcow2 image with a raw backing file:
> $ qemu-img create base.raw $((4*64*1024))
> $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2
>
> # Write to first 3 clusters of base:
> $ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \
>   -c "w -P 67 128k 64k" base.raw
>
> # Write to second and third clusters of top, hiding base:
> $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2

Looks familiar but nicer :-)

> # Expose top.qcow2 without backing file over NBD
> $ ./qemu-nbd -r -t -f qcow2 -A 'json:{"driver":"qcow2", "backing":null, \
>   "file":{"driver":"file", "filename":"top.qcow2"}}'

Re: [RFC libnbd PATCH] info: Add support for new qemu:joint-allocation

2021-06-09 Thread Nir Soffer
On Thu, Jun 10, 2021 at 12:32 AM Eric Blake  wrote:
>
> Qemu is adding qemu:joint-allocation as a single context combining the
> two bits of base:allocation and a compression of qemu:allocation-depth
> into two bits [1].  Decoding the bits makes it easier for humans to
> see the result of that context.
>
> [1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02446.html
> ---
>
> Obviously, this libnbd patch should only go in if the qemu RFC is
> accepted favorably.  With this patch applied, the example listed in my
> qemu patch 2/2 commit message [2] becomes
>
> $ ~/libnbd/run nbdinfo --map=qemu:joint-allocation nbd://localhost
>  0   655363  hole,zero,unallocated
>  65536   655364  allocated,local
> 131072   655367  hole,zero,local
> 196608   655363  hole,zero,unallocated
>
> [2] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02448.html
>
> For what it's worth, you can also play with the qemu+libnbd patches at:
> https://repo.or.cz/qemu/ericb.git/ master
> https://repo.or.cz/libnbd/ericb.git/ master
>
> (I sometimes rewind those branches, but they'll be stable for at least
> a few days after this email)
>
>  info/map.c | 21 +
>  1 file changed, 21 insertions(+)
>
> diff --git a/info/map.c b/info/map.c
> index ae6d4fe..21e8657 100644
> --- a/info/map.c
> +++ b/info/map.c
> @@ -226,6 +226,27 @@ extent_description (const char *metacontext, uint32_t 
> type)
>return ret;
>  }
>}
> +  else if (strcmp (metacontext, "qemu:joint-allocation") == 0) {
> +/* Combo of base:allocation and stripped-down qemu:allocation-depth */
> +const char *base, *depth;
> +switch (type & 3) {
> +case 0: base = "allocated"; break;
> +case 1: base = "hole"; break;
> +case 2: base = "zero"; break;
> +case 3: base = "hole,zero"; break;
> +}
> +switch (type & 0xc) {
> +case 0: depth = "unallocated"; break;

Is this possible? qemu reports BDRV_BLOCK_DATA but not BDRV_BLOCK_ALLOCATED?

Anyway this seems like a valid way to present qemu response.

> +case 4: depth = "local"; break;
> +case 8: depth = "backing"; break;
> +case 12: depth = ""; break;

This should not be possible based on the qemu patch, but printing this
seems like a good solution, and can help to debug such an issue.

Thinking about client code trying to copy extents based on the flags,
the client should abort the operation since qemu response is invalid.

> +}
> +if (asprintf (&ret, "%s,%s", base, depth) == -1) {
> +  perror ("asprintf");
> +  exit (EXIT_FAILURE);
> +}
> +return ret;
> +  }
>
>return NULL;   /* Don't know - description field will be omitted. */
>  }
> --
> 2.31.1
>




[RFC libnbd PATCH] info: Add support for new qemu:joint-allocation

2021-06-09 Thread Eric Blake
Qemu is adding qemu:joint-allocation as a single context combining the
two bits of base:allocation and a compression of qemu:allocation-depth
into two bits [1].  Decoding the bits makes it easier for humans to
see the result of that context.

[1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02446.html
---

Obviously, this libnbd patch should only go in if the qemu RFC is
accepted favorably.  With this patch applied, the example listed in my
qemu patch 2/2 commit message [2] becomes

$ ~/libnbd/run nbdinfo --map=qemu:joint-allocation nbd://localhost
 0   655363  hole,zero,unallocated
 65536   655364  allocated,local
131072   655367  hole,zero,local
196608   655363  hole,zero,unallocated

[2] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02448.html

For what it's worth, you can also play with the qemu+libnbd patches at:
https://repo.or.cz/qemu/ericb.git/ master
https://repo.or.cz/libnbd/ericb.git/ master

(I sometimes rewind those branches, but they'll be stable for at least
a few days after this email)

 info/map.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/info/map.c b/info/map.c
index ae6d4fe..21e8657 100644
--- a/info/map.c
+++ b/info/map.c
@@ -226,6 +226,27 @@ extent_description (const char *metacontext, uint32_t type)
   return ret;
 }
   }
+  else if (strcmp (metacontext, "qemu:joint-allocation") == 0) {
+/* Combo of base:allocation and stripped-down qemu:allocation-depth */
+const char *base, *depth;
+switch (type & 3) {
+case 0: base = "allocated"; break;
+case 1: base = "hole"; break;
+case 2: base = "zero"; break;
+case 3: base = "hole,zero"; break;
+}
+switch (type & 0xc) {
+case 0: depth = "unallocated"; break;
+case 4: depth = "local"; break;
+case 8: depth = "backing"; break;
+case 12: depth = ""; break;
+}
+if (asprintf (&ret, "%s,%s", base, depth) == -1) {
+  perror ("asprintf");
+  exit (EXIT_FAILURE);
+}
+return ret;
+  }

   return NULL;   /* Don't know - description field will be omitted. */
 }
-- 
2.31.1




Re: [PATCH 4/4] Jobs based on custom runners: add CentOS Stream 8

2021-06-09 Thread Cleber Rosa Junior
On Tue, Jun 8, 2021 at 10:10 AM Cleber Rosa  wrote:
>
> This introduces three different parts of a job designed to run
> on a custom runner managed by Red Hat.  The goals include:
>
>  a) serve as a model for other organizations that want to onboard
> their own runners, with their specific platforms, build
> configuration and tests.
>
>  b) bring awareness to the differences between upstream QEMU and the
> version available under CentOS Stream, which is "A preview of
> upcoming Red Hat Enterprise Linux minor and major releases.".
>
>  c) because of b), it should be easier to identify and reduce the gap
> between Red Hat's downstream and upstream QEMU.
>
> The components themselves to achieve this custom job are:
>
>  1) build environment configuration: documentation and a playbook for
> a base Enterprise Linux 8 system (also applicable to CentOS
> Stream), which other users can run on their system to get the
> environment suitable for building QEMU.
>
>  2) QEMU build configuration: how QEMU will be built to match, as
> closely as possible, the binaries built and packaged on CentOS
> stream 8.
>
>  3) job definition: GitLab CI jobs that will dispatch the build/test
> job to the machine specifically configured according to #1.
>
> Signed-off-by: Cleber Rosa 
> ---
>  .gitlab-ci.d/custom-runners.yml|  29 
>  scripts/ci/org.centos/stream/README|   2 +
>  scripts/ci/org.centos/stream/configure | 190 +
>  scripts/ci/setup/build-environment.yml |  38 +
>  4 files changed, 259 insertions(+)
>  create mode 100644 scripts/ci/org.centos/stream/README
>  create mode 100755 scripts/ci/org.centos/stream/configure
>
> diff --git a/.gitlab-ci.d/custom-runners.yml b/.gitlab-ci.d/custom-runners.yml
> index 061d3cdfed..ee5143995e 100644
> --- a/.gitlab-ci.d/custom-runners.yml
> +++ b/.gitlab-ci.d/custom-runners.yml
> @@ -220,3 +220,32 @@ ubuntu-20.04-aarch64-notcg:
>   - ../configure --disable-libssh --disable-tcg
>   - make --output-sync -j`nproc`
>   - make --output-sync -j`nproc` check V=1
> +
> +centos-stream-8-x86_64:
> + allow_failure: true
> + needs: []
> + stage: build
> + tags:
> + - centos_stream_8
> + - x86_64
> + rules:
> + - if: '$CI_COMMIT_BRANCH =~ /^staging/'
> + artifacts:
> +   name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
> +   when: on_failure
> +   expire_in: 7 days
> +   paths:
> + - build/tests/results/latest/results.xml
> + - build/tests/results/latest/test-results
> +   reports:
> + junit: build/tests/results/latest/results.xml
> + script:
> + - mkdir build
> + - cd build
> + - ../scripts/ci/org.centos/stream/configure
> + - make --output-sync -j`nproc`
> + - make --output-sync -j`nproc` check V=1
> + - make get-vm-images
> + # Only run tests that are either marked explicitly for KVM and x86_64
> + # or tests that are supposed to be valid for all targets
> + - ./tests/venv/bin/avocado run --job-results-dir=tests/results/ 
> --filter-by-tags-include-empty --filter-by-tags-include-empty-key -t 
> accel:kvm,arch:x86_64 -- tests/acceptance/
> diff --git a/scripts/ci/org.centos/stream/README 
> b/scripts/ci/org.centos/stream/README
> new file mode 100644
> index 00..f99bda99b8
> --- /dev/null
> +++ b/scripts/ci/org.centos/stream/README
> @@ -0,0 +1,2 @@
> +This directory contains scripts for generating a build of QEMU that
> +closely matches the CentOS Stream builds of the qemu-kvm package.
> diff --git a/scripts/ci/org.centos/stream/configure 
> b/scripts/ci/org.centos/stream/configure
> new file mode 100755
> index 00..1e7207faec
> --- /dev/null
> +++ b/scripts/ci/org.centos/stream/configure
> @@ -0,0 +1,190 @@
> +#!/bin/sh -e
> +../configure \
> +--prefix="/usr" \
> +--libdir="/usr/lib64" \
> +--datadir="/usr/share" \
> +--sysconfdir="/etc" \
> +--interp-prefix=/usr/qemu-%M \
> +--localstatedir="/var" \
> +--docdir="/usr/share/doc" \
> +--libexecdir="/usr/libexec" \
> +--extra-ldflags="-Wl,--build-id -Wl,-z,relro -Wl,-z,now" \
> +--extra-cflags="-O2 -g -pipe -Wall -Werror=format-security 
> -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fexceptions 
> -fstack-protector-strong -grecord-gcc-switches 
> -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 
> -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -mtune=generic 
> -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection" \
> +--with-suffix="qemu-kvm" \
> +--firmwarepath=/usr/share/qemu-firmware \
> +--meson="/usr/bin/meson" \
> +--target-list="x86_64-softmmu" \
> +--block-drv-rw-whitelist=qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster
>  \
> +--audio-drv-list= \
> +--block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
> +--with-coroutine=ucontext \
> +--with-git=git \
> +--tls-priority=@QEMU,SYSTEM \
> +--disable-attr \
> +--disable-auth-pam \
> +--disable-avx2 \
> +--disable-avx512f \
> +--disable-bochs \
> +--disable-brlapi \
> +--disable-bsd-user \
> +--disable-bzip2 \
> 

Re: [PATCH v2 2/2] hw/nvme: documentation fix

2021-06-09 Thread Klaus Jensen

On Jun  1 20:32, Gollu Appalanaidu wrote:

In the documentation of the '-detached' param "be" and "not" has been
used side by side, fix that.

Signed-off-by: Gollu Appalanaidu 
---
hw/nvme/ctrl.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 813a72c655..a3df26d0ce 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -114,7 +114,7 @@
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
- *   namespace will be be available in the subsystem not not attached to any
+ *   namespace will be available in the subsystem not attached to any


namespace will be available in the subsystem *but* not attached to any


 *   controllers.
 *
 * Setting `zoned` to true selects Zoned Command Set at the namespace.
--
2.17.1





signature.asc
Description: PGP signature


Re: [PATCH v2 1/2] hw/nvme: fix endianess conversion and add controller list

2021-06-09 Thread Klaus Jensen

On Jun  1 20:32, Gollu Appalanaidu wrote:

Add the controller identifiers list CNS 0x13, available list of ctrls
in NVM Subsystem that may or may not be attached to namespaces.

In Identify Ctrl List of the CNS 0x12 and 0x13 no endian conversion
for the nsid field.

Signed-off-by: Gollu Appalanaidu 

-v2:
Fix the review comments from Klaus and squashed 2nd commit into
1st commit

---
hw/nvme/ctrl.c   | 26 --
hw/nvme/trace-events |  2 +-
include/block/nvme.h |  1 +
3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 2e7498a73e..813a72c655 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4251,9 +4251,11 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
return NVME_INVALID_CMD_SET | NVME_DNR;
}

-static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
+bool attached)
{
NvmeIdentify *c = (NvmeIdentify *)>cmd;
+uint32_t nsid = le32_to_cpu(c->nsid);
uint16_t min_id = le16_to_cpu(c->ctrlid);
uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
uint16_t *ids = [1];
@@ -4261,15 +4263,17 @@ static uint16_t nvme_identify_ns_attached_list(NvmeCtrl 
*n, NvmeRequest *req)
NvmeCtrl *ctrl;
int cntlid, nr_ids = 0;

-trace_pci_nvme_identify_ns_attached_list(min_id);
+trace_pci_nvme_identify_ctrl_list(c->cns, min_id);

-if (c->nsid == NVME_NSID_BROADCAST) {
-return NVME_INVALID_FIELD | NVME_DNR;
-}
+if (attached) {
+if (nsid == NVME_NSID_BROADCAST) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}

-ns = nvme_subsys_ns(n->subsys, c->nsid);
-if (!ns) {
-return NVME_INVALID_FIELD | NVME_DNR;
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
}

for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {


Assume that `attached` is false and `n->subsys` is NULL.

KABM :)


signature.asc
Description: PGP signature


Re: [PATCH v2] hw/nvme/ctrl: fix csi field for cns 0x00 and 0x11

2021-06-09 Thread Klaus Jensen

On Apr 27 12:00, Gollu Appalanaidu wrote:

As per the TP 4056d Namespace types CNS 0x00 and CNS 0x11
CSI field shouldn't use but it is being used for these two
Identify command CNS values, fix that.

Remove 'nvme_csi_has_nvm_support()' helper as suggested by
Klaus we can safely assume NVM command set support for all
namespaces.

Suggested-by: Klaus Jensen 
Signed-off-by: Gollu Appalanaidu 
---
-v2: add sugggestions from Klaus
We can Remove 'nvme_csi_has_nvm_support()' helper, we can
assume NVM command set support for all namespaces.

hw/nvme/ctrl.c | 14 ++
1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 2e7498a73e..7fcd699235 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4178,16 +4178,6 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, 
NvmeRequest *req)
return nvme_c2h(n, id, sizeof(id), req);
}

-static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
-{
-switch (ns->csi) {
-case NVME_CSI_NVM:
-case NVME_CSI_ZONED:
-return true;
-}
-return false;
-}
-
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
trace_pci_nvme_identify_ctrl();
@@ -4244,7 +4234,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest 
*req, bool active)
}
}

-if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+if (active || ns->csi == NVME_CSI_NVM) {
return nvme_c2h(n, (uint8_t *)>id_ns, sizeof(NvmeIdNs), req);
}

@@ -4315,7 +4305,7 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req,
}
}

-if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+if (c->csi == NVME_CSI_NVM) {
return nvme_rpt_empty_id_struct(n, req);
} else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
--
2.17.1



Applied to nvme-next. Thanks!


signature.asc
Description: PGP signature


Re: [PATCH v3 19/33] block/nbd: split nbd_handle_updated_info out of nbd_client_handshake()

2021-06-09 Thread Eric Blake
On Wed, Jun 09, 2021 at 08:23:06PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > +if (s->x_dirty_bitmap) {
> > > +if (!s->info.base_allocation) {
> > > +error_setg(errp, "requested x-dirty-bitmap %s not found",
> > > +   s->x_dirty_bitmap);
> > > +return -EINVAL;
> > > +}
> > > +if (strcmp(s->x_dirty_bitmap, "qemu:allocation-depth") == 0) {
> > > +s->alloc_depth = true;
> > > +}
> > > +}
> > > +
> > > +if (s->info.flags & NBD_FLAG_READ_ONLY) {
> > > +ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", 
> > > errp);
> > > +if (ret < 0) {
> > > +return ret;
> > > +}
> > > +}
> > > +
> > > +if (s->info.flags & NBD_FLAG_SEND_FUA) {
> > > +bs->supported_write_flags = BDRV_REQ_FUA;
> > > +bs->supported_zero_flags |= BDRV_REQ_FUA;
> > 
> > Code motion, so it is correct, but it looks odd to use = for one
> > assignment and |= for the other.  Using |= in both places would be
> > more consistent.
> 
> Actually I see bugs here:
> 
> 1. we should do =, not |=, as on reconnect info changes, so we should reset 
> supported flags.
> 
> 2. in-fligth requests that are in retying loops are not prepared to flags 
> changing. I afraid, that some malicious server may even do some bad thing
> 
> Still, let's fix it after these series. To avoid more conflicts.

Oh, you raise some good points.  And it's not just bs->*flags; qemu as
server uses constant metacontext ids (base:allocation is always
context 0), but even that might not be stable across reconnect.  For
example, with my proposed patch of adding qemu:joint-allocation
metacontext, if the reason we have to reconnect is because the server
is upgrading from qemu 6.0 to 6.1 temporarily bouncing the server, and
the client was paying attention to qemu:dirty-bitmap:FOO, that context
would now have a different id.

Yeah, making this code safer across potential changes in server
information (either to fail the reconnect because the reconnected
server dropped something we were previously depending on, or
gracefully handling the downgrade, or ...) is worth leaving for a
later series while we focus on the more immediate issue of making
reconnect itself stable.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




[PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context

2021-06-09 Thread Eric Blake
When trying to reconstruct a qcow2 chain using information provided
over NBD, ovirt had been relying on an unsafe assumption that any
portion of the qcow2 file advertised as sparse would defer to the
backing image; this worked with what qemu 5.2 reports for a qcow2 BSD
loaded with "backing":null.  However, in 6.0, commit 0da9856851 (nbd:
server: Report holes for raw images) also had a side-effect of
reporting unallocated zero clusters in qcow2 files as sparse.  This
change is correct from the NBD spec perspective (advertising bits has
always been optional based on how much information the server has
available, and should only be used to optimize behavior when a bit is
set, while not assuming semantics merely because a bit is clear), but
means that a qcow2 file that uses an unallocated zero cluster to
override a backing file now shows up as sparse over NBD, and causes
ovirt to fail to reproduce that cluster (ie. ovirt was assuming it
only had to write clusters where the bit was clear, and the 6.0
behavior change shows the flaw in that assumption).

The correct fix is for ovirt to additionally use the
qemu:allocation-depth metadata context added in 5.2: after all, the
actual determination for what is needed to recreate a qcow2 file is
not whether a cluster is sparse, but whether the allocation-depth
shows the cluster to be local.  But reproducing an image is more
efficient when handling known-zero clusters, which means that ovirt
has to track both base:allocation and qemu:allocation-depth metadata
contexts simultaneously.  While NBD_CMD_BLOCK_STATUS is just fine
sending back information for two contexts in parallel, it comes with
some bookkeeping overhead at the client side: the two contexts need
not report the same length of replies, and it involves more network
traffic.

So, as a convenience, we can provide yet another metadata context,
"qemu:joint-allocation", which provides the bulk of the same
information already available from using "base:allocation" and
"qemu:allocation-depth" in parallel; the only difference is that an
allocation depth larger than one is collapsed to a single bit, rather
than remaining an integer representing actual depth.  By connecting to
just this context, a client has less work to perform while still
getting at all pieces of information needed to recreate a qcow2
backing chain.

With regards to exposing this new feature from qemu as NBD server, it
is sufficient to reuse the existing 'qemu-nbd -A': since that already
exposes allocation depth, it does not hurt to advertise two separate
qemu:XXX metadata contexts at once for two different views of
allocation depth.  And just because the server supports multiple
contexts does not mean a client will want or need to connect to
everything available.  On the other hand, the existing hack of using
the qemu NBD client option of x-dirty-bitmap to select an alternative
context from the client does NOT make it possible to read the extra
information exposed by the new metadata context.  For now, you MUST
use something like libnbd's 'nbdinfo --map=qemu:joint-allocation' in
order to properly see all four bits in action:

# Create a qcow2 image with a raw backing file:
$ qemu-img create base.raw $((4*64*1024))
$ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2

# Write to first 3 clusters of base:
$ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \
  -c "w -P 67 128k 64k" base.raw

# Write to second and third clusters of top, hiding base:
$ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2

# Expose top.qcow2 without backing file over NBD
$ ./qemu-nbd -r -t -f qcow2 -A 'json:{"driver":"qcow2", "backing":null, \
  "file":{"driver":"file", "filename":"top.qcow2"}}'
$ nbdinfo --map=qemu:joint-allocation nbd://localhost
 0       65536   3
 65536   65536   4
 131072  65536   7
 196608  65536   3

[This was output from nbdinfo 1.8.0; a later version will also add a
column to decode the bits into human-readable strings]

Additionally, later qemu patches may try to improve qemu-img to
automatically take advantage of additional NBD context information,
without having to use x-dirty-bitmap.

Reported-by: Nir Soffer 
Resolves: https://bugzilla.redhat.com/1968693
Signed-off-by: Eric Blake 
---
 docs/interop/nbd.txt  | 31 ++-
 docs/tools/qemu-nbd.rst   |  4 +-
 qapi/block-export.json|  4 +-
 include/block/nbd.h   | 10 ++-
 nbd/server.c  | 87 +--
 .../tests/nbd-qemu-allocation.out |  3 +-
 6 files changed, 125 insertions(+), 14 deletions(-)

diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt
index 10ce098a29bf..cc8ce2d5389f 100644
--- a/docs/interop/nbd.txt
+++ b/docs/interop/nbd.txt
@@ -17,7 +17,7 @@ namespace "qemu".

 == "qemu" namespace ==

-The "qemu" namespace currently contains two additional metadata contexts:

[RFC PATCH 0/2] New NBD metacontext

2021-06-09 Thread Eric Blake
This is my counter-proposal to Nir's request [1] to revert a 6.0
behavior change.  It does not expose any new information over NBD, but
does make it easier to collect necessary information from a single
context rather than requiring the client to have to request two
contexts in parallel, then cross-correlate what may be different
extent lengths between those contexts.  Furthermore, this is easy to
backport to downstream based on qemu 6.0, at which point clients could
use the existence or absence of qemu:joint-allocation as a witness of
whether it can get away with trusting base:allocation when trying to
recreate a qcow2 backing chain.

[1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg01796.html

Things I still want to do:
- a followup patch to libnbd to teach 'nbdinfo
  --map=qemu:joint-allocation' to decode the bits
- teach 'nbdinfo --map' to read all available contexts, instead of
  having to manually type each map besides base:allocation
- potential followup patches to qemu to automatically feed this
  information through qemu-img map:
  - add a new BDRV_BLOCK_BACKING bit for bdrv_block_status(), with
opposite semantics from BDRV_BLOCK_ALLOCATED, but where the only
thing known is that the data is not local (not how deep it is)
  - teach qemu to favor qemu:joint-allocation over base:allocation
when available, and use it to drive BDRV_BLOCK_BACKING
  - teach qemu-img map to recognize BDRV_BLOCK_BACKING

Eric Blake (2):
  iotests: Improve and rename test 309 to nbd-qemu-allocation
  nbd: Add new qemu:joint-allocation metadata context

 docs/interop/nbd.txt  | 31 ++-
 docs/tools/qemu-nbd.rst   |  4 +-
 qapi/block-export.json|  4 +-
 include/block/nbd.h   | 10 ++-
 nbd/server.c  | 87 +--
 .../{309 => tests/nbd-qemu-allocation}|  5 +-
 .../nbd-qemu-allocation.out}  | 13 ++-
 7 files changed, 139 insertions(+), 15 deletions(-)
 rename tests/qemu-iotests/{309 => tests/nbd-qemu-allocation} (95%)
 rename tests/qemu-iotests/{309.out => tests/nbd-qemu-allocation.out} (79%)

-- 
2.31.1




[PATCH 1/2] iotests: Improve and rename test 309 to nbd-qemu-allocation

2021-06-09 Thread Eric Blake
Enhance the test to inspect what qemu-nbd is advertising during
handshake, and rename it now that we support useful iotest names.

Signed-off-by: Eric Blake 
---
 .../qemu-iotests/{309 => tests/nbd-qemu-allocation}  |  5 -
 .../{309.out => tests/nbd-qemu-allocation.out}   | 12 +++-
 2 files changed, 15 insertions(+), 2 deletions(-)
 rename tests/qemu-iotests/{309 => tests/nbd-qemu-allocation} (95%)
 rename tests/qemu-iotests/{309.out => tests/nbd-qemu-allocation.out} (81%)

diff --git a/tests/qemu-iotests/309 
b/tests/qemu-iotests/tests/nbd-qemu-allocation
similarity index 95%
rename from tests/qemu-iotests/309
rename to tests/qemu-iotests/tests/nbd-qemu-allocation
index b90b279994c9..4ee73db8033b 100755
--- a/tests/qemu-iotests/309
+++ b/tests/qemu-iotests/tests/nbd-qemu-allocation
@@ -3,7 +3,7 @@
 #
 # Test qemu-nbd -A
 #
-# Copyright (C) 2018-2020 Red Hat, Inc.
+# Copyright (C) 2018-2021 Red Hat, Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -32,6 +32,7 @@ _cleanup()
 trap "_cleanup; exit \$status" 0 1 2 3 15

 # get standard environment, filters and checks
+cd ..
 . ./common.rc
 . ./common.filter
 . ./common.nbd
@@ -57,6 +58,8 @@ echo
 $QEMU_IMG map --output=json -f qcow2 "$TEST_IMG"
 IMG="driver=nbd,server.type=unix,server.path=$nbd_unix_socket"
 nbd_server_start_unix_socket -r -f qcow2 -A "$TEST_IMG"
+# Inspect what the server is exposing
+$QEMU_NBD --list -k $nbd_unix_socket
 # Normal -f raw NBD block status loses access to allocation information
 $QEMU_IMG map --output=json --image-opts \
 "$IMG" | _filter_qemu_img_map
diff --git a/tests/qemu-iotests/309.out 
b/tests/qemu-iotests/tests/nbd-qemu-allocation.out
similarity index 81%
rename from tests/qemu-iotests/309.out
rename to tests/qemu-iotests/tests/nbd-qemu-allocation.out
index db75bb6b0df9..c51022b2a38d 100644
--- a/tests/qemu-iotests/309.out
+++ b/tests/qemu-iotests/tests/nbd-qemu-allocation.out
@@ -1,4 +1,4 @@
-QA output created by 309
+QA output created by nbd-qemu-allocation

 === Initial image setup ===

@@ -14,6 +14,16 @@ wrote 2097152/2097152 bytes at offset 1048576
 [{ "start": 0, "length": 1048576, "depth": 1, "zero": false, "data": true, 
"offset": 327680},
 { "start": 1048576, "length": 2097152, "depth": 0, "zero": false, "data": 
true, "offset": 327680},
 { "start": 3145728, "length": 1048576, "depth": 1, "zero": true, "data": 
false}]
+exports available: 1
+ export: ''
+  size:  4194304
+  flags: 0x58f ( readonly flush fua df multi cache )
+  min block: 1
+  opt block: 4096
+  max block: 33554432
+  available meta contexts: 2
+   base:allocation
+   qemu:allocation-depth
 [{ "start": 0, "length": 3145728, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
 { "start": 3145728, "length": 1048576, "depth": 0, "zero": true, "data": 
false, "offset": OFFSET}]
 [{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": true, 
"offset": OFFSET},
-- 
2.31.1




Re: [PATCH] block: Move read-only check during truncation earlier

2021-06-09 Thread Kevin Wolf
Am 09.06.2021 um 18:30 hat Eric Blake geschrieben:
> No need to start a tracked request that will always fail.  The choice
> to check read-only after bdrv_inc_in_flight() predates 1bc5f09f2e
> (block: Use tracked request for truncate), but waiting for serializing
> requests can make the effect more noticeable.
> 
> Signed-off-by: Eric Blake 

Thanks, applied to the block branch.

Kevin




Re: [PATCH v3 19/33] block/nbd: split nbd_handle_updated_info out of nbd_client_handshake()

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

03.06.2021 19:29, Eric Blake wrote:

On Fri, Apr 16, 2021 at 11:08:57AM +0300, Vladimir Sementsov-Ogievskiy wrote:

To be reused in the following patch.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/nbd.c | 99 ++---
  1 file changed, 57 insertions(+), 42 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 5e63caaf4b..03ffe95231 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -318,6 +318,50 @@ static bool nbd_client_connecting_wait(BDRVNBDState *s)
   return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT;
  }
  
+/*

+ * Check s->info updated by negotiation process.


The parameter name is bs, not s; so this comment is a bit confusing...


+ * Update @bs correspondingly to new options.
+ */
+static int nbd_handle_updated_info(BlockDriverState *bs, Error **errp)
+{
+BDRVNBDState *s = (BDRVNBDState *)bs->opaque;


...until here.  Maybe rewrite the entire comment as:

Update @bs with information learned during a completed negotiation
process.  Return failure if the server's advertised options are
incompatible with the client's needs.


+int ret;
+
+if (s->x_dirty_bitmap) {
+if (!s->info.base_allocation) {
+error_setg(errp, "requested x-dirty-bitmap %s not found",
+   s->x_dirty_bitmap);
+return -EINVAL;
+}
+if (strcmp(s->x_dirty_bitmap, "qemu:allocation-depth") == 0) {
+s->alloc_depth = true;
+}
+}
+
+if (s->info.flags & NBD_FLAG_READ_ONLY) {
+ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
+if (ret < 0) {
+return ret;
+}
+}
+
+if (s->info.flags & NBD_FLAG_SEND_FUA) {
+bs->supported_write_flags = BDRV_REQ_FUA;
+bs->supported_zero_flags |= BDRV_REQ_FUA;


Code motion, so it is correct, but it looks odd to use = for one
assignment and |= for the other.  Using |= in both places would be
more consistent.


Actually I see bugs here:

1. we should do =, not |=, as on reconnect info changes, so we should reset 
supported flags.

2. in-fligth requests that are in retying loops are not prepared to flags 
changing. I afraid, that some malicious server may even do some bad thing

Still, let's fix it after these series. To avoid more conflicts.




+}
+
+if (s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
+bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
+if (s->info.flags & NBD_FLAG_SEND_FAST_ZERO) {
+bs->supported_zero_flags |= BDRV_REQ_NO_FALLBACK;
+}
+}
+
+trace_nbd_client_handshake_success(s->export);
+
+return 0;
+}
+
  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
  {
  int ret;
@@ -1579,49 +1623,13 @@ static int nbd_client_handshake(BlockDriverState *bs, 
Error **errp)


As updating the comment doesn't affect code correctness,
Reviewed-by: Eric Blake 




--
Best regards,
Vladimir



Re: [PATCH] block: Move read-only check during truncation earlier

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 19:30, Eric Blake wrote:

No need to start a tracked request that will always fail.  The choice
to check read-only after bdrv_inc_in_flight() predates 1bc5f09f2e
(block: Use tracked request for truncate), but waiting for serializing
requests can make the effect more noticeable.

Signed-off-by: Eric Blake


Reviewed-by: Vladimir Sementsov-Ogievskiy 


--
Best regards,
Vladimir



Re: [PATCH v4 0/6] Allow changing bs->file on reopen

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 18:53, Kevin Wolf wrote:

Am 14.05.2021 um 17:53 hat Vladimir Sementsov-Ogievskiy geschrieben:

Hi Alberto!

What are your plans for v5? I'm now finishing a new series which makes
backup-top filter public, and I want to base it on your series
(otherwise I can't add a test).


Berto, where are we with this? I see that Vladimir picked up one or two
patches for his series, but I think we still need a v5 at least for the
rest?

If you can't find the time, should someone else pick up all patches?

Kevin


My "[PATCH v5 0/9] Allow changing bs->file on reopen" supersedes the "subject" part of 
the series. I think we now should start from taking it. Hmm, and I should check, does it conflict with recently 
merged block-permission-folloup and with beginning of "[PATCH v4 00/35] block: publish backup-top 
filter" which is already almost reviewed by Max and should land soon I hope (ohh, seems I should issue v5 
for python conflictes).

So, I propose the following plan:

1. I'll rebase and send "block: publish backup-top filter" series 
today-tomorrow. It's big, and mostly reviewed, let's not lose r-bs by rebases.

2. I'll rebase and send if needed (if it conflicts with master and/or [1]) "[PATCH v5 
0/9] Allow changing bs->file on reopen"

3. Then we'll decide what to do with the rest. Finally, I can take it if I have 
some time (the head is spinning from the number of tasks ;)

I also think that we can drop x- prefix even without supporting of multiple 
reopen, and implement it later as an option. QAPI interface is powerful enough 
for such enhancements.




17.03.2021 20:15, Alberto Garcia wrote:

Based-on: <20210317143529.615584-1-vsement...@virtuozzo.com>

Hello,

this is the same as v3, but rebased on top of Vladimir's "block:
update graph permissions update v3", which you can get here:

git: https://src.openvz.org/scm/~vsementsov/qemu.git
tag: up-block-topologic-perm-v3

Tip: you may find it easier to review patch 4 if you use 'git diff -w'
since a big part of the changes that you see in
qmp_x_blockdev_reopen() are just indentation changes.

Berto

v4:
- Rebase on top of version 3 of Vladimir's branch
v3: https://lists.gnu.org/archive/html/qemu-block/2021-03/msg00553.html
v2: https://lists.gnu.org/archive/html/qemu-block/2021-02/msg00623.html
v1: https://lists.gnu.org/archive/html/qemu-block/2021-01/msg00437.html

Output of git backport-diff against v3:

Key:
[] : patches are identical
[] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/6:[] [--] 'block: Add bdrv_reopen_queue_free()'
002/6:[0018] [FC] 'block: Allow changing bs->file on reopen'
003/6:[] [--] 'iotests: Test replacing files with x-blockdev-reopen'
004/6:[0071] [FC] 'block: Support multiple reopening with x-blockdev-reopen'
005/6:[] [--] 'iotests: Test reopening multiple devices at the same time'
006/6:[] [-C] 'block: Make blockdev-reopen stable API'

Alberto Garcia (6):
block: Add bdrv_reopen_queue_free()
block: Allow changing bs->file on reopen
iotests: Test replacing files with x-blockdev-reopen
block: Support multiple reopening with x-blockdev-reopen
iotests: Test reopening multiple devices at the same time
block: Make blockdev-reopen stable API

   qapi/block-core.json   |  24 ++---
   include/block/block.h  |   2 +
   block.c| 135 --
   blockdev.c |  78 +--
   tests/qemu-iotests/155 |   9 +-
   tests/qemu-iotests/165 |   4 +-
   tests/qemu-iotests/245 | 190 +
   tests/qemu-iotests/245.out |  11 ++-
   tests/qemu-iotests/248 |   4 +-
   tests/qemu-iotests/248.out |   2 +-
   tests/qemu-iotests/296 |  11 ++-
   tests/qemu-iotests/298 |   4 +-
   12 files changed, 351 insertions(+), 123 deletions(-)




--
Best regards,
Vladimir






--
Best regards,
Vladimir



[PATCH] block: Move read-only check during truncation earlier

2021-06-09 Thread Eric Blake
No need to start a tracked request that will always fail.  The choice
to check read-only after bdrv_inc_in_flight() predates 1bc5f09f2e
(block: Use tracked request for truncate), but waiting for serializing
requests can make the effect more noticeable.

Signed-off-by: Eric Blake 
---
 block/io.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index 323854d06337..1a05f320d35e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3390,6 +3390,11 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset, bool exact,
 return old_size;
 }

+if (bdrv_is_read_only(bs)) {
+error_setg(errp, "Image is read-only");
+return -EACCES;
+}
+
 if (offset > old_size) {
 new_bytes = offset - old_size;
 } else {
@@ -3406,11 +3411,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset, bool exact,
 if (new_bytes) {
 bdrv_make_request_serialising(&req, 1);
 }
-if (bdrv_is_read_only(bs)) {
-error_setg(errp, "Image is read-only");
-ret = -EACCES;
-goto out;
-}
 ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
 0);
 if (ret < 0) {
-- 
2.31.1




Re: [PATCH v4 2/7] scsi-generic: pass max_segments via max_iov field in BlockLimits

2021-06-09 Thread Maxim Levitsky
On Tue, 2021-06-08 at 15:16 +0200, Paolo Bonzini wrote:
> I/O to a disk via read/write is not limited by the number of segments allowed
> by the host adapter; the kernel can split requests if needed, and the limit
> imposed by the host adapter can be very low (256k or so) to avoid that SG_IO
> returns EINVAL if memory is heavily fragmented.
> 
> Since this value is only interesting for SG_IO-based I/O, do not include
> it in the max_transfer and only take it into account when patching the
> block limits VPD page in the scsi-generic device.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/file-posix.c | 3 +--
>  hw/scsi/scsi-generic.c | 6 --
>  2 files changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 536998a1d6..670c577bfe 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1239,8 +1239,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  
>  ret = sg_get_max_segments(s->fd);
>  if (ret > 0) {
> -bs->bl.max_transfer = MIN(bs->bl.max_transfer,
> -  ret * qemu_real_host_page_size);
> +bs->bl.max_iov = ret;

Actually I think that both max transfer size and max segement count,
are only relevant for SCSI passthrough since kernel I think emualates
both for regular I/O, so I think that we shoudn't expose them to qemu at all.

In my version of the patches I removed both bl.max_transfer and bl.max_iov
setup from the file-posix driver and replaced it with bs->bl.max_ioctl_transfer
(you call it max_hw_transfer)

In my version the bl.max_ioctl_transfer is a merged limit of the max transfer 
size
and the max iovec number.

https://www.mail-archive.com/qemu-devel@nongnu.org/msg768264.html


Best regards,
Maxim Levitsky


>  }
>  }
>  
> diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
> index 98c30c5d5c..82e1e2ee79 100644
> --- a/hw/scsi/scsi-generic.c
> +++ b/hw/scsi/scsi-generic.c
> @@ -179,10 +179,12 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq 
> *r, SCSIDevice *s)
>  (r->req.cmd.buf[1] & 0x01)) {
>  page = r->req.cmd.buf[2];
>  if (page == 0xb0) {
> -uint32_t max_transfer =
> -blk_get_max_transfer(s->conf.blk) / s->blocksize;
> +uint32_t max_transfer = blk_get_max_transfer(s->conf.blk);
> +uint32_t max_iov = blk_get_max_iov(s->conf.blk);
>  
>  assert(max_transfer);
> +max_transfer = MIN_NON_ZERO(max_transfer, max_iov * 
> qemu_real_host_page_size)
> +/ s->blocksize;
>  stl_be_p(&r->buf[8], max_transfer);
>  /* Also take care of the opt xfer len. */
>  stl_be_p(&r->buf[12],







Re: [PATCH v4 4/7] file-posix: try BLKSECTGET on block devices too, do not round to power of 2

2021-06-09 Thread Maxim Levitsky
On Tue, 2021-06-08 at 15:16 +0200, Paolo Bonzini wrote:
> bs->sg is only true for character devices, but block devices can also
> be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
> returns bytes in an int for /dev/sgN devices, and sectors in a short
> for block devices, so account for that in the code.
> 
> The maximum transfer also need not be a power of 2 (for example I have
> seen disks with 1280 KiB maximum transfer) so there's no need to pass
> the result through pow2floor.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/file-posix.c | 44 
>  1 file changed, 24 insertions(+), 20 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index c9746d3eb6..1439293f63 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1149,22 +1149,27 @@ static void raw_reopen_abort(BDRVReopenState *state)
>  s->reopen_state = NULL;
>  }
>  
> -static int sg_get_max_transfer_length(int fd)
> +static int hdev_get_max_hw_transfer(int fd, struct stat *st)
>  {
>  #ifdef BLKSECTGET
> -int max_bytes = 0;
> -
> -if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
> -return max_bytes;
> +if (S_ISBLK(st->st_mode)) {
> +unsigned short max_sectors = 0;
> +if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
> +return max_sectors * 512;
> +}
>  } else {
> -return -errno;
> +int max_bytes = 0;
> +if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {

Again I would use the bs->sg for that.

> +return max_bytes;
> +}
>  }
> +return -errno;
>  #else
>  return -ENOSYS;
>  #endif
>  }
>  
> -static int sg_get_max_segments(int fd)
> +static int hdev_get_max_segments(int fd, struct stat *st)
>  {
>  #ifdef CONFIG_LINUX
>  char buf[32];
> @@ -1173,26 +1178,20 @@ static int sg_get_max_segments(int fd)
>  int ret;
>  int sysfd = -1;
>  long max_segments;
> -struct stat st;
>  
> -if (fstat(fd, &st)) {
> -ret = -errno;
> -goto out;
> -}
> -
> -if (S_ISCHR(st.st_mode)) {
> +if (S_ISCHR(st->st_mode)) {
>  if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
>  return ret;
>  }
>  return -ENOTSUP;
>  }
>  
> -if (!S_ISBLK(st.st_mode)) {
> +if (!S_ISBLK(st->st_mode)) {
>  return -ENOTSUP;
>  }
>  
>  sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -major(st.st_rdev), minor(st.st_rdev));
> +major(st->st_rdev), minor(st->st_rdev));
>  sysfd = open(sysfspath, O_RDONLY);
>  if (sysfd == -1) {
>  ret = -errno;
> @@ -1229,15 +1228,20 @@ out:
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> +struct stat st;
> +
> +if (fstat(s->fd, &st)) {
> +return;
> +}
>  
> -if (bs->sg) {
> -int ret = sg_get_max_transfer_length(s->fd);
> +if (bs->sg || S_ISBLK(st.st_mode)) {
> +int ret = hdev_get_max_hw_transfer(s->fd, &st);
>  
>  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> -bs->bl.max_hw_transfer = pow2floor(ret);
> +bs->bl.max_hw_transfer = ret;
>  }
>  
> -ret = sg_get_max_segments(s->fd);
> +ret = hdev_get_max_segments(s->fd, &st);
>  if (ret > 0) {
>  bs->bl.max_iov = ret;
>  }


Roughly speaking this looks correct, but I might have missed something as well.

This is roughly the same as patches from Tom Yan which I carried in my series

https://www.mail-archive.com/qemu-devel@nongnu.org/msg768258.html
https://www.mail-archive.com/qemu-devel@nongnu.org/msg768262.html


I like a bit more how he created separate functions for /dev/sg and for all 
other block devices.
Please take a look.

Also not related to this patch, you are missing my fix I did to the VPD limit 
emulation, please consider taking
it into the series:

https://www.mail-archive.com/qemu-devel@nongnu.org/msg768260.html


Best regards,
Maxim Levitsky






Re: [PATCH v4 3/7] block: add max_hw_transfer to BlockLimits

2021-06-09 Thread Maxim Levitsky
On Tue, 2021-06-08 at 15:16 +0200, Paolo Bonzini wrote:
> For block host devices, I/O can happen through either the kernel file
> descriptor I/O system calls (preadv/pwritev, io_submit, io_uring)
> or the SCSI passthrough ioctl SG_IO.
> 
> In the latter case, the size of each transfer can be limited by the
> HBA, while for file descriptor I/O the kernel is able to split and
> merge I/O in smaller pieces as needed.  Applying the HBA limits to
> file descriptor I/O results in more system calls and suboptimal
> performance, so this patch splits the max_transfer limit in two:
> max_transfer remains valid and is used in general, while max_hw_transfer
> is limited to the maximum hardware size.  max_hw_transfer can then be
> included by the scsi-generic driver in the block limits page, to ensure
> that the stricter hardware limit is used.
> 
> Signed-off-by: Paolo Bonzini 

This is mostly the same as my patch 

https://www.mail-archive.com/qemu-devel@nongnu.org/msg768264.html

I called this max_ioctl_transfer, since this limit is only relevant
to the .ioctl, but max_hw_transfer is fine as well.

So this patch looks OK, but I might have missed something
as I haven't touched this area for a long time.

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky


> ---
>  block/block-backend.c  | 12 
>  block/file-posix.c |  2 +-
>  block/io.c |  1 +
>  hw/scsi/scsi-generic.c |  2 +-
>  include/block/block_int.h  |  7 +++
>  include/sysemu/block-backend.h |  1 +
>  6 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 15f1ea4288..2ea1412a54 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1953,6 +1953,18 @@ uint32_t blk_get_request_alignment(BlockBackend *blk)
>  return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
>  }
>  
> +/* Returns the maximum hardware transfer length, in bytes; guaranteed 
> nonzero */
> +uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
> +{
> +BlockDriverState *bs = blk_bs(blk);
> +uint64_t max = INT_MAX;
> +
> +if (bs) {
> +max = MIN_NON_ZERO(bs->bl.max_hw_transfer, bs->bl.max_transfer);
> +}
> +return max;
> +}
> +
>  /* Returns the maximum transfer length, in bytes; guaranteed nonzero */
>  uint32_t blk_get_max_transfer(BlockBackend *blk)
>  {
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 670c577bfe..c9746d3eb6 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1234,7 +1234,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  int ret = sg_get_max_transfer_length(s->fd);
>  
>  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> -bs->bl.max_transfer = pow2floor(ret);
> +bs->bl.max_hw_transfer = pow2floor(ret);
>  }
>  
>  ret = sg_get_max_segments(s->fd);
> diff --git a/block/io.c b/block/io.c
> index 323854d063..089b99bb0c 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -127,6 +127,7 @@ static void bdrv_merge_limits(BlockLimits *dst, const 
> BlockLimits *src)
>  {
>  dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
>  dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
> +dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer, 
> src->max_hw_transfer);
>  dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
>   src->opt_mem_alignment);
>  dst->min_mem_alignment = MAX(dst->min_mem_alignment,
> diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
> index 82e1e2ee79..3762dce749 100644
> --- a/hw/scsi/scsi-generic.c
> +++ b/hw/scsi/scsi-generic.c
> @@ -179,7 +179,7 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, 
> SCSIDevice *s)
>  (r->req.cmd.buf[1] & 0x01)) {
>  page = r->req.cmd.buf[2];
>  if (page == 0xb0) {
> -uint32_t max_transfer = blk_get_max_transfer(s->conf.blk);
> +uint64_t max_transfer = blk_get_max_hw_transfer(s->conf.blk);
>  uint32_t max_iov = blk_get_max_iov(s->conf.blk);
>  
>  assert(max_transfer);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 057d88b1fc..f1a54db0f8 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -695,6 +695,13 @@ typedef struct BlockLimits {
>   * clamped down. */
>  uint32_t max_transfer;
>  
> +/* Maximal hardware transfer length in bytes.  Applies whenever
> + * transfers to the device bypass the kernel I/O scheduler, for
> + * example with SG_IO.  If larger than max_transfer or if zero,
> + * blk_get_max_hw_transfer will fall back to max_transfer.
> + */
> +uint64_t max_hw_transfer;
> +
>  /* memory alignment, in bytes so that no bounce buffer is needed */
>  size_t min_mem_alignment;
>  
> diff --git a/include/sysemu/block-backend.h 

Re: [PATCH v4 1/7] file-posix: fix max_iov for /dev/sg devices

2021-06-09 Thread Maxim Levitsky
On Tue, 2021-06-08 at 22:14 +0300, Vladimir Sementsov-Ogievskiy wrote:
> 08.06.2021 16:16, Paolo Bonzini wrote:
> > Even though it was only called for devices that have bs->sg set (which
> > must be character devices), sg_get_max_segments looked at /sys/dev/block
> > which only works for block devices.
> > 
> > On Linux the sg driver has its own way to provide the maximum number of
> > iovecs in a scatter/gather list, so add support for it.  The block device
> > path is kept because it will be reinstated in the next patches.
> > 
> > Signed-off-by: Paolo Bonzini 
> > ---
> >   block/file-posix.c | 11 +++
> >   1 file changed, 11 insertions(+)
> > 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index f37dfc10b3..536998a1d6 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1180,6 +1180,17 @@ static int sg_get_max_segments(int fd)
> >   goto out;
> >   }
> >   
> > +if (S_ISCHR(st.st_mode)) {
> 
> Why not check "if (bs->sg) {" instead? It seems to be more consistent with 
> issuing SG_ ioctl. Or what I miss?

I also think so. Actually the 'hdev_is_sg' has a check for character device as 
well, 
in addition to a few more checks that make sure that we are really 
dealing with the quirky /dev/sg character device.

> 
> > +if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> > +return ret;
> > +}
> > +return -ENOTSUP;
> > +}
> > +
> > +if (!S_ISBLK(st.st_mode)) {
> > +return -ENOTSUP;
> > +}
> > +
> >   sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> >   major(st.st_rdev), minor(st.st_rdev));
> >   sysfd = open(sysfspath, O_RDONLY);
> > 
> 
> 

Other than that, this is the same as the patch from Tom Yan:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg768262.html

In this version he does check if the SG_GET_SG_TABLESIZE is defined, so
you might want to do this as well.


Best regards,
Maxim Levitsky






Re: [PATCH v3 14/33] nbd: move connection code from block/nbd to nbd/client-connection

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

28.04.2021 11:14, Vladimir Sementsov-Ogievskiy wrote:

+struct NBDClientConnection {
+    /* Initialization constants */
+    SocketAddress *saddr; /* address to connect to */
+
+    /*
+ * Result of last attempt. Valid in FAIL and SUCCESS states.
+ * If you want to steal error, don't forget to set pointer to NULL.
+ */
+    QIOChannelSocket *sioc;
+    Error *err;


These two are also manipulated under the mutex.  Consider also updating
the comment: both these pointers are to be "stolen" by the caller, with
the former being valid when the connection succeeds and the latter
otherwise.



Hmm. I should move mutex and "All further" comment above these two fields.

Ok, I'll think on updating the comment (probably as an additional patch, to 
keep this as a simple movement). I don't like to document that they are stolen 
by caller(). For me it sounds like caller is user of the interface. And caller 
of nbd_co_establish_connection() doesn't stole anything: the structure is 
private now..


Finally, I decided to improve the comment as part of "[PATCH v3 08/33] block/nbd: drop 
thr->state" commit, as "FAIL and SUCCESS states" string becomes outdated when we 
drop these states.

--
Best regards,
Vladimir



Re: [PATCH v4 0/6] Allow changing bs->file on reopen

2021-06-09 Thread Kevin Wolf
Am 14.05.2021 um 17:53 hat Vladimir Sementsov-Ogievskiy geschrieben:
> Hi Alberto!
> 
> What are your plans for v5? I'm now finishing a new series which makes
> backup-top filter public, and I want to base it on your series
> (otherwise I can't add a test).

Berto, where are we with this? I see that Vladimir picked up one or two
patches for his series, but I think we still need a v5 at least for the
rest?

If you can't find the time, should someone else pick up all patches?

Kevin

> 17.03.2021 20:15, Alberto Garcia wrote:
> > Based-on: <20210317143529.615584-1-vsement...@virtuozzo.com>
> > 
> > Hello,
> > 
> > this is the same as v3, but rebased on top of Vladimir's "block:
> > update graph permissions update v3", which you can get here:
> > 
> > git: https://src.openvz.org/scm/~vsementsov/qemu.git
> > tag: up-block-topologic-perm-v3
> > 
> > Tip: you may find it easier to review patch 4 if you use 'git diff -w'
> > since a big part of the changes that you see in
> > qmp_x_blockdev_reopen() are just indentation changes.
> > 
> > Berto
> > 
> > v4:
> > - Rebase on top of version 3 of Vladimir's branch
> > v3: https://lists.gnu.org/archive/html/qemu-block/2021-03/msg00553.html
> > v2: https://lists.gnu.org/archive/html/qemu-block/2021-02/msg00623.html
> > v1: https://lists.gnu.org/archive/html/qemu-block/2021-01/msg00437.html
> > 
> > Output of git backport-diff against v3:
> > 
> > Key:
> > [] : patches are identical
> > [] : number of functional differences between upstream/downstream patch
> > [down] : patch is downstream-only
> > The flags [FC] indicate (F)unctional and (C)ontextual differences, 
> > respectively
> > 
> > 001/6:[] [--] 'block: Add bdrv_reopen_queue_free()'
> > 002/6:[0018] [FC] 'block: Allow changing bs->file on reopen'
> > 003/6:[] [--] 'iotests: Test replacing files with x-blockdev-reopen'
> > 004/6:[0071] [FC] 'block: Support multiple reopening with x-blockdev-reopen'
> > 005/6:[] [--] 'iotests: Test reopening multiple devices at the same 
> > time'
> > 006/6:[] [-C] 'block: Make blockdev-reopen stable API'
> > 
> > Alberto Garcia (6):
> >block: Add bdrv_reopen_queue_free()
> >block: Allow changing bs->file on reopen
> >iotests: Test replacing files with x-blockdev-reopen
> >block: Support multiple reopening with x-blockdev-reopen
> >iotests: Test reopening multiple devices at the same time
> >block: Make blockdev-reopen stable API
> > 
> >   qapi/block-core.json   |  24 ++---
> >   include/block/block.h  |   2 +
> >   block.c| 135 --
> >   blockdev.c |  78 +--
> >   tests/qemu-iotests/155 |   9 +-
> >   tests/qemu-iotests/165 |   4 +-
> >   tests/qemu-iotests/245 | 190 +
> >   tests/qemu-iotests/245.out |  11 ++-
> >   tests/qemu-iotests/248 |   4 +-
> >   tests/qemu-iotests/248.out |   2 +-
> >   tests/qemu-iotests/296 |  11 ++-
> >   tests/qemu-iotests/298 |   4 +-
> >   12 files changed, 351 insertions(+), 123 deletions(-)
> > 
> 
> 
> -- 
> Best regards,
> Vladimir
> 




[PATCH 7/7] vhost-user-blk: Implement reconnection during realize

2021-06-09 Thread Kevin Wolf
Commit dabefdd6 removed code that was supposed to try reconnecting
during .realize(), but actually just crashed and had several design
problems.

This adds the feature back without the crash in simple cases while also
fixing some design problems: Reconnection is now only tried if there was
a problem with the connection and not an error related to the content
(which would fail again the same way in the next attempt). Reconnection
is limited to three attempts (four with the initial attempt) so that we
won't end up in an infinite loop if a problem is permanent. If the
backend restarts three times in the very short time window of device
initialisation, we have bigger problems and erroring out is the right
course of action.

In the case that a connection error occurs and we reconnect, the error
message is printed using error_report_err(), but otherwise ignored.

Signed-off-by: Kevin Wolf 
---
 hw/block/vhost-user-blk.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index e49d2e4c83..f75a42bc62 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -455,8 +455,10 @@ static int vhost_user_blk_realize_connect(VHostUserBlk *s, 
Error **errp)
 
 static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
+ERRP_GUARD();
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
+int retries;
 int i, ret;
 
 if (!s->chardev.chr) {
@@ -498,7 +500,17 @@ static void vhost_user_blk_device_realize(DeviceState 
*dev, Error **errp)
 s->inflight = g_new0(struct vhost_inflight, 1);
 s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
 
-ret = vhost_user_blk_realize_connect(s, errp);
+retries = 3;
+assert(!*errp);
+do {
+if (*errp) {
+error_prepend(errp, "Reconnecting after error: ");
+error_report_err(*errp);
+*errp = NULL;
+}
+ret = vhost_user_blk_realize_connect(s, errp);
+} while (ret == -EPROTO && retries--);
+
 if (ret < 0) {
 goto virtio_err;
 }
-- 
2.30.2




[PATCH 6/7] vhost-user-blk: Factor out vhost_user_blk_realize_connect()

2021-06-09 Thread Kevin Wolf
This function is the part that we will want to retry if the connection
is lost during initialisation, so factor it out to keep the following
patch simpler.

The error path for vhost_dev_get_config() forgot disconnecting the
chardev, add this while touching the code.

Signed-off-by: Kevin Wolf 
---
 hw/block/vhost-user-blk.c | 48 ++-
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 3770f715da..e49d2e4c83 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -423,6 +423,36 @@ static void vhost_user_blk_event(void *opaque, 
QEMUChrEvent event)
 }
 }
 
+static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
+{
+DeviceState *dev = &s->parent_obj.parent_obj;
+int ret;
+
+s->connected = false;
+
+ret = qemu_chr_fe_wait_connected(&s->chardev, errp);
+if (ret < 0) {
+return ret;
+}
+
+ret = vhost_user_blk_connect(dev, errp);
+if (ret < 0) {
+qemu_chr_fe_disconnect(&s->chardev);
+return ret;
+}
+assert(s->connected);
+
+ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
+   sizeof(struct virtio_blk_config), errp);
+if (ret < 0) {
+qemu_chr_fe_disconnect(&s->chardev);
+vhost_dev_cleanup(&s->dev);
+return ret;
+}
+
+return 0;
+}
+
 static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -467,22 +497,10 @@ static void vhost_user_blk_device_realize(DeviceState 
*dev, Error **errp)
 
 s->inflight = g_new0(struct vhost_inflight, 1);
 s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
-s->connected = false;
-
-if (qemu_chr_fe_wait_connected(&s->chardev, errp) < 0) {
-goto virtio_err;
-}
 
-if (vhost_user_blk_connect(dev, errp) < 0) {
-qemu_chr_fe_disconnect(&s->chardev);
-goto virtio_err;
-}
-assert(s->connected);
-
-ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
-   sizeof(struct virtio_blk_config), errp);
+ret = vhost_user_blk_realize_connect(s, errp);
 if (ret < 0) {
-goto vhost_err;
+goto virtio_err;
 }
 
 /* we're fully initialized, now we can operate, so add the handler */
@@ -491,8 +509,6 @@ static void vhost_user_blk_device_realize(DeviceState *dev, 
Error **errp)
  NULL, true);
 return;
 
-vhost_err:
-vhost_dev_cleanup(&s->dev);
 virtio_err:
 g_free(s->vhost_vqs);
 s->vhost_vqs = NULL;
-- 
2.30.2




[PATCH 4/7] vhost-user-blk: Add Error parameter to vhost_user_blk_start()

2021-06-09 Thread Kevin Wolf
Instead of letting the caller make up a meaningless error message, add
an Error parameter to allow reporting the real error.

Signed-off-by: Kevin Wolf 
---
 hw/block/vhost-user-blk.c | 31 +++
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 0cb56baefb..e9382e152a 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -113,7 +113,7 @@ const VhostDevConfigOps blk_ops = {
 .vhost_dev_config_notifier = vhost_user_blk_handle_config_change,
 };
 
-static int vhost_user_blk_start(VirtIODevice *vdev)
+static int vhost_user_blk_start(VirtIODevice *vdev, Error **errp)
 {
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
@@ -121,19 +121,19 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
 int i, ret;
 
 if (!k->set_guest_notifiers) {
-error_report("binding does not support guest notifiers");
+error_setg(errp, "binding does not support guest notifiers");
 return -ENOSYS;
 }
 
ret = vhost_dev_enable_notifiers(&s->dev, vdev);
 if (ret < 0) {
-error_report("Error enabling host notifiers: %d", -ret);
+error_setg_errno(errp, -ret, "Error enabling host notifiers");
 return ret;
 }
 
 ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
 if (ret < 0) {
-error_report("Error binding guest notifier: %d", -ret);
+error_setg_errno(errp, -ret, "Error binding guest notifier");
 goto err_host_notifiers;
 }
 
@@ -141,27 +141,27 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
 
ret = vhost_dev_prepare_inflight(&s->dev, vdev);
 if (ret < 0) {
-error_report("Error set inflight format: %d", -ret);
+error_setg_errno(errp, -ret, "Error setting inflight format");
 goto err_guest_notifiers;
 }
 
 if (!s->inflight->addr) {
ret = vhost_dev_get_inflight(&s->dev, s->queue_size, s->inflight);
 if (ret < 0) {
-error_report("Error get inflight: %d", -ret);
+error_setg_errno(errp, -ret, "Error getting inflight");
 goto err_guest_notifiers;
 }
 }
 
ret = vhost_dev_set_inflight(&s->dev, s->inflight);
 if (ret < 0) {
-error_report("Error set inflight: %d", -ret);
+error_setg_errno(errp, -ret, "Error setting inflight");
 goto err_guest_notifiers;
 }
 
ret = vhost_dev_start(&s->dev, vdev);
 if (ret < 0) {
-error_report("Error starting vhost: %d", -ret);
+error_setg_errno(errp, -ret, "Error starting vhost");
 goto err_guest_notifiers;
 }
 s->started_vu = true;
@@ -214,6 +214,7 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, 
uint8_t status)
 {
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
 bool should_start = virtio_device_started(vdev, status);
+Error *local_err = NULL;
 int ret;
 
 if (!vdev->vm_running) {
@@ -229,10 +230,9 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, 
uint8_t status)
 }
 
 if (should_start) {
-ret = vhost_user_blk_start(vdev);
+ret = vhost_user_blk_start(vdev, &local_err);
 if (ret < 0) {
-error_report("vhost-user-blk: vhost start failed: %s",
- strerror(-ret));
+error_reportf_err(local_err, "vhost-user-blk: vhost start failed: 
");
qemu_chr_fe_disconnect(&s->chardev);
 }
 } else {
@@ -270,6 +270,7 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice 
*vdev,
 static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
+Error *local_err = NULL;
 int i, ret;
 
 if (!vdev->start_on_kick) {
@@ -287,10 +288,9 @@ static void vhost_user_blk_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
  * vhost here instead of waiting for .set_status().
  */
-ret = vhost_user_blk_start(vdev);
+ret = vhost_user_blk_start(vdev, &local_err);
 if (ret < 0) {
-error_report("vhost-user-blk: vhost start failed: %s",
- strerror(-ret));
+error_reportf_err(local_err, "vhost-user-blk: vhost start failed: ");
qemu_chr_fe_disconnect(&s->chardev);
 return;
 }
@@ -340,9 +340,8 @@ static int vhost_user_blk_connect(DeviceState *dev, Error 
**errp)
 
 /* restore vhost state */
 if (virtio_device_started(vdev, vdev->status)) {
-ret = vhost_user_blk_start(vdev);
+ret = vhost_user_blk_start(vdev, errp);
 if (ret < 0) {
-error_setg_errno(errp, -ret, "vhost start failed");
 return ret;
 }
 }
-- 
2.30.2




[PATCH 2/7] vhost: Distinguish errors in vhost_backend_init()

2021-06-09 Thread Kevin Wolf
Instead of just returning 0/-1 and letting the caller make up a
meaningless error message, add an Error parameter to allow reporting the
real error and switch to 0/-errno so that different kind of errors can
be distinguished in the caller.

Specifically, in vhost-user, EPROTO is used for all errors that relate
to the connection itself, whereas other error codes are used for errors
relating to the content of the connection. This will allow us later to
automatically reconnect when the connection goes away, without ending up
in an endless loop if it's a permanent error in the configuration.

Signed-off-by: Kevin Wolf 
---
 include/hw/virtio/vhost-backend.h |  3 ++-
 hw/virtio/vhost-backend.c |  2 +-
 hw/virtio/vhost-user.c| 41 ---
 hw/virtio/vhost-vdpa.c|  2 +-
 hw/virtio/vhost.c | 13 +-
 5 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 8a6f8e2a7a..728ebb0ed9 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -37,7 +37,8 @@ struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
 
-typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque);
+typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque,
+  Error **errp);
 typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev);
 typedef int (*vhost_backend_memslots_limit)(struct vhost_dev *dev);
 
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 31b33bde37..f4f71cf58a 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -30,7 +30,7 @@ static int vhost_kernel_call(struct vhost_dev *dev, unsigned 
long int request,
 return ioctl(fd, request, arg);
 }
 
-static int vhost_kernel_init(struct vhost_dev *dev, void *opaque)
+static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp)
 {
 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
 
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index ee57abe045..024cb201bb 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1856,7 +1856,8 @@ static int 
vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
 return 0;
 }
 
-static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
+static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
+   Error **errp)
 {
 uint64_t features, protocol_features, ram_slots;
 struct vhost_user *u;
@@ -1871,7 +1872,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, 
void *opaque)
 
err = vhost_user_get_features(dev, &features);
 if (err < 0) {
-return err;
+return -EPROTO;
 }
 
 if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
@@ -1880,7 +1881,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, 
void *opaque)
 err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
 &protocol_features);
 if (err < 0) {
-return err;
+return -EPROTO;
 }
 
 dev->protocol_features =
@@ -1891,14 +1892,14 @@ static int vhost_user_backend_init(struct vhost_dev 
*dev, void *opaque)
 dev->protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
 } else if (!(protocol_features &
 (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))) {
-error_report("Device expects VHOST_USER_PROTOCOL_F_CONFIG "
-"but backend does not support it.");
-return -1;
+error_setg(errp, "Device expects VHOST_USER_PROTOCOL_F_CONFIG "
+   "but backend does not support it.");
+return -EINVAL;
 }
 
 err = vhost_user_set_protocol_features(dev, dev->protocol_features);
 if (err < 0) {
-return err;
+return -EPROTO;
 }
 
 /* query the max queues we support if backend supports Multiple Queue 
*/
@@ -1906,12 +1907,12 @@ static int vhost_user_backend_init(struct vhost_dev 
*dev, void *opaque)
 err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
 &dev->max_queues);
 if (err < 0) {
-return err;
+return -EPROTO;
 }
 }
 if (dev->num_queues && dev->max_queues < dev->num_queues) {
-error_report("The maximum number of queues supported by the "
- "backend is %" PRIu64, dev->max_queues);
+error_setg(errp, "The maximum number of queues supported by the "
+   "backend is %" PRIu64, dev->max_queues);
 return -EINVAL;
 }
 
@@ -1920,9 +1921,9 @@ static int vhost_user_backend_init(struct vhost_dev *dev, 
void *opaque)
 

[PATCH 5/7] vhost: Distinguish errors in vhost_dev_get_config()

2021-06-09 Thread Kevin Wolf
Instead of just returning 0/-1 and letting the caller make up a
meaningless error message, add an Error parameter to allow reporting the
real error and switch to 0/-errno so that different kind of errors can
be distinguished in the caller.

Signed-off-by: Kevin Wolf 
---
 include/hw/virtio/vhost-backend.h |  2 +-
 include/hw/virtio/vhost.h |  4 ++--
 hw/block/vhost-user-blk.c |  9 +
 hw/display/vhost-user-gpu.c   |  6 --
 hw/input/vhost-user-input.c   |  6 --
 hw/net/vhost_net.c|  2 +-
 hw/virtio/vhost-user-vsock.c  |  9 +
 hw/virtio/vhost-user.c| 24 
 hw/virtio/vhost-vdpa.c|  2 +-
 hw/virtio/vhost.c | 14 +++---
 10 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 728ebb0ed9..8475c5a29d 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -98,7 +98,7 @@ typedef int (*vhost_set_config_op)(struct vhost_dev *dev, 
const uint8_t *data,
uint32_t offset, uint32_t size,
uint32_t flags);
 typedef int (*vhost_get_config_op)(struct vhost_dev *dev, uint8_t *config,
-   uint32_t config_len);
+   uint32_t config_len, Error **errp);
 
 typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev,
   void *session_info,
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 2d7aaad67b..045d0fd9f2 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -130,8 +130,8 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
   struct vhost_vring_file *file);
 
 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
-int vhost_dev_get_config(struct vhost_dev *dev, uint8_t *config,
- uint32_t config_len);
+int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
+ uint32_t config_len, Error **errp);
 int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
  uint32_t offset, uint32_t size, uint32_t flags);
 /* notifier callback in case vhost device config space changed
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index e9382e152a..3770f715da 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -91,11 +91,13 @@ static int vhost_user_blk_handle_config_change(struct 
vhost_dev *dev)
 int ret;
 struct virtio_blk_config blkcfg;
 VHostUserBlk *s = VHOST_USER_BLK(dev->vdev);
+Error *local_err = NULL;
 
ret = vhost_dev_get_config(dev, (uint8_t *)&blkcfg,
-   sizeof(struct virtio_blk_config));
+   sizeof(struct virtio_blk_config),
+   _err);
 if (ret < 0) {
-error_report("get config space failed");
+error_report_err(local_err);
 return -1;
 }
 
@@ -478,9 +480,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, 
Error **errp)
 assert(s->connected);
 
ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
-   sizeof(struct virtio_blk_config));
+   sizeof(struct virtio_blk_config), errp);
 if (ret < 0) {
-error_setg(errp, "vhost-user-blk: get block config failed");
 goto vhost_err;
 }
 
diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 6cdaa1c73b..389199e6ca 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -415,14 +415,16 @@ vhost_user_gpu_get_config(VirtIODevice *vdev, uint8_t 
*config_data)
 VirtIOGPUBase *b = VIRTIO_GPU_BASE(vdev);
 struct virtio_gpu_config *vgconfig =
 (struct virtio_gpu_config *)config_data;
+Error *local_err = NULL;
 int ret;
 
 memset(config_data, 0, sizeof(struct virtio_gpu_config));
 
ret = vhost_dev_get_config(&g->vhost->dev,
-   config_data, sizeof(struct virtio_gpu_config));
+   config_data, sizeof(struct virtio_gpu_config),
+   _err);
 if (ret) {
-error_report("vhost-user-gpu: get device config space failed");
+error_report_err(local_err);
 return;
 }
 
diff --git a/hw/input/vhost-user-input.c b/hw/input/vhost-user-input.c
index 63984a8ba7..273e96a7b1 100644
--- a/hw/input/vhost-user-input.c
+++ b/hw/input/vhost-user-input.c
@@ -49,13 +49,15 @@ static void vhost_input_get_config(VirtIODevice *vdev, 
uint8_t *config_data)
 {
 VirtIOInput *vinput = VIRTIO_INPUT(vdev);
 VHostUserInput *vhi = VHOST_USER_INPUT(vdev);
+Error *local_err = NULL;
 int ret;
 
 memset(config_data, 0, vinput->cfg_size);
 
-ret = 

[PATCH 1/7] vhost: Add Error parameter to vhost_dev_init()

2021-06-09 Thread Kevin Wolf
This allows callers to return better error messages instead of making
one up while the real error ends up on stderr. Most callers can
immediately make use of this because they already have an Error
parameter themselves. The others just keep printing the error with
error_report_err().

Signed-off-by: Kevin Wolf 
---
 include/hw/virtio/vhost.h|  2 +-
 backends/cryptodev-vhost.c   |  5 -
 backends/vhost-user.c|  4 ++--
 hw/block/vhost-user-blk.c|  4 ++--
 hw/net/vhost_net.c   |  6 +-
 hw/scsi/vhost-scsi.c |  4 +---
 hw/scsi/vhost-user-scsi.c|  4 +---
 hw/virtio/vhost-user-fs.c|  3 +--
 hw/virtio/vhost-user-vsock.c |  3 +--
 hw/virtio/vhost-vsock.c  |  3 +--
 hw/virtio/vhost.c| 16 ++--
 11 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 21a9a52088..2d7aaad67b 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -104,7 +104,7 @@ struct vhost_net {
 
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type,
-   uint32_t busyloop_timeout);
+   uint32_t busyloop_timeout, Error **errp);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
diff --git a/backends/cryptodev-vhost.c b/backends/cryptodev-vhost.c
index 8231e7f1bc..bc13e466b4 100644
--- a/backends/cryptodev-vhost.c
+++ b/backends/cryptodev-vhost.c
@@ -52,6 +52,7 @@ cryptodev_vhost_init(
 {
 int r;
 CryptoDevBackendVhost *crypto;
+Error *local_err = NULL;
 
 crypto = g_new(CryptoDevBackendVhost, 1);
 crypto->dev.max_queues = 1;
@@ -66,8 +67,10 @@ cryptodev_vhost_init(
 /* vhost-user needs vq_index to initiate a specific queue pair */
 crypto->dev.vq_index = crypto->cc->queue_index * crypto->dev.nvqs;
 
-r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, 
0);
+r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, 0,
+   &local_err);
 if (r < 0) {
+error_report_err(local_err);
 goto fail;
 }
 
diff --git a/backends/vhost-user.c b/backends/vhost-user.c
index b366610e16..10b39992d2 100644
--- a/backends/vhost-user.c
+++ b/backends/vhost-user.c
@@ -48,9 +48,9 @@ vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice 
*vdev,
 b->dev.nvqs = nvqs;
 b->dev.vqs = g_new0(struct vhost_virtqueue, nvqs);
 
-ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
+ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
+ errp);
 if (ret < 0) {
-error_setg_errno(errp, -ret, "vhost initialization failed");
 return -1;
 }
 
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index c6210fad0c..0cb56baefb 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -332,9 +332,9 @@ static int vhost_user_blk_connect(DeviceState *dev, Error 
**errp)
 
 vhost_dev_set_config_notifier(>dev, _ops);
 
-ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
+ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
+ errp);
 if (ret < 0) {
-error_setg_errno(errp, -ret, "vhost initialization failed");
 return ret;
 }
 
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 44c1ed92dc..447b119f85 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -22,6 +22,7 @@
 #include "standard-headers/linux/vhost_types.h"
 #include "hw/virtio/virtio-net.h"
 #include "net/vhost_net.h"
+#include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 
@@ -157,6 +158,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
 bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
 struct vhost_net *net = g_new0(struct vhost_net, 1);
 uint64_t features = 0;
+Error *local_err = NULL;
 
 if (!options->net_backend) {
 fprintf(stderr, "vhost-net requires net backend to be setup\n");
@@ -187,8 +189,10 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
 }
 
r = vhost_dev_init(&net->dev, options->opaque,
-   options->backend_type, options->busyloop_timeout);
+   options->backend_type, options->busyloop_timeout,
+   &local_err);
 if (r < 0) {
+error_report_err(local_err);
 goto fail;
 }
 if (backend_kernel) {
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 4d70fa036b..8c611bfd2d 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -219,10 +219,8 @@ static void vhost_scsi_realize(DeviceState *dev, Error 
**errp)
 vsc->dev.backend_features = 0;
 
ret = vhost_dev_init(&vsc->dev, (void *)(uintptr_t)vhostfd,
- 

[PATCH 3/7] vhost: Return 0/-errno in vhost_dev_init()

2021-06-09 Thread Kevin Wolf
Instead of just returning 0/-1 and letting the caller make up a
meaningless error message, switch to 0/-errno so that different kinds of
errors can be distinguished in the caller.

This involves changing a few more callbacks in VhostOps to return
0/-errno: .vhost_set_owner(), .vhost_get_features() and
.vhost_virtqueue_set_busyloop_timeout(). The implementations of these
functions are trivial as they generally just send a message to the
backend.

Signed-off-by: Kevin Wolf 
---
 hw/virtio/vhost-backend.c |  4 +++-
 hw/virtio/vhost-user.c| 10 +++---
 hw/virtio/vhost-vdpa.c|  4 +++-
 hw/virtio/vhost.c |  8 
 4 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index f4f71cf58a..594d770b75 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -24,10 +24,12 @@ static int vhost_kernel_call(struct vhost_dev *dev, 
unsigned long int request,
  void *arg)
 {
 int fd = (uintptr_t) dev->opaque;
+int ret;
 
 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
 
-return ioctl(fd, request, arg);
+ret = ioctl(fd, request, arg);
+return ret < 0 ? -errno : ret;
 }
 
 static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 024cb201bb..889559d86a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1353,7 +1353,11 @@ static int vhost_user_get_u64(struct vhost_dev *dev, int 
request, uint64_t *u64)
 
 static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
 {
-return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features);
+if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) {
+return -EPROTO;
+}
+
+return 0;
 }
 
 static int vhost_user_set_owner(struct vhost_dev *dev)
@@ -1364,7 +1368,7 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
 };
 
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
-return -1;
+return -EPROTO;
 }
 
 return 0;
@@ -1872,7 +1876,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, 
void *opaque,
 
err = vhost_user_get_features(dev, &features);
 if (err < 0) {
-return -EPROTO;
+return err;
 }
 
 if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index c2aadb57cb..71897c1a01 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -253,10 +253,12 @@ static int vhost_vdpa_call(struct vhost_dev *dev, 
unsigned long int request,
 {
 struct vhost_vdpa *v = dev->opaque;
 int fd = v->device_fd;
+int ret;
 
 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
 
-return ioctl(fd, request, arg);
+ret = ioctl(fd, request, arg);
+return ret < 0 ? -errno : ret;
 }
 
 static void vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index fd13135706..c7f9d8bb06 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1309,13 +1309,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 
 r = hdev->vhost_ops->vhost_set_owner(hdev);
 if (r < 0) {
-error_setg(errp, "vhost_set_owner failed");
+error_setg_errno(errp, -r, "vhost_set_owner failed");
 goto fail;
 }
 
r = hdev->vhost_ops->vhost_get_features(hdev, &features);
 if (r < 0) {
-error_setg(errp, "vhost_get_features failed");
+error_setg_errno(errp, -r, "vhost_get_features failed");
 goto fail;
 }
 
@@ -1332,7 +1332,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
  busyloop_timeout);
 if (r < 0) {
-error_setg(errp, "Failed to set busyloop timeout");
+error_setg_errno(errp, -r, "Failed to set busyloop timeout");
 goto fail_busyloop;
 }
 }
@@ -1391,7 +1391,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
 error_setg(errp, "vhost backend memory slots limit is less"
" than current number of present memory slots");
-r = -1;
+r = -EINVAL;
 goto fail_busyloop;
 }
 
-- 
2.30.2




[PATCH 0/7] vhost-user-blk: Implement reconnection during realize

2021-06-09 Thread Kevin Wolf
My previous series removed the broken implementation of automatic
reconnection during .realize(). This series adds some error reporting
improvements that allow distinguishing cases where reconnecting could
help from permanent errors, and then uses it to re-implement the
automatic reconnection during .realize(), as was requested during review
of the previous series.

Kevin Wolf (7):
  vhost: Add Error parameter to vhost_dev_init()
  vhost: Distinguish errors in vhost_backend_init()
  vhost: Return 0/-errno in vhost_dev_init()
  vhost-user-blk: Add Error parameter to vhost_user_blk_start()
  vhost: Distinguish errors in vhost_dev_get_config()
  vhost-user-blk: Factor out vhost_user_blk_realize_connect()
  vhost-user-blk: Implement reconnection during realize

 include/hw/virtio/vhost-backend.h |   5 +-
 include/hw/virtio/vhost.h |   6 +-
 backends/cryptodev-vhost.c|   5 +-
 backends/vhost-user.c |   4 +-
 hw/block/vhost-user-blk.c | 100 +++---
 hw/display/vhost-user-gpu.c   |   6 +-
 hw/input/vhost-user-input.c   |   6 +-
 hw/net/vhost_net.c|   8 ++-
 hw/scsi/vhost-scsi.c  |   4 +-
 hw/scsi/vhost-user-scsi.c |   4 +-
 hw/virtio/vhost-backend.c |   6 +-
 hw/virtio/vhost-user-fs.c |   3 +-
 hw/virtio/vhost-user-vsock.c  |  12 ++--
 hw/virtio/vhost-user.c|  71 +++--
 hw/virtio/vhost-vdpa.c|   8 ++-
 hw/virtio/vhost-vsock.c   |   3 +-
 hw/virtio/vhost.c |  41 +++-
 17 files changed, 174 insertions(+), 118 deletions(-)

-- 
2.30.2




Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 16:02, Kevin Wolf wrote:

Am 09.06.2021 um 14:22 hat Paolo Bonzini geschrieben:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

With this change, qemu_mutex_iothread_locked() must be changed from
true to false.  The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 


The commit message doesn't specify, but in the buggy case, are we
talking about calling aio_co_wake() for a coroutine in the main context
specifically, right? Could we have a unit test for this scenario?

But the change looks reasonable to me.



Yes the root case is: we have a coroutine in the main context sleeping in the yield 
point. We want to wake it by aio_co_wake() from another thread (which is not an 
iothread). My series "block/nbd: rework client connection" hangs on iotest 058 
without that fix, because I exactly use this case.


--
Best regards,
Vladimir



Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 15:22, Paolo Bonzini wrote:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

With this change, qemu_mutex_iothread_locked() must be changed from


Did you miss "qemu_mutex_iothread_locked() stub function"?


true to false.  The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 


Weak as I'm not good in all these iothreads, neither I know does all old 
callers of qemu_get_current_aio_context() are ok with new behavior. Technically 
looks OK:

Reviewed-by: Vladimir Sementsov-Ogievskiy 

Also, it works well with my nbd upcoming series, thanks:

Tested-by: Vladimir Sementsov-Ogievskiy 

Ask just in case: aio context of iothread is set only once and never changes, 
yes?



---
  include/block/aio.h   |  5 -
  iothread.c|  9 +
  stubs/iothread-lock.c |  2 +-
  stubs/iothread.c  |  8 
  stubs/meson.build |  1 -
  tests/unit/iothread.c |  9 +
  util/async.c  | 20 
  util/main-loop.c  |  1 +
  8 files changed, 28 insertions(+), 27 deletions(-)
  delete mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.


worth add: In other cases return NULL ?


--
Best regards,
Vladimir



Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Kevin Wolf
Am 09.06.2021 um 14:22 hat Paolo Bonzini geschrieben:
> If we want to wake up a coroutine from a worker thread, aio_co_wake()
> currently does not work.  In that scenario, aio_co_wake() calls
> aio_co_enter(), but there is no current AioContext and therefore
> qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
> then attempts to call aio_context_acquire() instead of going through
> aio_co_schedule().
> 
> The default case of qemu_get_current_aio_context() was added to cover
> synchronous I/O started from the vCPU thread, but the main and vCPU
> threads are quite different.  The main thread is an I/O thread itself,
> only running a more complicated event loop; the vCPU thread instead
> is essentially a worker thread that occasionally calls
> qemu_mutex_lock_iothread().  It is only in those critical sections
> that it acts as if it were the home thread of the main AioContext.
> 
> Therefore, this patch detaches qemu_get_current_aio_context() from
> iothreads, which is a useless complication.  The AioContext pointer
> is stored directly in the thread-local variable, including for the
> main loop.  Worker threads (including vCPU threads) optionally behave
> as temporary home threads if they have taken the big QEMU lock,
> but if that is not the case they will always schedule coroutines
> on remote threads via aio_co_schedule().
> 
> With this change, qemu_mutex_iothread_locked() must be changed from
> true to false.  The previous value of true was needed because the
> main thread did not have an AioContext in the thread-local variable,
> but now it does have one.
> 
> Reported-by: Vladimir Sementsov-Ogievskiy 
> Signed-off-by: Paolo Bonzini 

The commit message doesn't specify, but in the buggy case, are we
talking about calling aio_co_wake() for a coroutine in the main context
specifically, right? Could we have a unit test for this scenario?

But the change looks reasonable to me.

Kevin




Re: [PATCH 1/1] hw/nvme: namespace parameter for EUI64

2021-06-09 Thread Heinrich Schuchardt

+cc qemu-block@nongnu.org

On 6/9/21 1:46 PM, Heinrich Schuchardt wrote:

The EUI64 field is the only identifier for NVMe namespaces in UEFI device
paths. Add a new namespace property "eui64", that provides the user the
option to specify the EUI64.

Signed-off-by: Heinrich Schuchardt 
---
  docs/system/nvme.rst |  4 +++
  hw/nvme/ctrl.c   | 58 ++--
  hw/nvme/ns.c |  2 ++
  hw/nvme/nvme.h   |  1 +
  4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst
index f7f63d6bf6..a6042f942a 100644
--- a/docs/system/nvme.rst
+++ b/docs/system/nvme.rst
@@ -81,6 +81,10 @@ There are a number of parameters available:
Set the UUID of the namespace. This will be reported as a "Namespace UUID"
descriptor in the Namespace Identification Descriptor List.

+``eui64``
+  Set the EUI64 of the namespace. This will be reported as a "IEEE Extended
+  Unique Identifier" descriptor in the Namespace Identification Descriptor 
List.
+
  ``bus``
If there are more ``nvme`` devices defined, this parameter may be used to
attach the namespace to a specific ``nvme`` device (identified by an ``id``
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 0bcaf7192f..21f2d6843b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4426,19 +4426,19 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
  NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
  uint32_t nsid = le32_to_cpu(c->nsid);
  uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
-
-struct data {
-struct {
-NvmeIdNsDescr hdr;
-uint8_t v[NVME_NIDL_UUID];
-} uuid;
-struct {
-NvmeIdNsDescr hdr;
-uint8_t v;
-} csi;
-};
-
-struct data *ns_descrs = (struct data *)list;
+uint8_t *pos = list;
+struct {
+NvmeIdNsDescr hdr;
+uint8_t v[NVME_NIDL_UUID];
+} QEMU_PACKED uuid;
+struct {
+NvmeIdNsDescr hdr;
+uint64_t v;
+} QEMU_PACKED eui64;
+struct {
+NvmeIdNsDescr hdr;
+uint8_t v;
+} QEMU_PACKED csi;

  trace_pci_nvme_identify_ns_descr_list(nsid);

@@ -4452,17 +4452,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
  }

  /*
- * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
- * structure, a Namespace UUID (nidt = 3h) must be reported in the
- * Namespace Identification Descriptor. Add the namespace UUID here.
+ * If the EUI64 field is 0 and the NGUID field is 0, the namespace must
+ * provide a valid Namespace UUID in the Namespace Identification 
Descriptor
+ * data structure. QEMU does not yet support setting NGUID.
   */
-ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
-ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
-memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
-
-ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
-ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
-ns_descrs->csi.v = ns->csi;
+uuid.hdr.nidt = NVME_NIDT_UUID;
+uuid.hdr.nidl = NVME_NIDL_UUID;
+memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
+memcpy(pos, &uuid, sizeof(uuid));
+pos += sizeof(uuid);
+
+if (ns->params.eui64) {
+eui64.hdr.nidt = NVME_NIDT_EUI64;
+eui64.hdr.nidl = NVME_NIDL_EUI64;
+eui64.v = cpu_to_be64(ns->params.eui64);
+memcpy(pos, &eui64, sizeof(eui64));
+pos += sizeof(eui64);
+}
+
+csi.hdr.nidt = NVME_NIDT_CSI;
+csi.hdr.nidl = NVME_NIDL_CSI;
+csi.v = ns->csi;
+memcpy(pos, &csi, sizeof(csi));
+pos += sizeof(csi);

  return nvme_c2h(n, list, sizeof(list), req);
  }
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 992e5a13f5..ddf395d60e 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -77,6 +77,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
  id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
  id_ns->mcl = cpu_to_le32(ns->params.mcl);
  id_ns->msrc = ns->params.msrc;
+id_ns->eui64 = cpu_to_be64(ns->params.eui64);

  ds = 31 - clz32(ns->blkconf.logical_block_size);
  ms = ns->params.ms;
@@ -518,6 +519,7 @@ static Property nvme_ns_props[] = {
  DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false),
  DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
  DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
+DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0),
  DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
  DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
  DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 81a35cda14..abe7fab21c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -83,6 +83,7 @@ typedef struct NvmeNamespaceParams {
  bool shared;
  uint32_t nsid;
  QemuUUID uuid;
+uint64_t eui64;

  uint16_t ms;
  uint8_t  mset;
--
2.30.2


Re: [PATCH v3 4/5] block-copy: add a CoMutex

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

08.06.2021 10:33, Emanuele Giuseppe Esposito wrote:

Add a CoMutex to protect concurrent access of block-copy
data structures.

This mutex also protects .copy_bitmap, because its thread-safe
API does not prevent it from assigning two tasks to the same
bitmap region.

.finished, .cancelled and reads to .ret and .error_is_read will be
protected in the following patch, because are used also outside
coroutines.

Also set block_copy_task_create as coroutine_fn because:
1) it is static and only invoked by coroutine functions
2) this patch introduces and uses a CoMutex lock there

Signed-off-by: Emanuele Giuseppe Esposito 


I missed, did you (where?) add a comment like "all APIs are thread-safe", or 
what is thread-safe?


---
  block/block-copy.c | 82 ++
  1 file changed, 54 insertions(+), 28 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index e2adb5b2ea..56f62913e4 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -61,6 +61,7 @@ typedef struct BlockCopyCallState {
  
  /* OUT parameters */

  bool cancelled;
+/* Fields protected by lock in BlockCopyState */
  bool error_is_read;
  int ret;
  } BlockCopyCallState;
@@ -78,7 +79,7 @@ typedef struct BlockCopyTask {
  int64_t bytes; /* only re-set in task_shrink, before running the task */
  BlockCopyMethod method; /* initialized in block_copy_dirty_clusters() */
  
-/* State */

+/* State. Protected by lock in BlockCopyState */
  CoQueue wait_queue; /* coroutines blocked on this task */
  
  /* To reference all call states from BlockCopyState */

@@ -99,7 +100,8 @@ typedef struct BlockCopyState {
  BdrvChild *source;
  BdrvChild *target;
  
-/* State */

+/* State. Protected by lock */
+CoMutex lock;
  int64_t in_flight_bytes;
  BlockCopyMethod method;
  QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls 
*/
@@ -139,8 +141,10 @@ typedef struct BlockCopyState {
  bool skip_unallocated;
  } BlockCopyState;
  


May be nitpicking, but if we want block_copy_set_progress_meter to be threadsafe 
it should set s->progress under mutex. Or we should document that it's not 
threadsafe and called once.



-static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
-int64_t offset, int64_t bytes)
+/* Called with lock held */
+static BlockCopyTask *find_conflicting_task_locked(BlockCopyState *s,
+   int64_t offset,
+   int64_t bytes)
  {
  BlockCopyTask *t;
  
@@ -160,18 +164,22 @@ static BlockCopyTask *find_conflicting_task(BlockCopyState *s,

  static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t 
offset,
   int64_t bytes)
  {
-BlockCopyTask *task = find_conflicting_task(s, offset, bytes);
+BlockCopyTask *task;
+
+QEMU_LOCK_GUARD(&s->lock);
+task = find_conflicting_task_locked(s, offset, bytes);
  
  if (!task) {

  return false;
  }
  
-qemu_co_queue_wait(&task->wait_queue, NULL);

+qemu_co_queue_wait(&task->wait_queue, &s->lock);
  
  return true;

  }
  
-static int64_t block_copy_chunk_size(BlockCopyState *s)

+/* Called with lock held */
+static int64_t block_copy_chunk_size_locked(BlockCopyState *s)
  {
  switch (s->method) {
  case COPY_READ_WRITE_CLUSTER:
@@ -193,14 +201,16 @@ static int64_t block_copy_chunk_size(BlockCopyState *s)
   * Search for the first dirty area in offset/bytes range and create task at
   * the beginning of it.
   */
-static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
- BlockCopyCallState *call_state,
- int64_t offset, int64_t bytes)
+static coroutine_fn BlockCopyTask *block_copy_task_create(BlockCopyState *s,
+BlockCopyCallState *call_state,
+int64_t offset, int64_t bytes)
  {
  BlockCopyTask *task;
-int64_t max_chunk = block_copy_chunk_size(s);
+int64_t max_chunk;
  
-max_chunk = MIN_NON_ZERO(max_chunk, call_state->max_chunk);

+QEMU_LOCK_GUARD(&s->lock);
+max_chunk = MIN_NON_ZERO(block_copy_chunk_size_locked(s),
+ call_state->max_chunk);
  if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
 offset, offset + bytes,
 max_chunk, &offset, &bytes))
@@ -212,7 +222,7 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState 
*s,
  bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);
  
  /* region is dirty, so no existent tasks possible in it */

-assert(!find_conflicting_task(s, offset, bytes));
+assert(!find_conflicting_task_locked(s, offset, bytes));
  
  

[PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Paolo Bonzini
If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

With this change, qemu_mutex_iothread_locked() must be changed from
true to false.  The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 
---
 include/block/aio.h   |  5 -
 iothread.c|  9 +
 stubs/iothread-lock.c |  2 +-
 stubs/iothread.c  |  8 
 stubs/meson.build |  1 -
 tests/unit/iothread.c |  9 +
 util/async.c  | 20 
 util/main-loop.c  |  1 +
 8 files changed, 28 insertions(+), 27 deletions(-)
 delete mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
  * Return the AioContext whose event loop runs in the current thread.
  *
  * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
  */
 AioContext *qemu_get_current_aio_context(void);
 
+void qemu_set_current_aio_context(AioContext *ctx);
+
 /**
  * aio_context_setup:
  * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..2c5ccd7367 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,13 +39,6 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
 #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
 #endif
 
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
 static void *iothread_run(void *opaque)
 {
 IOThread *iothread = opaque;
@@ -56,7 +49,7 @@ static void *iothread_run(void *opaque)
  * in this new thread uses glib.
  */
 g_main_context_push_thread_default(iothread->worker_context);
-my_iothread = iothread;
+qemu_set_current_aio_context(iothread->ctx);
 iothread->thread_id = qemu_get_thread_id();
 qemu_sem_post(&iothread->init_done_sem);
 
diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c
index 2a6efad64a..5b45b7fc8b 100644
--- a/stubs/iothread-lock.c
+++ b/stubs/iothread-lock.c
@@ -3,7 +3,7 @@
 
 bool qemu_mutex_iothread_locked(void)
 {
-return true;
+return false;
 }
 
 void qemu_mutex_lock_iothread_impl(const char *file, int line)
diff --git a/stubs/iothread.c b/stubs/iothread.c
deleted file mode 100644
index 8cc9e28c55..0000000000
--- a/stubs/iothread.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "qemu/osdep.h"
-#include "block/aio.h"
-#include "qemu/main-loop.h"
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return qemu_get_aio_context();
-}
diff --git a/stubs/meson.build b/stubs/meson.build
index 65c22c0568..4993797f05 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -16,7 +16,6 @@ stub_ss.add(files('fw_cfg.c'))
 stub_ss.add(files('gdbstub.c'))
 stub_ss.add(files('get-vm-name.c'))
 stub_ss.add(when: 'CONFIG_LINUX_IO_URING', if_true: files('io_uring.c'))
-stub_ss.add(files('iothread.c'))
 stub_ss.add(files('iothread-lock.c'))
 stub_ss.add(files('isa-bus.c'))
 stub_ss.add(files('is-daemonized.c'))
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..f9b0791084 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,6 @@ struct IOThread {
 bool stopping;
 };
 
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
 static void 

Re: [PATCH] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Paolo Bonzini

On 09/06/21 13:40, Vladimir Sementsov-Ogievskiy wrote:



And in gdb all looks like aio_co_wake() in my own separate thread 
leads to coroutine execution exactly in my own thread.. So, it don't 
dead-lock on trying to acquire the context, instead it somehow enter 
to a coroutine.  And then deadlock because called coroutine tries to 
lock the mutex, that already locked before (in the code that thinks 
that aio_co_wake() will only schedule the coroutine).


I'll dig into it a bit more.




Aha, that's because qemu_mutex_iothread_locked() from 
stubs/iothread-lock.c is used, which always returns true.


Ok, you can change it to always return false with this patch.  Which is 
nicer, as it means we have less special casing going on in the tools and 
it matches the fact that there are no vCPU threads.


Paolo




Re: [PATCH v3 06/33] util/async: aio_co_schedule(): support reschedule in same ctx

2021-06-09 Thread Paolo Bonzini

On 09/06/21 12:24, Vladimir Sementsov-Ogievskiy wrote:

Thanks, I'll base v4 of nbd patches on it.

I now run make check. test-aio-multithread crashes on assertion:


With the patch I've sent it doesn't, so hopefully you can go ahead.

Paolo


(gdb) bt
#0  0x7f4af8d839d5 in raise () from /lib64/libc.so.6
#1  0x7f4af8d6c8a4 in abort () from /lib64/libc.so.6
#2  0x7f4af8d6c789 in __assert_fail_base.cold () from /lib64/libc.so.6
#3  0x7f4af8d7c026 in __assert_fail () from /lib64/libc.so.6
#4  0x55daebfdab95 in aio_poll (ctx=0x7f4aeb60, blocking=true) 
at ../util/aio-posix.c:567
#5  0x55daebea096c in iothread_run (opaque=0x55daed81bc90) at 
../tests/unit/iothread.c:91
#6  0x55daebfc6c4a in qemu_thread_start (args=0x55daed7d9940) at 
../util/qemu-thread-posix.c:521

#7  0x7f4af8f1a3f9 in start_thread () from /lib64/libpthread.so.0
#8  0x7f4af8e47b53 in clone () from /lib64/libc.so.6
(gdb) fr 4
#4  0x55daebfdab95 in aio_poll (ctx=0x7f4aeb60, blocking=true) 
at ../util/aio-posix.c:567
567 assert(in_aio_context_home_thread(ctx == 
iohandler_get_aio_context() ?

(gdb) list
562  *
563  * aio_poll() may only be called in the AioContext's thread. 
iohandler_ctx
564  * is special in that it runs in the main thread, but that 
thread's context

565  * is qemu_aio_context.
566  */
567 assert(in_aio_context_home_thread(ctx == 
iohandler_get_aio_context() ?
568   qemu_get_aio_context() : 
ctx));

569
570 qemu_lockcnt_inc(&ctx->list_lock);





Re: [PATCH] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 14:32, Vladimir Sementsov-Ogievskiy wrote:

09.06.2021 13:53, Paolo Bonzini wrote:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 
---
  include/block/aio.h   |  5 -
  iothread.c    |  9 +
  stubs/iothread.c  |  8 
  stubs/meson.build |  1 -
  tests/unit/iothread.c |  9 +
  util/async.c  | 20 
  util/main-loop.c  |  1 +
  7 files changed, 27 insertions(+), 26 deletions(-)
  delete mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
   */
  AioContext *qemu_get_current_aio_context(void);
+void qemu_set_current_aio_context(AioContext *ctx);
+
  /**
   * aio_context_setup:
   * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..2c5ccd7367 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,13 +39,6 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
  #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
  #endif
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
  static void *iothread_run(void *opaque)
  {
  IOThread *iothread = opaque;
@@ -56,7 +49,7 @@ static void *iothread_run(void *opaque)
   * in this new thread uses glib.
   */
  g_main_context_push_thread_default(iothread->worker_context);
-    my_iothread = iothread;
+    qemu_set_current_aio_context(iothread->ctx);
  iothread->thread_id = qemu_get_thread_id();
  qemu_sem_post(&iothread->init_done_sem);
diff --git a/stubs/iothread.c b/stubs/iothread.c
deleted file mode 100644
index 8cc9e28c55..0000000000
--- a/stubs/iothread.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "qemu/osdep.h"
-#include "block/aio.h"
-#include "qemu/main-loop.h"
-
-AioContext *qemu_get_current_aio_context(void)
-{
-    return qemu_get_aio_context();
-}
diff --git a/stubs/meson.build b/stubs/meson.build
index 65c22c0568..4993797f05 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -16,7 +16,6 @@ stub_ss.add(files('fw_cfg.c'))
  stub_ss.add(files('gdbstub.c'))
  stub_ss.add(files('get-vm-name.c'))
  stub_ss.add(when: 'CONFIG_LINUX_IO_URING', if_true: files('io_uring.c'))
-stub_ss.add(files('iothread.c'))
  stub_ss.add(files('iothread-lock.c'))
  stub_ss.add(files('isa-bus.c'))
  stub_ss.add(files('is-daemonized.c'))
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..f9b0791084 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,6 @@ struct IOThread {
  bool stopping;
  };
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
  static void iothread_init_gcontext(IOThread *iothread)
  {
  GSource *source;
@@ -54,9 +47,9 @@ static void *iothread_run(void *opaque)
  rcu_register_thread();
-    my_iothread = iothread;
  qemu_mutex_lock(&iothread->init_done_lock);
  iothread->ctx = aio_context_new(_abort);
+    qemu_set_current_aio_context(iothread->ctx);
  /*
   * We must connect the ctx to a GMainContext, because in older versions
diff --git a/util/async.c b/util/async.c
index 

Re: [PATCH] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 13:53, Paolo Bonzini wrote:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 
---
  include/block/aio.h   |  5 -
  iothread.c|  9 +
  stubs/iothread.c  |  8 
  stubs/meson.build |  1 -
  tests/unit/iothread.c |  9 +
  util/async.c  | 20 
  util/main-loop.c  |  1 +
  7 files changed, 27 insertions(+), 26 deletions(-)
  delete mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
   */
  AioContext *qemu_get_current_aio_context(void);
  
+void qemu_set_current_aio_context(AioContext *ctx);

+
  /**
   * aio_context_setup:
   * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..2c5ccd7367 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,13 +39,6 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
  #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
  #endif
  
-static __thread IOThread *my_iothread;

-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
  static void *iothread_run(void *opaque)
  {
  IOThread *iothread = opaque;
@@ -56,7 +49,7 @@ static void *iothread_run(void *opaque)
   * in this new thread uses glib.
   */
  g_main_context_push_thread_default(iothread->worker_context);
-my_iothread = iothread;
+qemu_set_current_aio_context(iothread->ctx);
  iothread->thread_id = qemu_get_thread_id();
  qemu_sem_post(&iothread->init_done_sem);
  
diff --git a/stubs/iothread.c b/stubs/iothread.c

deleted file mode 100644
index 8cc9e28c55..00
--- a/stubs/iothread.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "qemu/osdep.h"
-#include "block/aio.h"
-#include "qemu/main-loop.h"
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return qemu_get_aio_context();
-}
diff --git a/stubs/meson.build b/stubs/meson.build
index 65c22c0568..4993797f05 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -16,7 +16,6 @@ stub_ss.add(files('fw_cfg.c'))
  stub_ss.add(files('gdbstub.c'))
  stub_ss.add(files('get-vm-name.c'))
  stub_ss.add(when: 'CONFIG_LINUX_IO_URING', if_true: files('io_uring.c'))
-stub_ss.add(files('iothread.c'))
  stub_ss.add(files('iothread-lock.c'))
  stub_ss.add(files('isa-bus.c'))
  stub_ss.add(files('is-daemonized.c'))
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..f9b0791084 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,6 @@ struct IOThread {
  bool stopping;
  };
  
-static __thread IOThread *my_iothread;

-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
  static void iothread_init_gcontext(IOThread *iothread)
  {
  GSource *source;
@@ -54,9 +47,9 @@ static void *iothread_run(void *opaque)
  
  rcu_register_thread();
  
-my_iothread = iothread;

  qemu_mutex_lock(&iothread->init_done_lock);
  iothread->ctx = aio_context_new(&error_abort);
+qemu_set_current_aio_context(iothread->ctx);
  
  /*

   * We must connect the ctx to a GMainContext, because in older versions
diff --git a/util/async.c b/util/async.c
index 674dbefb7c..5d9b7cc1eb 100644
--- 

Re: [PATCH v3 1/5] block-copy: streamline choice of copy_range vs. read/write

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 12:33, Paolo Bonzini wrote:

On 09/06/21 10:51, Vladimir Sementsov-Ogievskiy wrote:


+    default:
[...]
+    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
+    if (ret < 0) {
+    trace_block_copy_read_fail(s, offset, ret);
+    *error_is_read = true;
+    goto out;
+    }
+    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
+    s->write_flags);
+    if (ret < 0) {
+    trace_block_copy_write_fail(s, offset, ret);
+    *error_is_read = false;
+    goto out;
+    }
+out:
+    qemu_vfree(bounce_buffer);


label inside switch operator? Rather unusual. Please, let's avoid it and just 
keep out: after switch operator.


Agreed with all comments except this one, the bounce_buffer doesn't exist in 
the other cases.


If keep it as is, worth making indentation for out: the same as for default: ?

--
Best regards,
Vladimir



[PATCH] async: the main AioContext is only "current" if under the BQL

2021-06-09 Thread Paolo Bonzini
If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 
---
 include/block/aio.h   |  5 -
 iothread.c|  9 +
 stubs/iothread.c  |  8 
 stubs/meson.build |  1 -
 tests/unit/iothread.c |  9 +
 util/async.c  | 20 
 util/main-loop.c  |  1 +
 7 files changed, 27 insertions(+), 26 deletions(-)
 delete mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
  * Return the AioContext whose event loop runs in the current thread.
  *
  * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
  */
 AioContext *qemu_get_current_aio_context(void);
 
+void qemu_set_current_aio_context(AioContext *ctx);
+
 /**
  * aio_context_setup:
  * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..2c5ccd7367 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,13 +39,6 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
 #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
 #endif
 
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
 static void *iothread_run(void *opaque)
 {
 IOThread *iothread = opaque;
@@ -56,7 +49,7 @@ static void *iothread_run(void *opaque)
  * in this new thread uses glib.
  */
 g_main_context_push_thread_default(iothread->worker_context);
-my_iothread = iothread;
+qemu_set_current_aio_context(iothread->ctx);
 iothread->thread_id = qemu_get_thread_id();
 qemu_sem_post(&iothread->init_done_sem);
 
diff --git a/stubs/iothread.c b/stubs/iothread.c
deleted file mode 100644
index 8cc9e28c55..00
--- a/stubs/iothread.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "qemu/osdep.h"
-#include "block/aio.h"
-#include "qemu/main-loop.h"
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return qemu_get_aio_context();
-}
diff --git a/stubs/meson.build b/stubs/meson.build
index 65c22c0568..4993797f05 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -16,7 +16,6 @@ stub_ss.add(files('fw_cfg.c'))
 stub_ss.add(files('gdbstub.c'))
 stub_ss.add(files('get-vm-name.c'))
 stub_ss.add(when: 'CONFIG_LINUX_IO_URING', if_true: files('io_uring.c'))
-stub_ss.add(files('iothread.c'))
 stub_ss.add(files('iothread-lock.c'))
 stub_ss.add(files('isa-bus.c'))
 stub_ss.add(files('is-daemonized.c'))
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..f9b0791084 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,6 @@ struct IOThread {
 bool stopping;
 };
 
-static __thread IOThread *my_iothread;
-
-AioContext *qemu_get_current_aio_context(void)
-{
-return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
-}
-
 static void iothread_init_gcontext(IOThread *iothread)
 {
 GSource *source;
@@ -54,9 +47,9 @@ static void *iothread_run(void *opaque)
 
 rcu_register_thread();
 
-my_iothread = iothread;
 qemu_mutex_lock(&iothread->init_done_lock);
 iothread->ctx = aio_context_new(&error_abort);
+qemu_set_current_aio_context(iothread->ctx);
 
 /*
  * We must connect the ctx to a GMainContext, because in older versions
diff --git a/util/async.c b/util/async.c
index 674dbefb7c..5d9b7cc1eb 100644
--- a/util/async.c
+++ b/util/async.c
@@ -649,3 +649,23 @@ void aio_context_release(AioContext *ctx)
 {
 

Re: [PATCH v2 3/3] qapi: deprecate drive-backup

2021-06-09 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 08.06.2021 14:12, Markus Armbruster wrote:
>> Vladimir Sementsov-Ogievskiy  writes:
>> 
>> [...]
>> 
>>> TODO: We also need to deprecate drive-backup transaction action..
>>> But union members in QAPI doesn't support 'deprecated' feature. I tried
>>> to dig a bit, but failed :/ Markus, could you please help with it? At
>>> least by advice?
>> 
>> There are two closely related things in play here: the union branch and
>> the corresponding enum value.
>> 
>> So far, the QAPI schema language doesn't support tacking feature flags
>> to either.
>> 
>> If an enum value is deprecated, any union branches corresponding to it
>> must also be deprecated (because their use requires using the deprecated
>> enum value).
>> 
>> The converse is not true, but I can't see a use for deprecating a union
>> branch without also deprecating the enum member.
>> 
>> I think we can implement feature flags just for enum members, then
>> document that 'deprecated' enum value implies corresponding union
>> branches are also deprecated.
>> 
>> I have unfinished patches implementing feature flags for enum members.
>> 
>> Since TransactionAction is a simple union, the corresponding enum is
>> implicit.  We can make it explicit by converting to a flat union.
>> Simple unions need to die anyway.
>
>
> Does BlockStatsSpecific from qapi/block-core.json a correct example of flat 
> union you mean? I can make patch to convert TransactionAction to be similar 
> if that helps (discriminator field should be called "type", yes?).

From docs/devel/qapi-code-gen.txt:

A simple union can always be re-written as a flat union where the base
class has a single member named 'type', and where each branch of the
union has a struct with a single member named 'data'.  That is,

 { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } }

is identical on the wire to:

 { 'enum': 'Enum', 'data': ['one', 'two'] }
 { 'struct': 'Branch1', 'data': { 'data': 'str' } }
 { 'struct': 'Branch2', 'data': { 'data': 'int' } }
 { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type',
   'data': { 'one': 'Branch1', 'two': 'Branch2' } }

The generated C isn't identical, but adjusting the code using it should
be straightforward.

>> Does this make sense?
>> 
>
> Yes if it helps)
>
> Did you also look at John's 
> https://gitlab.com/jsnow/qemu/-/commits/hack-deprecate-union-branches/ ?

Not yet.

> I hope you and John will send patches that you have, I'll help with reviewing 
> (keep me in CC), and finally we'll get the feature.

Sounds like a plan.  I need to get my post-vacation e-mail pileup under
control first.




Re: [PATCH v3 06/33] util/async: aio_co_schedule(): support reschedule in same ctx

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 12:35, Paolo Bonzini wrote:

On 08/06/21 20:45, Vladimir Sementsov-Ogievskiy wrote:

14.05.2021 00:04, Paolo Bonzini wrote:

On 12/05/21 09:15, Vladimir Sementsov-Ogievskiy wrote:




I don't understand.  Why doesn't aio_co_enter go through the ctx != 
qemu_get_current_aio_context() branch and just do aio_co_schedule? That was at 
least the idea behind aio_co_wake and aio_co_enter.



Because ctx is exactly qemu_get_current_aio_context(), as we are not in 
iothread but in nbd connection thread. So, qemu_get_current_aio_context() 
returns qemu_aio_context.


So the problem is that threads other than the main thread and
the I/O thread should not return qemu_aio_context.  The vCPU thread
may need to return it when running with BQL taken, though.

Something like this (untested):

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's AioContext.  If
- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
   */
  AioContext *qemu_get_current_aio_context(void);

+void qemu_set_current_aio_context(AioContext *ctx);
+
  /**
   * aio_context_setup:
   * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..22b967e77c 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,11 +39,23 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
  #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
  #endif

-static __thread IOThread *my_iothread;
+static __thread AioContext *my_aiocontext;
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+    assert(!my_aiocontext);
+    my_aiocontext = ctx;
+}

  AioContext *qemu_get_current_aio_context(void)
  {
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+    if (my_aiocontext) {
+    return my_aiocontext;
+    }
+    if (qemu_mutex_iothread_locked()) {
+    return qemu_get_aio_context();
+    }
+    return NULL;
  }

  static void *iothread_run(void *opaque)
@@ -56,7 +68,7 @@ static void *iothread_run(void *opaque)
   * in this new thread uses glib.
   */
  g_main_context_push_thread_default(iothread->worker_context);
-    my_iothread = iothread;
+    qemu_set_current_aio_context(iothread->ctx);
  iothread->thread_id = qemu_get_thread_id();
  qemu_sem_post(&iothread->init_done_sem);

diff --git a/stubs/iothread.c b/stubs/iothread.c
index 8cc9e28c55..25ff398894 100644
--- a/stubs/iothread.c
+++ b/stubs/iothread.c
@@ -6,3 +6,7 @@ AioContext *qemu_get_current_aio_context(void)
  {
  return qemu_get_aio_context();
  }
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+}
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..cab38b3da8 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,26 @@ struct IOThread {
  bool stopping;
  };

-static __thread IOThread *my_iothread;
+static __thread AioContext *my_aiocontext;
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+    assert(!my_aiocontext);
+    my_aiocontext = ctx;
+}

  AioContext *qemu_get_current_aio_context(void)
  {
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+    if (my_aiocontext) {
+    return my_aiocontext;
+    }
+    if (qemu_mutex_iothread_locked()) {
+    return qemu_get_aio_context();
+    }
+    return NULL;
  }

+
  static void iothread_init_gcontext(IOThread *iothread)
  {
  GSource *source;
@@ -54,7 +67,7 @@ static void *iothread_run(void *opaque)

  rcu_register_thread();

-    my_iothread = iothread;
+    qemu_set_current_aio_context(iothread->ctx);
  qemu_mutex_lock(&iothread->init_done_lock);
  iothread->ctx = aio_context_new(&error_abort);

diff --git a/util/main-loop.c b/util/main-loop.c
index d9c55df6f5..4ae5b23e99 100644
--- a/util/main-loop.c
+++ b/util/main-loop.c
@@ -170,6 +170,7 @@ int qemu_init_main_loop(Error **errp)
  if (!qemu_aio_context) {
  return -EMFILE;
  }
+    qemu_set_current_aio_context(qemu_aio_context);
  qemu_notify_bh = qemu_bh_new(notify_event_cb, NULL);
  gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
  src = aio_get_g_source(qemu_aio_context);

If it works for you, I can post it as a formal patch.



This doesn't work for iotests.. qemu-io goes through version in stub. It works 
if I add:


Great, thanks.  I'll try the patch, also with the test that broke with the 
earlier version, and post if it works.



Thanks, I'll base v4 of nbd patches on it.

I now run make check. test-aio-multithread crashes on assertion:

(gdb) bt
#0  0x7f4af8d839d5 in raise () from /lib64/libc.so.6
#1  0x7f4af8d6c8a4 in abort () from /lib64/libc.so.6
#2  0x7f4af8d6c789 in 

Re: [PATCH v3 1/5] block-copy: streamline choice of copy_range vs. read/write

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

09.06.2021 12:33, Paolo Bonzini wrote:

On 09/06/21 10:51, Vladimir Sementsov-Ogievskiy wrote:


+    default:
[...]
+    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
+    if (ret < 0) {
+    trace_block_copy_read_fail(s, offset, ret);
+    *error_is_read = true;
+    goto out;
+    }
+    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
+    s->write_flags);
+    if (ret < 0) {
+    trace_block_copy_write_fail(s, offset, ret);
+    *error_is_read = false;
+    goto out;
+    }
+out:
+    qemu_vfree(bounce_buffer);


label inside switch operator? Rather unusual. Please, let's avoid it and just 
keep out: after switch operator.


Agreed with all comments except this one, the bounce_buffer doesn't exist in 
the other cases.


Hmm, as well as "return ret" actually is used only for this "default:" case, 
other paths returns earlier :) Also, bounce_buffer is defined in outer scope anyway. So I don't 
think that overall picture becomes better from isolation point of view with this change. Maybe good 
refactoring is moving default branch to a separate helper function together with bounce_buffer 
local variable.

Still, I don't care too much, keep it as is if you want, that's works for me.

The thing that comes to my mind not the first time: how to make something 
similar with g_autofree for qemu_blockalign()?

I can imagine how to implement a macro like ERRP_GUARD, which will work like

void *bounce_buffer = qemu_blockalign(...);
QEMU_AUTO_VFREE(bounce_buffer);

...




+    ret = block_copy_do_copy(s, t->offset, t->bytes, &method, &error_is_read);
+    if (s->method == t->method) {
+    s->method = method;


you leave another t->s occurrences in the function untouched. It's somehow inconsistent. Could we just use t->s in this patch, and refactor with a follow-up patch (or as preparing patch)? 


Maybe as a first patch, yes.

Paolo




--
Best regards,
Vladimir



Re: [PATCH v3 06/33] util/async: aio_co_schedule(): support reschedule in same ctx

2021-06-09 Thread Paolo Bonzini

On 08/06/21 20:45, Vladimir Sementsov-Ogievskiy wrote:

14.05.2021 00:04, Paolo Bonzini wrote:

On 12/05/21 09:15, Vladimir Sementsov-Ogievskiy wrote:




I don't understand.  Why doesn't aio_co_enter go through the ctx != 
qemu_get_current_aio_context() branch and just do aio_co_schedule? 
That was at least the idea behind aio_co_wake and aio_co_enter.




Because ctx is exactly qemu_get_current_aio_context(), as we are not 
in iothread but in nbd connection thread. So, 
qemu_get_current_aio_context() returns qemu_aio_context.


So the problem is that threads other than the main thread and
the I/O thread should not return qemu_aio_context.  The vCPU thread
may need to return it when running with BQL taken, though.

Something like this (untested):

diff --git a/include/block/aio.h b/include/block/aio.h
index 5f342267d5..10fcae1515 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -691,10 +691,13 @@ void aio_co_enter(AioContext *ctx, struct 
Coroutine *co);

   * Return the AioContext whose event loop runs in the current thread.
   *
   * If called from an IOThread this will be the IOThread's 
AioContext.  If

- * called from another thread it will be the main loop AioContext.
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
   */
  AioContext *qemu_get_current_aio_context(void);

+void qemu_set_current_aio_context(AioContext *ctx);
+
  /**
   * aio_context_setup:
   * @ctx: the aio context
diff --git a/iothread.c b/iothread.c
index 7f086387be..22b967e77c 100644
--- a/iothread.c
+++ b/iothread.c
@@ -39,11 +39,23 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
  #define IOTHREAD_POLL_MAX_NS_DEFAULT 0ULL
  #endif

-static __thread IOThread *my_iothread;
+static __thread AioContext *my_aiocontext;
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+    assert(!my_aiocontext);
+    my_aiocontext = ctx;
+}

  AioContext *qemu_get_current_aio_context(void)
  {
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+    if (my_aiocontext) {
+    return my_aiocontext;
+    }
+    if (qemu_mutex_iothread_locked()) {
+    return qemu_get_aio_context();
+    }
+    return NULL;
  }

  static void *iothread_run(void *opaque)
@@ -56,7 +68,7 @@ static void *iothread_run(void *opaque)
   * in this new thread uses glib.
   */
  g_main_context_push_thread_default(iothread->worker_context);
-    my_iothread = iothread;
+    qemu_set_current_aio_context(iothread->ctx);
  iothread->thread_id = qemu_get_thread_id();
  qemu_sem_post(&iothread->init_done_sem);

diff --git a/stubs/iothread.c b/stubs/iothread.c
index 8cc9e28c55..25ff398894 100644
--- a/stubs/iothread.c
+++ b/stubs/iothread.c
@@ -6,3 +6,7 @@ AioContext *qemu_get_current_aio_context(void)
  {
  return qemu_get_aio_context();
  }
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+}
diff --git a/tests/unit/iothread.c b/tests/unit/iothread.c
index afde12b4ef..cab38b3da8 100644
--- a/tests/unit/iothread.c
+++ b/tests/unit/iothread.c
@@ -30,13 +30,26 @@ struct IOThread {
  bool stopping;
  };

-static __thread IOThread *my_iothread;
+static __thread AioContext *my_aiocontext;
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+    assert(!my_aiocontext);
+    my_aiocontext = ctx;
+}

  AioContext *qemu_get_current_aio_context(void)
  {
-    return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+    if (my_aiocontext) {
+    return my_aiocontext;
+    }
+    if (qemu_mutex_iothread_locked()) {
+    return qemu_get_aio_context();
+    }
+    return NULL;
  }

+
  static void iothread_init_gcontext(IOThread *iothread)
  {
  GSource *source;
@@ -54,7 +67,7 @@ static void *iothread_run(void *opaque)

  rcu_register_thread();

-    my_iothread = iothread;
+    qemu_set_current_aio_context(iothread->ctx);
  qemu_mutex_lock(&iothread->init_done_lock);
  iothread->ctx = aio_context_new(&error_abort);

diff --git a/util/main-loop.c b/util/main-loop.c
index d9c55df6f5..4ae5b23e99 100644
--- a/util/main-loop.c
+++ b/util/main-loop.c
@@ -170,6 +170,7 @@ int qemu_init_main_loop(Error **errp)
  if (!qemu_aio_context) {
  return -EMFILE;
  }
+    qemu_set_current_aio_context(qemu_aio_context);
  qemu_notify_bh = qemu_bh_new(notify_event_cb, NULL);
  gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
  src = aio_get_g_source(qemu_aio_context);

If it works for you, I can post it as a formal patch.



This doesn't work for iotests.. qemu-io goes through version in stub. It 
works if I add:


Great, thanks.  I'll try the patch, also with the test that broke with 
the earlier version, and post if it works.


Paolo


diff --git a/stubs/iothread.c b/stubs/iothread.c
index 8cc9e28c55..967a01c4f0 100644
--- a/stubs/iothread.c
+++ b/stubs/iothread.c
@@ -2,7 +2,18 @@
  #include "block/aio.h"
  #include "qemu/main-loop.h"

+static __thread AioContext *my_aiocontext;
+
+void qemu_set_current_aio_context(AioContext 

Re: [PATCH v3 1/5] block-copy: streamline choice of copy_range vs. read/write

2021-06-09 Thread Paolo Bonzini

On 09/06/21 10:51, Vladimir Sementsov-Ogievskiy wrote:


+    default:
[...]
+bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 
0);

+    if (ret < 0) {
+    trace_block_copy_read_fail(s, offset, ret);
+    *error_is_read = true;
+    goto out;
+    }
+    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
+    s->write_flags);
+    if (ret < 0) {
+    trace_block_copy_write_fail(s, offset, ret);
+    *error_is_read = false;
+    goto out;
+    }
+out:
+    qemu_vfree(bounce_buffer);


label inside switch operator? Rather unusual. Please, let's avoid it and 
just keep out: after switch operator.


Agreed with all comments except this one, the bounce_buffer doesn't 
exist in the other cases.



+ret = block_copy_do_copy(s, t->offset, t->bytes, &method, &error_is_read);
+if (s->method == t->method) {
+s->method = method;


you leave another t->s occurrences in the function untouched. It's somehow inconsistent. Could we just use t->s in this patch, and refactor with a follow-up patch (or as preparing patch)? 


Maybe as a first patch, yes.

Paolo




Re: [PATCH v3 2/5] block-copy: improve comments of BlockCopyTask and BlockCopyState types and functions

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

08.06.2021 10:33, Emanuele Giuseppe Esposito wrote:

As done in BlockCopyCallState, categorize BlockCopyTask
and BlockCopyState in IN, State and OUT fields.
This is just to understand which field has to be protected with a lock.

.sleep_state is handled in the series "coroutine: new sleep/wake API"
and thus here left as TODO.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/block-copy.c | 47 ++
  1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index d58051288b..b3533a3003 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -56,25 +56,33 @@ typedef struct BlockCopyCallState {
  QLIST_ENTRY(BlockCopyCallState) list;
  
  /* State */


Why previous @list field is not in the state? For sure it's not an IN parameter 
and should be protected somehow.


-int ret;
  bool finished;
-QemuCoSleep sleep;
-bool cancelled;
+QemuCoSleep sleep; /* TODO: protect API with a lock */
  
  /* OUT parameters */

+bool cancelled;
  bool error_is_read;
+int ret;
  } BlockCopyCallState;
  
  typedef struct BlockCopyTask {

  AioTask task;
  
+/*

+ * IN parameters. Initialized in block_copy_task_create()
+ * and never changed.
+ */
  BlockCopyState *s;
  BlockCopyCallState *call_state;
  int64_t offset;
-int64_t bytes;
-BlockCopyMethod method;
-QLIST_ENTRY(BlockCopyTask) list;
+int64_t bytes; /* only re-set in task_shrink, before running the task */
+BlockCopyMethod method; /* initialized in block_copy_dirty_clusters() */


hmm. to be precise method is initialized in block_copy_task_create.

And after block_copy_task_create finished, task is in the list and can be read 
by parallel block_copy_dirty_clusters(). So, @bytes is part of State, we must 
protect it..

method is only read by block_copy_task_entry(), so can be modified without any 
protection before running the task.


+
+/* State */
  CoQueue wait_queue; /* coroutines blocked on this task */
+
+/* To reference all call states from BlockCopyState */


That's a misleading comment.. not all sates but all tasks. I don't think we 
need this new comment, just keep @list in State section.


+QLIST_ENTRY(BlockCopyTask) list;
  } BlockCopyTask;
  
  static int64_t task_end(BlockCopyTask *task)

@@ -90,15 +98,25 @@ typedef struct BlockCopyState {
   */
  BdrvChild *source;
  BdrvChild *target;
-BdrvDirtyBitmap *copy_bitmap;
+
+/* State */
  int64_t in_flight_bytes;
-int64_t cluster_size;
  BlockCopyMethod method;
-int64_t max_transfer;
-uint64_t len;
  QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls 
*/
  QLIST_HEAD(, BlockCopyCallState) calls;
+/* State fields that use a thread-safe API */
+BdrvDirtyBitmap *copy_bitmap;
+ProgressMeter *progress;
+SharedResource *mem;
+RateLimit rate_limit;
  
+/*

+ * IN parameters. Initialized in block_copy_state_new()
+ * and never changed.
+ */
+int64_t cluster_size;
+int64_t max_transfer;
+uint64_t len;
  BdrvRequestFlags write_flags;
  
  /*

@@ -114,14 +132,11 @@ typedef struct BlockCopyState {
   * In this case, block_copy() will query the source’s allocation status,
   * skip unallocated regions, clear them in the copy_bitmap, and invoke
   * block_copy_reset_unallocated() every time it does.
+ *
+ * This field is set in backup_run() before coroutines are run,
+ * therefore is an IN.
   */
  bool skip_unallocated;
-
-ProgressMeter *progress;
-
-SharedResource *mem;
-
-RateLimit rate_limit;
  } BlockCopyState;
  
  static BlockCopyTask *find_conflicting_task(BlockCopyState *s,





--
Best regards,
Vladimir



Re: Prevent compiler warning on block.c

2021-06-09 Thread Kevin Wolf
Am 09.06.2021 um 09:12 hat Thomas Huth geschrieben:
> On 05/05/2021 10.05, Vladimir Sementsov-Ogievskiy wrote:
> > 05.05.2021 10:59, Miroslav Rezanina wrote:
> > > Commit 3108a15cf (block: introduce bdrv_drop_filter()) introduced
> > > uninitialized
> > > variable to_cow_parent in bdrv_replace_node_common function that is
> > > used only when
> > > detach_subchain is true. It is used in two places. First if block
> > > properly initialize
> > > the variable and second block use it.
> > > 
> > > However, compiler treats this two blocks as two independent cases so
> > > it thinks first
> > > block can fail test and second one pass (although both use same
> > > condition). This cause
> > > warning that variable can be uninitialized in second block.
> > > 
> > > To prevent this warning, initialize the variable with NULL.
> > > 
> > > Signed-off-by: Miroslav Rezanina 
> > > ---
> > > diff --git a/block.c b/block.c
> > > index 874c22c43e..3ca27bd2d9 100644
> > > --- a/block.c
> > > +++ b/block.c
> > > @@ -4851,7 +4851,7 @@ static int
> > > bdrv_replace_node_common(BlockDriverState *from,
> > >   Transaction *tran = tran_new();
> > >   g_autoptr(GHashTable) found = NULL;
> > >   g_autoptr(GSList) refresh_list = NULL;
> > > -    BlockDriverState *to_cow_parent;
> > > +    BlockDriverState *to_cow_parent = NULL;
> > >   int ret;
> > > 
> > >   if (detach_subchain) {
> > > 
> > 
> > Reviewed-by: Vladimir Sementsov-Ogievskiy 
> 
> This just popped up again here:
> 
>  https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02067.html
> 
> Kevin, Max, could you please pick it up to get this problem fixed?

Thanks for pinging, seems the intended refactoring hasn't happened.

I've applied this one to my block branch now.

Kevin




Re: [PATCH v3 1/5] block-copy: streamline choice of copy_range vs. read/write

2021-06-09 Thread Vladimir Sementsov-Ogievskiy

08.06.2021 10:33, Emanuele Giuseppe Esposito wrote:

From: Paolo Bonzini 

Put the logic to determine the copy size in a separate function, so
that there is a simple state machine for the possible methods of
copying data from one BlockDriverState to the other.

Use .method instead of .copy_range as in-out argument, and
include also .zeroes as an additional copy method.

While at it, store the common computation of block_copy_max_transfer
into a new field of BlockCopyState, and make sure that we always
obey max_transfer; that's more efficient even for the
COPY_RANGE_READ_WRITE case.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Paolo Bonzini 


In general looks OK to me, I mostly have style remarks, see below.


---
  block/block-copy.c | 171 ++---
  1 file changed, 85 insertions(+), 86 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 943e30b7e6..d58051288b 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -28,6 +28,14 @@
  #define BLOCK_COPY_MAX_WORKERS 64
  #define BLOCK_COPY_SLICE_TIME 1ULL /* ns */
  
+typedef enum {

+COPY_READ_WRITE_CLUSTER,
+COPY_READ_WRITE,
+COPY_WRITE_ZEROES,
+COPY_RANGE_SMALL,
+COPY_RANGE_FULL
+} BlockCopyMethod;
+
  static coroutine_fn int block_copy_task_entry(AioTask *task);
  
  typedef struct BlockCopyCallState {

@@ -64,8 +72,7 @@ typedef struct BlockCopyTask {
  BlockCopyCallState *call_state;
  int64_t offset;
  int64_t bytes;
-bool zeroes;
-bool copy_range;
+BlockCopyMethod method;
  QLIST_ENTRY(BlockCopyTask) list;
  CoQueue wait_queue; /* coroutines blocked on this task */
  } BlockCopyTask;
@@ -86,8 +93,8 @@ typedef struct BlockCopyState {
  BdrvDirtyBitmap *copy_bitmap;
  int64_t in_flight_bytes;
  int64_t cluster_size;
-bool use_copy_range;
-int64_t copy_size;
+BlockCopyMethod method;
+int64_t max_transfer;
  uint64_t len;
  QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls 
*/
  QLIST_HEAD(, BlockCopyCallState) calls;
@@ -149,6 +156,24 @@ static bool coroutine_fn 
block_copy_wait_one(BlockCopyState *s, int64_t offset,
  return true;
  }
  
+static int64_t block_copy_chunk_size(BlockCopyState *s)

+{
+switch (s->method) {
+case COPY_READ_WRITE_CLUSTER:
+return s->cluster_size;
+case COPY_READ_WRITE:
+case COPY_RANGE_SMALL:
+return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
+   s->max_transfer);
+case COPY_RANGE_FULL:
+return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+   s->max_transfer);
+default:
+/* Cannot have COPY_WRITE_ZEROES here.  */
+abort();
+}
+}
+
  /*
   * Search for the first dirty area in offset/bytes range and create task at
   * the beginning of it.
@@ -158,8 +183,9 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState 
*s,
   int64_t offset, int64_t bytes)
  {
  BlockCopyTask *task;
-int64_t max_chunk = MIN_NON_ZERO(s->copy_size, call_state->max_chunk);
+int64_t max_chunk = block_copy_chunk_size(s);
  
+max_chunk = MIN_NON_ZERO(max_chunk, call_state->max_chunk);

  if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
 offset, offset + bytes,
 max_chunk, &offset, &bytes))
@@ -183,7 +209,7 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState 
*s,
  .call_state = call_state,
  .offset = offset,
  .bytes = bytes,
-.copy_range = s->use_copy_range,
+.method = s->method,
  };
  qemu_co_queue_init(&task->wait_queue);
  QLIST_INSERT_HEAD(&s->tasks, task, list);
@@ -267,28 +293,27 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, 
BdrvChild *target,
  .len = bdrv_dirty_bitmap_size(copy_bitmap),
  .write_flags = write_flags,
  .mem = shres_create(BLOCK_COPY_MAX_MEM),
+.max_transfer = QEMU_ALIGN_DOWN(block_copy_max_transfer(source, target)
+, cluster_size),


It seems unusual not to keep the comma on the previous line (and it actually fits 
into 80 characters).

I've grepped several places with '^\s*,' pattern, for example in 
target/mips/tcg/msa_helper.c. But at least in this file there is no such 
practice, let's be consistent.


  };
  
-if (block_copy_max_transfer(source, target) < cluster_size) {

+if (s->max_transfer < cluster_size) {
  /*
   * copy_range does not respect max_transfer. We don't want to bother
   * with requests smaller than block-copy cluster size, so fallback to
   * buffered copying (read and write respect max_transfer on their
   * behalf).
   */
-s->use_copy_range = false;
-s->copy_size = cluster_size;
+s->method = COPY_READ_WRITE_CLUSTER;
  } 

Re: Prevent compiler warning on block.c

2021-06-09 Thread Thomas Huth

On 05/05/2021 10.05, Vladimir Sementsov-Ogievskiy wrote:

05.05.2021 10:59, Miroslav Rezanina wrote:
Commit 3108a15cf (block: introduce bdrv_drop_filter()) introduced an 
uninitialized
variable, to_cow_parent, in the bdrv_replace_node_common function, which is used 
only when
detach_subchain is true. It is used in two places. The first if block properly 
initializes
the variable and the second block uses it.

However, the compiler treats these two blocks as two independent cases, so it 
thinks the first
block can fail the test and the second one pass (although both use the same 
condition). This causes a

warning that the variable can be uninitialized in the second block.

To prevent this warning, initialize the variable with NULL.

Signed-off-by: Miroslav Rezanina 
---
diff --git a/block.c b/block.c
index 874c22c43e..3ca27bd2d9 100644
--- a/block.c
+++ b/block.c
@@ -4851,7 +4851,7 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,

  Transaction *tran = tran_new();
  g_autoptr(GHashTable) found = NULL;
  g_autoptr(GSList) refresh_list = NULL;
-    BlockDriverState *to_cow_parent;
+    BlockDriverState *to_cow_parent = NULL;
  int ret;

  if (detach_subchain) {



Reviewed-by: Vladimir Sementsov-Ogievskiy 


This just popped up again here:

 https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02067.html

Kevin, Max, could you please pick it up to get this problem fixed?

 Thomas




Re: [PATCH 1/4] block.c: fix compilation error on possible unitialized variable

2021-06-09 Thread Thomas Huth

On 08/06/2021 16.09, Cleber Rosa wrote:

GCC from CentOS Stream 8 is erroring out on a possibly uninitialized
variable.

Full version info for the compiler used:

  gcc (GCC) 8.5.0 20210514 (Red Hat 8.5.0-1)

Signed-off-by: Cleber Rosa 
---
  block.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 3f456892d0..08f29e6b65 100644
--- a/block.c
+++ b/block.c
@@ -4866,7 +4866,7 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
  Transaction *tran = tran_new();
  g_autoptr(GHashTable) found = NULL;
  g_autoptr(GSList) refresh_list = NULL;
-BlockDriverState *to_cow_parent;
+BlockDriverState *to_cow_parent = NULL;
  int ret;


Already reported here:

 https://lists.gnu.org/archive/html/qemu-devel/2021-05/msg01229.html

 Thomas