[PATCH v2 2/3] hw/nvme: namespace parameter for EUI-64

2021-06-11 Thread Heinrich Schuchardt
The EUI-64 field is the only identifier for NVMe namespaces in UEFI device
paths. Add a new namespace property "eui64" that provides the user the
option to specify the EUI-64.

Signed-off-by: Heinrich Schuchardt 
Acked-by: Klaus Jensen 
---
v2:
fix typo %s/EUI64/EUI-64/
---
 docs/system/nvme.rst |  4 +++
 hw/nvme/ctrl.c   | 58 ++--
 hw/nvme/ns.c |  2 ++
 hw/nvme/nvme.h   |  1 +
 4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst
index f7f63d6bf6..b5f8288d7c 100644
--- a/docs/system/nvme.rst
+++ b/docs/system/nvme.rst
@@ -81,6 +81,10 @@ There are a number of parameters available:
   Set the UUID of the namespace. This will be reported as a "Namespace UUID"
   descriptor in the Namespace Identification Descriptor List.

+``eui64``
+  Set the EUI-64 of the namespace. This will be reported as an "IEEE Extended
+  Unique Identifier" descriptor in the Namespace Identification Descriptor List.
+
 ``bus``
   If there are more ``nvme`` devices defined, this parameter may be used to
   attach the namespace to a specific ``nvme`` device (identified by an ``id``
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8dd9cb2ccb..f37c4fd635 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4436,19 +4436,19 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
-
-struct data {
-struct {
-NvmeIdNsDescr hdr;
-uint8_t v[NVME_NIDL_UUID];
-} uuid;
-struct {
-NvmeIdNsDescr hdr;
-uint8_t v;
-} csi;
-};
-
-struct data *ns_descrs = (struct data *)list;
+uint8_t *pos = list;
+struct {
+NvmeIdNsDescr hdr;
+uint8_t v[NVME_NIDL_UUID];
+} QEMU_PACKED uuid;
+struct {
+NvmeIdNsDescr hdr;
+uint64_t v;
+} QEMU_PACKED eui64;
+struct {
+NvmeIdNsDescr hdr;
+uint8_t v;
+} QEMU_PACKED csi;

 trace_pci_nvme_identify_ns_descr_list(nsid);

@@ -4462,17 +4462,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 }

 /*
- * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
- * structure, a Namespace UUID (nidt = 3h) must be reported in the
- * Namespace Identification Descriptor. Add the namespace UUID here.
+ * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
+ * provide a valid Namespace UUID in the Namespace Identification Descriptor
+ * data structure. QEMU does not yet support setting NGUID.
  */
-ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
-ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
-memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
-
-ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
-ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
-ns_descrs->csi.v = ns->csi;
+uuid.hdr.nidt = NVME_NIDT_UUID;
+uuid.hdr.nidl = NVME_NIDL_UUID;
+memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
+memcpy(pos, &uuid, sizeof(uuid));
+pos += sizeof(uuid);
+
+if (ns->params.eui64) {
+eui64.hdr.nidt = NVME_NIDT_EUI64;
+eui64.hdr.nidl = NVME_NIDL_EUI64;
+eui64.v = cpu_to_be64(ns->params.eui64);
+memcpy(pos, &eui64, sizeof(eui64));
+pos += sizeof(eui64);
+}
+
+csi.hdr.nidt = NVME_NIDT_CSI;
+csi.hdr.nidl = NVME_NIDL_CSI;
+csi.v = ns->csi;
+memcpy(pos, &csi, sizeof(csi));
+pos += sizeof(csi);

 return nvme_c2h(n, list, sizeof(list), req);
 }
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 3fec9c6273..45e457de6a 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -77,6 +77,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
 id_ns->mcl = cpu_to_le32(ns->params.mcl);
 id_ns->msrc = ns->params.msrc;
+id_ns->eui64 = cpu_to_be64(ns->params.eui64);

 ds = 31 - clz32(ns->blkconf.logical_block_size);
 ms = ns->params.ms;
@@ -511,6 +512,7 @@ static Property nvme_ns_props[] = {
 DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
+DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0),
 DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
 DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
 DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 93a7e0e538..ac90e13d7b 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -83,6 +83,7 @@ typedef struct NvmeNamespaceParams {
 bool shared;
 uint32_t nsid;
 QemuUUID uuid;
+uint64_t eui64;

 uint16_t ms;
 uint8_t  mset;
--
2.30.2
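
For reference, here is a minimal sketch of the wire format the rewritten
function emits, assuming the NvmeIdNsDescr layout and the
NVME_NIDT_*/NVME_NIDL_* constants from QEMU's include/block/nvme.h (the
command-line value is hypothetical, e.g.
"-device nvme-ns,drive=nvm-1,eui64=0x0102030405060708"):

/* Each descriptor is a 4-byte header followed by nidl payload bytes. */
typedef struct QEMU_PACKED NvmeIdNsDescr {
    uint8_t nidt;       /* type: 1h = EUI-64, 3h = UUID, 4h = CSI */
    uint8_t nidl;       /* payload length in bytes: 8, 16 or 1 */
    uint8_t rsvd2[2];
} NvmeIdNsDescr;

/*
 * With a non-zero "eui64" the 4096-byte response is laid out as:
 *
 *   [nidt=3h nidl=16][16-byte UUID]     always present
 *   [nidt=1h nidl=8 ][ 8-byte EUI-64]   only if params.eui64 != 0
 *   [nidt=4h nidl=1 ][ 1-byte CSI]
 *   [zero bytes ...]                    terminates the list
 *
 * The EUI-64 payload is big-endian, hence the cpu_to_be64() above.
 */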




[PATCH v2 0/3] hw/nvme: namespace parameter for EUI-64

2021-06-11 Thread Heinrich Schuchardt
The EUI-64 field is the only identifier for NVMe namespaces in UEFI device
paths. Add a new namespace property "eui64" that provides the user the
option to specify the EUI-64.

v2:
include patch for hw_compat_6_0
add a patch to implement default values for the EUI-64

Heinrich Schuchardt (3):
  hw: virt: consider hw_compat_6_0
  hw/nvme: namespace parameter for EUI-64
  hw/nvme: default for namespace EUI-64

 docs/system/nvme.rst |  6 +
 hw/arm/virt.c|  2 ++
 hw/core/machine.c|  1 +
 hw/nvme/ctrl.c   | 58 ++--
 hw/nvme/ns.c | 11 +
 hw/nvme/nvme.h   |  3 +++
 6 files changed, 58 insertions(+), 23 deletions(-)

--
2.30.2




[PATCH v2 1/3] hw: virt: consider hw_compat_6_0

2021-06-11 Thread Heinrich Schuchardt
virt-6.0 must consider hw_compat_6_0.

Fixes: da7e13c00b59 ("hw: add compat machines for 6.1")
Signed-off-by: Heinrich Schuchardt 
Reviewed-by: Cornelia Huck 
---
v2:
add missing Fixes: tag
---
 hw/arm/virt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 840758666d..8bc3b408fe 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2764,6 +2764,8 @@ DEFINE_VIRT_MACHINE_AS_LATEST(6, 1)

 static void virt_machine_6_0_options(MachineClass *mc)
 {
+virt_machine_6_1_options(mc);
+compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
 }
 DEFINE_VIRT_MACHINE(6, 0)

--
2.30.2
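
For readers new to the compat machinery, a hedged sketch of what these two
lines accomplish (simplified; the real plumbing lives in hw/core/machine.c
and the qdev global-property code):

/*
 * Sketch only: versioned machine types form a chain.  An older machine
 * type first inherits the newer options, then stacks compat properties
 * that undo guest-visible changes made since its release.
 */
static void virt_machine_6_0_options(MachineClass *mc)
{
    virt_machine_6_1_options(mc);   /* inherit the 6.1 defaults first */
    /* ...then force 6.0-era property defaults back, e.g. the
     * { "nvme-ns", "eui64-default", "off" } entry from patch 3/3 */
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

Without the fix, virt-6.0 would silently pick up 6.1 device defaults (such
as the autogenerated EUI-64 from patch 3/3), undermining guest ABI
stability for the older machine type.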




[PATCH v2 3/3] hw/nvme: default for namespace EUI-64

2021-06-11 Thread Heinrich Schuchardt
On machines with version > 6.0, replace a missing EUI-64 with a generated
value.

Signed-off-by: Heinrich Schuchardt 
---
v2:
new patch
---
 docs/system/nvme.rst | 2 ++
 hw/core/machine.c| 1 +
 hw/nvme/ns.c | 9 +
 hw/nvme/nvme.h   | 2 ++
 4 files changed, 14 insertions(+)

diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst
index b5f8288d7c..33a15c7dbc 100644
--- a/docs/system/nvme.rst
+++ b/docs/system/nvme.rst
@@ -84,6 +84,8 @@ There are a number of parameters available:
 ``eui64``
  Set the EUI-64 of the namespace. This will be reported as an "IEEE Extended
  Unique Identifier" descriptor in the Namespace Identification Descriptor List.
+  Since machine type 6.1, a non-zero default value is used if the parameter
+  is not provided. For earlier machine types, the field defaults to 0.

 ``bus``
   If there are more ``nvme`` devices defined, this parameter may be used to
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 55b9bc7817..d0e934 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -39,6 +39,7 @@
 GlobalProperty hw_compat_6_0[] = {
 { "gpex-pcihost", "allow-unmapped-accesses", "false" },
 { "i8042", "extended-state", "false"},
+{ "nvme-ns", "eui64-default", "off"},
 };
 const size_t hw_compat_6_0_len = G_N_ELEMENTS(hw_compat_6_0);

diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 45e457de6a..4275c3db63 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -56,6 +56,7 @@ void nvme_ns_init_format(NvmeNamespace *ns)

 static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+static uint64_t ns_count;
 NvmeIdNs *id_ns = &ns->id_ns;
 uint8_t ds;
 uint16_t ms;
@@ -73,6 +74,12 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 id_ns->nmic |= NVME_NMIC_NS_SHARED;
 }

+/* Substitute a missing EUI-64 by an autogenerated one */
+++ns_count;
+if (!ns->params.eui64 && ns->params.eui64_default) {
+ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT;
+}
+
 /* simple copy */
 id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
 id_ns->mcl = cpu_to_le32(ns->params.mcl);
@@ -533,6 +540,8 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
+ true),
 DEFINE_PROP_END_OF_LIST(),
 };

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index ac90e13d7b..3fb869731d 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -26,6 +26,7 @@

 #define NVME_MAX_CONTROLLERS 32
 #define NVME_MAX_NAMESPACES  256
+#define NVME_EUI64_DEFAULT 0x27fed9272381cbd0UL

 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
@@ -84,6 +85,7 @@ typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
 uint64_t eui64;
+bool eui64_default;

 uint16_t ms;
 uint8_t  mset;
--
2.30.2
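
A hedged illustration of the defaults this produces (my arithmetic from the
code above, not authoritative): ns_count is a function-local static, so it
ticks once per namespace initialised by this QEMU process, including
namespaces that do set eui64 explicitly, which means the generated values
depend on initialisation order.

/* Sketch: values for two namespaces on a > 6.0 machine that leave
 * eui64 unset, assuming they are the first two initialised. */
uint64_t eui_ns1 = NVME_EUI64_DEFAULT + 1;   /* 0x27fed9272381cbd1 */
uint64_t eui_ns2 = NVME_EUI64_DEFAULT + 2;   /* 0x27fed9272381cbd2 */
/* nvme_ns_init() then stores the value big-endian:
 *   id_ns->eui64 = cpu_to_be64(ns->params.eui64); */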




Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context

2021-06-11 Thread Nir Soffer
On Wed, Jun 9, 2021 at 9:01 PM Eric Blake  wrote:
>
> When trying to reconstruct a qcow2 chain using information provided
> over NBD, ovirt had been relying on an unsafe assumption that any
> portion of the qcow2 file advertised as sparse would defer to the
> backing image; this worked with what qemu 5.2 reports for a qcow2 BSD
> loaded with "backing":null.  However, in 6.0, commit 0da9856851 (nbd:
> server: Report holes for raw images) also had a side-effect of
> reporting unallocated zero clusters in qcow2 files as sparse.  This
> change is correct from the NBD spec perspective (advertising bits has
> always been optional based on how much information the server has
> available, and should only be used to optimize behavior when a bit is
> set, while not assuming semantics merely because a bit is clear), but
> means that a qcow2 file that uses an unallocated zero cluster to
> override a backing file now shows up as sparse over NBD, and causes
> ovirt to fail to reproduce that cluster (ie. ovirt was assuming it
> only had to write clusters where the bit was clear, and the 6.0
> behavior change shows the flaw in that assumption).
>
> The correct fix is for ovirt to additionally use the
> qemu:allocation-depth metadata context added in 5.2: after all, the
> actual determination for what is needed to recreate a qcow2 file is
> not whether a cluster is sparse, but whether the allocation-depth
> shows the cluster to be local.  But reproducing an image is more
> efficient when handling known-zero clusters, which means that ovirt
> has to track both base:allocation and qemu:allocation-depth metadata
> contexts simultaneously.  While NBD_CMD_BLOCK_STATUS is just fine
> sending back information for two contexts in parallel, it comes with
> some bookkeeping overhead at the client side: the two contexts need
> not report the same length of replies, and it involves more network
> traffic.

Since this change is not simple, and the chance that we also get the dirty
bitmap included in the result seems to be very low, I decided to check the
direction of merging multiple extents.

I started with merging "base:allocation" and "qemu:dirty-bitmap:xxx" since
we already have both. It was not hard to do, although it is not completely
tested yet.

Here is the merging code:
https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/ovirt_imageio/_internal/nbdutil.py

To make merging easy and safe, we map the NBD_STATE_DIRTY bit to a private bit
so it cannot clash with the NBD_STATE_HOLE bit:
https://gerrit.ovirt.org/c/ovirt-imageio/+/115215/1/daemon/ovirt_imageio/_internal/nbd.py

Here is a functional test using qemu-nbd showing that it works:
https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/test/client_test.py

I'll try to use "qemu:allocation-depth" in a similar way next week, probably
mapping depth > 0 to EXTENT_EXISTS, to use when reporting holes in
single qcow2 images.

If this is successful, we can start using this in the next ovirt release, and we
don't need "qemu:joint-allocation".

Nir
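
For those not following the gerrit links, the merging idea in a hedged C
sketch (hypothetical names and types; the actual ovirt-imageio code is
Python): walk the two sorted extent lists in lockstep, emit one sub-extent
per overlap, and OR the flags, with the dirty bit remapped to a private bit
so it cannot collide with NBD_STATE_HOLE:

#include <stdint.h>

typedef struct Extent {
    uint64_t length;
    uint32_t flags;              /* NBD_STATE_* plus private bits */
} Extent;

#define EXTENT_DIRTY (1u << 16)  /* private; clear of NBD_STATE_* bits */

/* Consume the common prefix of the heads of two per-context lists;
 * the caller pops a head once its remaining length reaches zero. */
static Extent merge_head(Extent *a, Extent *b)
{
    uint64_t len = a->length < b->length ? a->length : b->length;
    Extent out = { len, a->flags | b->flags };
    a->length -= len;
    b->length -= len;
    return out;
}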




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Eric Blake
On Sat, Jun 12, 2021 at 12:23:06AM +0300, Nir Soffer wrote:
> > Otherwise, you do have a point: "depth":1 in isolation is ambiguous
> > between "not allocated anywhere in this 1-element chain" and
> > "allocated at the first backing file in this chain of length 2 or
> > more".  At which point you can indeed use "qemu-img info" to determine
> > the backing chain depth.  How painful is that extra step?  Does it
> > justify the addition of a new optional "backing":true to any portion
> > of the file that was beyond the end of the chain (and omit that line
> > for all other regions, rather than printing "backing":false)?
> 
> Dealing with depth: N + 1 is not that painful, but also not great.
> 
> I think it is worth a little more effort, and it will save time in the long 
> term
> for users and for developers. Better APIs need simpler and shorter
> documentation and require less support.
> 
> I'm not sure about backing: false, maybe absent: true to match libnbd?

In the patch [1], I did "backing":true if the cluster was not found in
the chain, and omitted the bool altogether when the cluster is
present.  If we like the name "absent":true better than
"backing":true, that's an easy change.  The libnbd change for nbdinfo
to report 'absent' instead of 'unallocated' has not yet been released,
so we have some leeway on naming choices.

[1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg03067.html

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Nir Soffer
On Fri, Jun 11, 2021 at 9:34 PM Eric Blake  wrote:
>
> On Fri, Jun 11, 2021 at 08:35:01PM +0300, Nir Soffer wrote:
> > On Fri, Jun 11, 2021 at 4:28 PM Eric Blake  wrote:
> > >
> > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote:
> > > > > Yes, that might work as well.  But we didn't previously document
> > > > > depth to be optional.  Removing something from output risks breaking
> > > > > more downstream tools that expect it to be non-optional, compared to
> > > > > providing a new value.
> > > >
> > > > A negative value isn't any less unexpected than a missing key. I don't
> > > > think any existing tool would be able to handle it. Encoding different
> > > > meanings in a single value isn't very QAPI-like either. Usually strings
> > > > that are parsed are the problem, but negative integers really isn't that
> > > > much different. I don't really like this solution.
> > > >
> > > > Leaving out the depth feels like a better suggestion to me.
> > > >
> > > > But anyway, this seems to only happen at the end of the backing chain.
> > > > So if the backing chain consists of n images, why not report 'depth':
> > > > n + 1? So, in the above example, you would get 1. I think this has the
> > > > best chances of tools actually working correctly with the new output,
> > > > even though it's still not unlikely to break something.
> > >
> > > Ooh, I like that.  It is closer to reality - the file data really
> > > comes from the next depth, even if we have no filename at that depth.
> > > v2 of my patch coming up.
> >
> > How do you know the number of the layer? this info is not presented in
> > qemu-img map output.
...
> Otherwise, you do have a point: "depth":1 in isolation is ambiguous
> between "not allocated anywhere in this 1-element chain" and
> "allocated at the first backing file in this chain of length 2 or
> more".  At which point you can indeed use "qemu-img info" to determine
> the backing chain depth.  How painful is that extra step?  Does it
> justify the addition of a new optional "backing":true to any portion
> of the file that was beyond the end of the chain (and omit that line
> for all other regions, rather than printing "backing":false)?

Dealing with depth: N + 1 is not that painful, but also not great.

I think it is worth a little more effort, and it will save time in the long term
for users and for developers. Better APIs need simpler and shorter
documentation and require less support.

I'm not sure about backing: false, maybe absent: true to match libnbd?

Nir



Re: [PATCH 7/7] vhost-user-blk: Implement reconnection during realize

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:58PM +0200, Kevin Wolf wrote:
> Commit dabefdd6 removed code that was supposed to try reconnecting
> during .realize(), but actually just crashed and had several design
> problems.
> 
> This adds the feature back without the crash in simple cases while also
> fixing some design problems: Reconnection is now only tried if there was
> a problem with the connection and not an error related to the content
> (which would fail again the same way in the next attempt). Reconnection
> is limited to three attempts (four with the initial attempt) so that we
> won't end up in an infinite loop if a problem is permanent. If the
> backend restarts three times in the very short time window of device
> initialisation, we have bigger problems and erroring out is the right
> course of action.
> 
> In the case that a connection error occurs and we reconnect, the error
> message is printed using error_report_err(), but otherwise ignored.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  hw/block/vhost-user-blk.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index e49d2e4c83..f75a42bc62 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -455,8 +455,10 @@ static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
>  
>  static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  {
> +ERRP_GUARD();
>  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>  VHostUserBlk *s = VHOST_USER_BLK(vdev);
> +int retries;
>  int i, ret;
>  
>  if (!s->chardev.chr) {
> @@ -498,7 +500,17 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  s->inflight = g_new0(struct vhost_inflight, 1);
>  s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
>  
> -ret = vhost_user_blk_realize_connect(s, errp);
> +retries = 3;
> +assert(!*errp);
> +do {
> +if (*errp) {
> +error_prepend(errp, "Reconnecting after error: ");
> +error_report_err(*errp);
> +*errp = NULL;
> +}
> +ret = vhost_user_blk_realize_connect(s, errp);
> +} while (ret == -EPROTO && retries--);
> +
>  if (ret < 0) {
>  goto virtio_err;
>  }
> -- 
> 2.30.2
> 
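
One subtlety worth spelling out (my reading of the ERRP_GUARD() contract in
qapi/error.h, so treat this as a hedged sketch): dereferencing *errp in the
retry loop above is only valid because ERRP_GUARD() rewrites errp to point
at a function-local Error *, even when the caller passed NULL or
&error_fatal:

#include "qapi/error.h"

/* Sketch of the pattern the retry loop relies on; f() and g() are
 * hypothetical stand-ins. */
void f(Error **errp)
{
    ERRP_GUARD();                  /* errp is now always non-NULL */
    g(errp);
    if (*errp) {                   /* safe only after ERRP_GUARD() */
        error_report_err(*errp);   /* print and free the Error */
        *errp = NULL;              /* reset before the next attempt */
    }
}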


Re: [PATCH 6/7] vhost-user-blk: Factor out vhost_user_blk_realize_connect()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:57PM +0200, Kevin Wolf wrote:
> This function is the part that we will want to retry if the connection
> is lost during initialisation, so factor it out to keep the following
> patch simpler.
> 
> The error path for vhost_dev_get_config() forgot disconnecting the
> chardev, add this while touching the code.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  hw/block/vhost-user-blk.c | 48 ++-
>  1 file changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 3770f715da..e49d2e4c83 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -423,6 +423,36 @@ static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
>  }
>  }
>  
> +static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
> +{
> +DeviceState *dev = &s->parent_obj.parent_obj;
> +int ret;
> +
> +s->connected = false;
> +
> +ret = qemu_chr_fe_wait_connected(&s->chardev, errp);
> +if (ret < 0) {
> +return ret;
> +}
> +
> +ret = vhost_user_blk_connect(dev, errp);
> +if (ret < 0) {
> +qemu_chr_fe_disconnect(&s->chardev);
> +return ret;
> +}
> +assert(s->connected);
> +
> +ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
> +   sizeof(struct virtio_blk_config), errp);
> +if (ret < 0) {
> +qemu_chr_fe_disconnect(&s->chardev);
> +vhost_dev_cleanup(&s->dev);
> +return ret;
> +}
> +
> +return 0;
> +}
> +
>  static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  {
>  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -467,22 +497,10 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  
>  s->inflight = g_new0(struct vhost_inflight, 1);
>  s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
> -s->connected = false;
> -
> -if (qemu_chr_fe_wait_connected(&s->chardev, errp) < 0) {
> -goto virtio_err;
> -}
>  
> -if (vhost_user_blk_connect(dev, errp) < 0) {
> -qemu_chr_fe_disconnect(&s->chardev);
> -goto virtio_err;
> -}
> -assert(s->connected);
> -
> -ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
> -   sizeof(struct virtio_blk_config), errp);
> +ret = vhost_user_blk_realize_connect(s, errp);
>  if (ret < 0) {
> -goto vhost_err;
> +goto virtio_err;
>  }
>  
>  /* we're fully initialized, now we can operate, so add the handler */
> @@ -491,8 +509,6 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>   NULL, true);
>  return;
>  
> -vhost_err:
> -vhost_dev_cleanup(&s->dev);
>  virtio_err:
>  g_free(s->vhost_vqs);
>  s->vhost_vqs = NULL;
> -- 
> 2.30.2
> 


Re: [PATCH 5/7] vhost: Distinguish errors in vhost_dev_get_config()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:56PM +0200, Kevin Wolf wrote:
> Instead of just returning 0/-1 and letting the caller make up a
> meaningless error message, add an Error parameter to allow reporting the
> real error and switch to 0/-errno so that different kinds of errors can
> be distinguished in the caller.
> 
> Signed-off-by: Kevin Wolf 

Just one commit message suggestion.

Reviewed-by: Raphael Norwitz 

> ---
>  include/hw/virtio/vhost-backend.h |  2 +-
>  include/hw/virtio/vhost.h |  4 ++--
>  hw/block/vhost-user-blk.c |  9 +
>  hw/display/vhost-user-gpu.c   |  6 --
>  hw/input/vhost-user-input.c   |  6 --
>  hw/net/vhost_net.c|  2 +-
>  hw/virtio/vhost-user-vsock.c  |  9 +
>  hw/virtio/vhost-user.c| 24 
>  hw/virtio/vhost-vdpa.c|  2 +-
>  hw/virtio/vhost.c | 14 +++---
>  10 files changed, 46 insertions(+), 32 deletions(-)
> 
> diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
> index 728ebb0ed9..8475c5a29d 100644
> --- a/include/hw/virtio/vhost-backend.h
> +++ b/include/hw/virtio/vhost-backend.h
> @@ -98,7 +98,7 @@ typedef int (*vhost_set_config_op)(struct vhost_dev *dev, const uint8_t *data,
> uint32_t offset, uint32_t size,
> uint32_t flags);
>  typedef int (*vhost_get_config_op)(struct vhost_dev *dev, uint8_t *config,
> -   uint32_t config_len);
> +   uint32_t config_len, Error **errp);
>  
>  typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev,
>void *session_info,
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 2d7aaad67b..045d0fd9f2 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -130,8 +130,8 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
>struct vhost_vring_file *file);
>  
>  int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
> -int vhost_dev_get_config(struct vhost_dev *dev, uint8_t *config,
> - uint32_t config_len);
> +int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
> + uint32_t config_len, Error **errp);
>  int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
>   uint32_t offset, uint32_t size, uint32_t flags);
>  /* notifier callback in case vhost device config space changed
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index e9382e152a..3770f715da 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -91,11 +91,13 @@ static int vhost_user_blk_handle_config_change(struct vhost_dev *dev)
>  int ret;
>  struct virtio_blk_config blkcfg;
>  VHostUserBlk *s = VHOST_USER_BLK(dev->vdev);
> +Error *local_err = NULL;
>  
>  ret = vhost_dev_get_config(dev, (uint8_t *)&blkcfg,
> -   sizeof(struct virtio_blk_config));
> +   sizeof(struct virtio_blk_config),
> +   &local_err);
>  if (ret < 0) {
> -error_report("get config space failed");
> +error_report_err(local_err);
>  return -1;
>  }
>  
> @@ -478,9 +480,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  assert(s->connected);
>  
>  ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
> -   sizeof(struct virtio_blk_config));
> +   sizeof(struct virtio_blk_config), errp);
>  if (ret < 0) {
> -error_setg(errp, "vhost-user-blk: get block config failed");
>  goto vhost_err;
>  }
>  
> diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
> index 6cdaa1c73b..389199e6ca 100644
> --- a/hw/display/vhost-user-gpu.c
> +++ b/hw/display/vhost-user-gpu.c
> @@ -415,14 +415,16 @@ vhost_user_gpu_get_config(VirtIODevice *vdev, uint8_t *config_data)
>  VirtIOGPUBase *b = VIRTIO_GPU_BASE(vdev);
>  struct virtio_gpu_config *vgconfig =
>  (struct virtio_gpu_config *)config_data;
> +Error *local_err = NULL;
>  int ret;
>  
>  memset(config_data, 0, sizeof(struct virtio_gpu_config));
>  
>  ret = vhost_dev_get_config(&g->vhost->dev,
> -   config_data, sizeof(struct virtio_gpu_config));
> +   config_data, sizeof(struct virtio_gpu_config),
> +   &local_err);
>  if (ret) {
> -error_report("vhost-user-gpu: get device config space failed");
> +error_report_err(local_err);
>  return;
>  }
>  
> diff --git a/hw/input/vhost-user-input.c b/hw/input/vhost-user-input.c
> index 63984a8ba7..273e96a7b1 100644

Re: [PATCH 4/7] vhost-user-blk: Add Error parameter to vhost_user_blk_start()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:55PM +0200, Kevin Wolf wrote:
> Instead of letting the caller make up a meaningless error message, add
> an Error parameter to allow reporting the real error.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  hw/block/vhost-user-blk.c | 31 +++
>  1 file changed, 15 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 0cb56baefb..e9382e152a 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -113,7 +113,7 @@ const VhostDevConfigOps blk_ops = {
>  .vhost_dev_config_notifier = vhost_user_blk_handle_config_change,
>  };
>  
> -static int vhost_user_blk_start(VirtIODevice *vdev)
> +static int vhost_user_blk_start(VirtIODevice *vdev, Error **errp)
>  {
>  VHostUserBlk *s = VHOST_USER_BLK(vdev);
>  BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> @@ -121,19 +121,19 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
>  int i, ret;
>  
>  if (!k->set_guest_notifiers) {
> -error_report("binding does not support guest notifiers");
> +error_setg(errp, "binding does not support guest notifiers");
>  return -ENOSYS;
>  }
>  
>  ret = vhost_dev_enable_notifiers(&s->dev, vdev);
>  if (ret < 0) {
> -error_report("Error enabling host notifiers: %d", -ret);
> +error_setg_errno(errp, -ret, "Error enabling host notifiers");
>  return ret;
>  }
>  
>  ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
>  if (ret < 0) {
> -error_report("Error binding guest notifier: %d", -ret);
> +error_setg_errno(errp, -ret, "Error binding guest notifier");
>  goto err_host_notifiers;
>  }
>  
> @@ -141,27 +141,27 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
>  
>  ret = vhost_dev_prepare_inflight(&s->dev, vdev);
>  if (ret < 0) {
> -error_report("Error set inflight format: %d", -ret);
> +error_setg_errno(errp, -ret, "Error setting inflight format");
>  goto err_guest_notifiers;
>  }
>  
>  if (!s->inflight->addr) {
>  ret = vhost_dev_get_inflight(&s->dev, s->queue_size, s->inflight);
>  if (ret < 0) {
> -error_report("Error get inflight: %d", -ret);
> +error_setg_errno(errp, -ret, "Error getting inflight");
>  goto err_guest_notifiers;
>  }
>  }
>  
>  ret = vhost_dev_set_inflight(&s->dev, s->inflight);
>  if (ret < 0) {
> -error_report("Error set inflight: %d", -ret);
> +error_setg_errno(errp, -ret, "Error setting inflight");
>  goto err_guest_notifiers;
>  }
>  
>  ret = vhost_dev_start(&s->dev, vdev);
>  if (ret < 0) {
> -error_report("Error starting vhost: %d", -ret);
> +error_setg_errno(errp, -ret, "Error starting vhost");
>  goto err_guest_notifiers;
>  }
>  s->started_vu = true;
> @@ -214,6 +214,7 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>  VHostUserBlk *s = VHOST_USER_BLK(vdev);
>  bool should_start = virtio_device_started(vdev, status);
> +Error *local_err = NULL;
>  int ret;
>  
>  if (!vdev->vm_running) {
> @@ -229,10 +230,9 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
>  }
>  
>  if (should_start) {
> -ret = vhost_user_blk_start(vdev);
> +ret = vhost_user_blk_start(vdev, &local_err);
>  if (ret < 0) {
> -error_report("vhost-user-blk: vhost start failed: %s",
> - strerror(-ret));
> +error_reportf_err(local_err, "vhost-user-blk: vhost start failed: ");
>  qemu_chr_fe_disconnect(&s->chardev);
>  }
>  } else {
> @@ -270,6 +270,7 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev,
>  static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>  {
>  VHostUserBlk *s = VHOST_USER_BLK(vdev);
> +Error *local_err = NULL;
>  int i, ret;
>  
>  if (!vdev->start_on_kick) {
> @@ -287,10 +288,9 @@ static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
>  /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
>   * vhost here instead of waiting for .set_status().
>   */
> -ret = vhost_user_blk_start(vdev);
> +ret = vhost_user_blk_start(vdev, &local_err);
>  if (ret < 0) {
> -error_report("vhost-user-blk: vhost start failed: %s",
> - strerror(-ret));
> +error_reportf_err(local_err, "vhost-user-blk: vhost start failed: ");
>  qemu_chr_fe_disconnect(&s->chardev);
>  return;
>  }
> @@ -340,9 +340,8 @@ static int vhost_user_blk_connect(DeviceState *dev, Error **errp)
>  
>  /* restore vhost state */
>  if (virtio_device_started(vdev, vdev->status)) {
> -ret = vhost_user_b

Re: [PATCH 3/7] vhost: Return 0/-errno in vhost_dev_init()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:54PM +0200, Kevin Wolf wrote:
> Instead of just returning 0/-1 and letting the caller make up a
> meaningless error message, switch to 0/-errno so that different kinds of
> errors can be distinguished in the caller.
> 
> This involves changing a few more callbacks in VhostOps to return
> 0/-errno: .vhost_set_owner(), .vhost_get_features() and
> .vhost_virtqueue_set_busyloop_timeout(). The implementations of these
> functions are trivial as they generally just send a message to the
> backend.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  hw/virtio/vhost-backend.c |  4 +++-
>  hw/virtio/vhost-user.c| 10 +++---
>  hw/virtio/vhost-vdpa.c|  4 +++-
>  hw/virtio/vhost.c |  8 
>  4 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
> index f4f71cf58a..594d770b75 100644
> --- a/hw/virtio/vhost-backend.c
> +++ b/hw/virtio/vhost-backend.c
> @@ -24,10 +24,12 @@ static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
>   void *arg)
>  {
>  int fd = (uintptr_t) dev->opaque;
> +int ret;
>  
>  assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
>  
> -return ioctl(fd, request, arg);
> +ret = ioctl(fd, request, arg);
> +return ret < 0 ? -errno : ret;
>  }
>  
>  static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp)
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index 024cb201bb..889559d86a 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -1353,7 +1353,11 @@ static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
>  
>  static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
>  {
> -return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features);
> +if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) {
> +return -EPROTO;
> +}
> +
> +return 0;
>  }
>  
>  static int vhost_user_set_owner(struct vhost_dev *dev)
> @@ -1364,7 +1368,7 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
>  };
>  
>  if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
> -return -1;
> +return -EPROTO;
>  }
>  
>  return 0;
> @@ -1872,7 +1876,7 @@ static int vhost_user_backend_init(struct vhost_dev 
> *dev, void *opaque,
>  
>  err = vhost_user_get_features(dev, &features);
>  if (err < 0) {
> -return -EPROTO;
> +return err;
>  }
>  
>  if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index c2aadb57cb..71897c1a01 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -253,10 +253,12 @@ static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
>  {
>  struct vhost_vdpa *v = dev->opaque;
>  int fd = v->device_fd;
> +int ret;
>  
>  assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
>  
> -return ioctl(fd, request, arg);
> +ret = ioctl(fd, request, arg);
> +return ret < 0 ? -errno : ret;
>  }
>  
>  static void vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index fd13135706..c7f9d8bb06 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -1309,13 +1309,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>  
>  r = hdev->vhost_ops->vhost_set_owner(hdev);
>  if (r < 0) {
> -error_setg(errp, "vhost_set_owner failed");
> +error_setg_errno(errp, -r, "vhost_set_owner failed");
>  goto fail;
>  }
>  
>  r = hdev->vhost_ops->vhost_get_features(hdev, &features);
>  if (r < 0) {
> -error_setg(errp, "vhost_get_features failed");
> +error_setg_errno(errp, -r, "vhost_get_features failed");
>  goto fail;
>  }
>  
> @@ -1332,7 +1332,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>  r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
>   busyloop_timeout);
>  if (r < 0) {
> -error_setg(errp, "Failed to set busyloop timeout");
> +error_setg_errno(errp, -r, "Failed to set busyloop timeout");
>  goto fail_busyloop;
>  }
>  }
> @@ -1391,7 +1391,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>  if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
>  error_setg(errp, "vhost backend memory slots limit is less"
> " than current number of present memory slots");
> -r = -1;
> +r = -EINVAL;
>  goto fail_busyloop;
>  }
>  
> -- 
> 2.30.2
> 


Re: [PATCH 2/7] vhost: Distinguish errors in vhost_backend_init()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:53PM +0200, Kevin Wolf wrote:
> Instead of just returning 0/-1 and letting the caller make up a
> meaningless error message, add an Error parameter to allow reporting the
> real error and switch to 0/-errno so that different kinds of errors can
> be distinguished in the caller.
> 
> Specifically, in vhost-user, EPROTO is used for all errors that relate
> to the connection itself, whereas other error codes are used for errors
> relating to the content of the connection. This will allow us later to
> automatically reconnect when the connection goes away, without ending up
> in an endless loop if it's a permanent error in the configuration.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  include/hw/virtio/vhost-backend.h |  3 ++-
>  hw/virtio/vhost-backend.c |  2 +-
>  hw/virtio/vhost-user.c| 41 ---
>  hw/virtio/vhost-vdpa.c|  2 +-
>  hw/virtio/vhost.c | 13 +-
>  5 files changed, 32 insertions(+), 29 deletions(-)
> 
> diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
> index 8a6f8e2a7a..728ebb0ed9 100644
> --- a/include/hw/virtio/vhost-backend.h
> +++ b/include/hw/virtio/vhost-backend.h
> @@ -37,7 +37,8 @@ struct vhost_scsi_target;
>  struct vhost_iotlb_msg;
>  struct vhost_virtqueue;
>  
> -typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque);
> +typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque,
> +  Error **errp);
>  typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev);
>  typedef int (*vhost_backend_memslots_limit)(struct vhost_dev *dev);
>  
> diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
> index 31b33bde37..f4f71cf58a 100644
> --- a/hw/virtio/vhost-backend.c
> +++ b/hw/virtio/vhost-backend.c
> @@ -30,7 +30,7 @@ static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
>  return ioctl(fd, request, arg);
>  }
>  
> -static int vhost_kernel_init(struct vhost_dev *dev, void *opaque)
> +static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp)
>  {
>  assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
>  
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index ee57abe045..024cb201bb 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -1856,7 +1856,8 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
>  return 0;
>  }
>  
> -static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
> +static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
> +   Error **errp)
>  {
>  uint64_t features, protocol_features, ram_slots;
>  struct vhost_user *u;
> @@ -1871,7 +1872,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
>  
>  err = vhost_user_get_features(dev, &features);
>  if (err < 0) {
> -return err;
> +return -EPROTO;
>  }
>  
>  if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
> @@ -1880,7 +1881,7 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
>  err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
>   &protocol_features);
>  if (err < 0) {
> -return err;
> +return -EPROTO;
>  }
>  
>  dev->protocol_features =
> @@ -1891,14 +1892,14 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
>  dev->protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
>  } else if (!(protocol_features &
>  (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))) {
> -error_report("Device expects VHOST_USER_PROTOCOL_F_CONFIG "
> -"but backend does not support it.");
> -return -1;
> +error_setg(errp, "Device expects VHOST_USER_PROTOCOL_F_CONFIG "
> +   "but backend does not support it.");
> +return -EINVAL;
>  }
>  
>  err = vhost_user_set_protocol_features(dev, dev->protocol_features);
>  if (err < 0) {
> -return err;
> +return -EPROTO;
>  }
>  
>  /* query the max queues we support if backend supports Multiple Queue */
> @@ -1906,12 +1907,12 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque)
>  err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
>   &dev->max_queues);
>  if (err < 0) {
> -return err;
> +return -EPROTO;
>  }
>  }
>  if (dev->num_queues && dev->max_queues < dev->num_queues) {
> -error_report("The maximum number of queues supported by the "
> - "backe
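
The payoff of this EPROTO convention is the retry loop in patch 7/7 of this
series (quoted earlier in the thread); condensed:

int retries = 3;
int ret;

do {
    ret = vhost_user_blk_realize_connect(s, errp);
    /* -EPROTO means the connection itself failed and is worth
     * retrying; any other -errno is a content/configuration error
     * that would fail identically on every attempt. */
} while (ret == -EPROTO && retries--);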

Re: [PATCH 1/7] vhost: Add Error parameter to vhost_dev_init()

2021-06-11 Thread Raphael Norwitz
On Wed, Jun 09, 2021 at 05:46:52PM +0200, Kevin Wolf wrote:
> This allows callers to return better error messages instead of making
> one up while the real error ends up on stderr. Most callers can
> immediately make use of this because they already have an Error
> parameter themselves. The others just keep printing the error with
> error_report_err().
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Raphael Norwitz 

> ---
>  include/hw/virtio/vhost.h|  2 +-
>  backends/cryptodev-vhost.c   |  5 -
>  backends/vhost-user.c|  4 ++--
>  hw/block/vhost-user-blk.c|  4 ++--
>  hw/net/vhost_net.c   |  6 +-
>  hw/scsi/vhost-scsi.c |  4 +---
>  hw/scsi/vhost-user-scsi.c|  4 +---
>  hw/virtio/vhost-user-fs.c|  3 +--
>  hw/virtio/vhost-user-vsock.c |  3 +--
>  hw/virtio/vhost-vsock.c  |  3 +--
>  hw/virtio/vhost.c| 16 ++--
>  11 files changed, 29 insertions(+), 25 deletions(-)
> 
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 21a9a52088..2d7aaad67b 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -104,7 +104,7 @@ struct vhost_net {
>  
>  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
> VhostBackendType backend_type,
> -   uint32_t busyloop_timeout);
> +   uint32_t busyloop_timeout, Error **errp);
>  void vhost_dev_cleanup(struct vhost_dev *hdev);
>  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
>  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
> diff --git a/backends/cryptodev-vhost.c b/backends/cryptodev-vhost.c
> index 8231e7f1bc..bc13e466b4 100644
> --- a/backends/cryptodev-vhost.c
> +++ b/backends/cryptodev-vhost.c
> @@ -52,6 +52,7 @@ cryptodev_vhost_init(
>  {
>  int r;
>  CryptoDevBackendVhost *crypto;
> +Error *local_err = NULL;
>  
>  crypto = g_new(CryptoDevBackendVhost, 1);
>  crypto->dev.max_queues = 1;
> @@ -66,8 +67,10 @@ cryptodev_vhost_init(
>  /* vhost-user needs vq_index to initiate a specific queue pair */
>  crypto->dev.vq_index = crypto->cc->queue_index * crypto->dev.nvqs;
>  
> -r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, 0);
> +r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, 
> 0,
> +   &local_err);
>  if (r < 0) {
> +error_report_err(local_err);
>  goto fail;
>  }
>  
> diff --git a/backends/vhost-user.c b/backends/vhost-user.c
> index b366610e16..10b39992d2 100644
> --- a/backends/vhost-user.c
> +++ b/backends/vhost-user.c
> @@ -48,9 +48,9 @@ vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice *vdev,
>  b->dev.nvqs = nvqs;
>  b->dev.vqs = g_new0(struct vhost_virtqueue, nvqs);
>  
> -ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
> +ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
> + errp);
>  if (ret < 0) {
> -error_setg_errno(errp, -ret, "vhost initialization failed");
>  return -1;
>  }
>  
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index c6210fad0c..0cb56baefb 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -332,9 +332,9 @@ static int vhost_user_blk_connect(DeviceState *dev, Error **errp)
>  
>  vhost_dev_set_config_notifier(&s->dev, &blk_ops);
>  
> -ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
> +ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0,
> + errp);
>  if (ret < 0) {
> -error_setg_errno(errp, -ret, "vhost initialization failed");
>  return ret;
>  }
>  
> diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
> index 44c1ed92dc..447b119f85 100644
> --- a/hw/net/vhost_net.c
> +++ b/hw/net/vhost_net.c
> @@ -22,6 +22,7 @@
>  #include "standard-headers/linux/vhost_types.h"
>  #include "hw/virtio/virtio-net.h"
>  #include "net/vhost_net.h"
> +#include "qapi/error.h"
>  #include "qemu/error-report.h"
>  #include "qemu/main-loop.h"
>  
> @@ -157,6 +158,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
>  bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
>  struct vhost_net *net = g_new0(struct vhost_net, 1);
>  uint64_t features = 0;
> +Error *local_err = NULL;
>  
>  if (!options->net_backend) {
>  fprintf(stderr, "vhost-net requires net backend to be setup\n");
> @@ -187,8 +189,10 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options)
>  }
>  
>  r = vhost_dev_init(&net->dev, options->opaque,
> -   options->backend_type, options->busyloop_timeout);
> +   options->backend_type, options->busyloop_timeout,
> +   &local_err);
>  if (r < 0) {
> +error_report_err(local_

[PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments

2021-06-11 Thread Eric Blake
To save the user from having to check 'qemu-img info --backing-chain'
or another follow-up command to determine which "depth":n goes beyond the
chain, add a boolean field "backing" that is set only for unallocated
portions of the disk.

Signed-off-by: Eric Blake 
---

Touches the same iotest output as 1/1.  If we decide that switching to
"depth":n+1 is too risky, and that the mere addition of "backing":true
while keeping "depth":n is good enough, then we'd have just one patch,
instead of this double churn.  Preferences?

 docs/tools/qemu-img.rst|  3 ++
 qapi/block-core.json   |  7 ++-
 qemu-img.c | 15 +-
 tests/qemu-iotests/122.out | 34 +++---
 tests/qemu-iotests/154.out | 96 +++---
 tests/qemu-iotests/179.out | 66 +-
 tests/qemu-iotests/223.out | 24 +-
 tests/qemu-iotests/244.out |  6 +--
 tests/qemu-iotests/252.out |  4 +-
 tests/qemu-iotests/274.out | 16 +++
 tests/qemu-iotests/291.out |  8 ++--
 tests/qemu-iotests/309.out |  4 +-
 12 files changed, 150 insertions(+), 133 deletions(-)

diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index c155b1bf3cc8..fbc623b645c3 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -601,6 +601,9 @@ Command description:
 a ``depth``; for example, a depth of 2 refers to the backing file
 of the backing file of *FILENAME*.  Depth will be one larger than
 the chain length if no file in the chain provides the data.
+  - an optional ``backing`` field is present with value true if no
+file in the backing chain provides the data (making it easier to
+identify when ``depth`` exceeds the chain length).

   In JSON format, the ``offset`` field is optional; it is absent in
   cases where ``human`` format would omit the entry or exit with an error.
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2ea294129e08..cebe12ba16a0 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -264,6 +264,9 @@
 # @offset: if present, the image file stores the data for this range
 #  in raw format at the given (host) offset
 #
+# @backing: if present, the range is not allocated within the backing
+#   chain (since 6.1)
+#
 # @filename: filename that is referred to by @offset
 #
 # Since: 2.6
@@ -271,8 +274,8 @@
 ##
 { 'struct': 'MapEntry',
   'data': {'start': 'int', 'length': 'int', 'data': 'bool',
-   'zero': 'bool', 'depth': 'int', '*offset': 'int',
-   '*filename': 'str' } }
+   'zero': 'bool', 'depth': 'int', '*backing': 'bool',
+   '*offset': 'int', '*filename': 'str' } }

 ##
 # @BlockdevCacheInfo:
diff --git a/qemu-img.c b/qemu-img.c
index 33a5cd012b8b..4d357f534803 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2977,8 +2977,13 @@ static int dump_map_entry(OutputFormat output_format, MapEntry *e,
 break;
 case OFORMAT_JSON:
 printf("{ \"start\": %"PRId64", \"length\": %"PRId64","
-   " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
-   e->start, e->length, e->depth,
+   " \"depth\": %"PRId64, e->start, e->length, e->depth);
+if (e->has_backing) {
+/* Backing should only be set at the end of the chain */
+assert(e->backing && e->depth > 0);
+printf(", \"backing\": true");
+}
+printf(", \"zero\": %s, \"data\": %s",
e->zero ? "true" : "false",
e->data ? "true" : "false");
 if (e->has_offset) {
@@ -2999,6 +3004,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset,
 {
 int ret;
 int depth;
+bool backing = false;
 BlockDriverState *file;
 bool has_offset;
 int64_t map;
@@ -3037,6 +3043,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset,
 }
 if (!(ret & BDRV_BLOCK_ALLOCATED)) {
 depth++;
+backing = true;
 }

 *e = (MapEntry) {
@@ -3047,6 +3054,8 @@ static int get_block_status(BlockDriverState *bs, int64_t offset,
 .offset = map,
 .has_offset = has_offset,
 .depth = depth,
+.has_backing = backing,
+.backing = backing,
 .has_filename = filename,
 .filename = filename,
 };
@@ -3072,6 +3081,8 @@ static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
 if (curr->has_offset && curr->offset + curr->length != next->offset) {
 return false;
 }
+/* backing should only ever be set for identical depth */
+assert(curr->backing == next->backing);
 return true;
 }

diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index 779dab4847f0..c5aa2c9866f1 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -68,11 +68,11 @@ read 65536/65536 bytes at offset 4194304
 read 65536/65536 bytes at offset 8388608
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 65536, "depth"

Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Eric Blake
On Fri, Jun 11, 2021 at 08:35:01PM +0300, Nir Soffer wrote:
> On Fri, Jun 11, 2021 at 4:28 PM Eric Blake  wrote:
> >
> > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote:
> > > > Yes, that might work as well.  But we didn't previously document
> > > > depth to be optional.  Removing something from output risks breaking
> > > > more downstream tools that expect it to be non-optional, compared to
> > > > providing a new value.
> > >
> > > A negative value isn't any less unexpected than a missing key. I don't
> > > think any existing tool would be able to handle it. Encoding different
> > > meanings in a single value isn't very QAPI-like either. Usually strings
> > > that are parsed are the problem, but negative integers really isn't that
> > > much different. I don't really like this solution.
> > >
> > > Leaving out the depth feels like a better suggestion to me.
> > >
> > > But anyway, this seems to only happen at the end of the backing chain.
> > > So if the backing chain consists of n images, why not report 'depth':
> > > n + 1? So, in the above example, you would get 1. I think this has the
> > > best chances of tools actually working correctly with the new output,
> > > even though it's still not unlikely to break something.
> >
> > Ooh, I like that.  It is closer to reality - the file data really
> > comes from the next depth, even if we have no filename at that depth.
> > v2 of my patch coming up.
> 
> How do you know the number of the layer? this info is not presented in
> qemu-img map output.

qemu-img map has two output formats.

In --output=human, areas of the disk reading as zero are elided (and
this happens to include ALL areas that were not allocated anywhere in
the chain); all other areas list the filename of the element in the
chain where the data was found.  This mode also fails if compression
or encryption prevents easy access to actual data.  In other words,
it's fragile, so no one uses it for anything programmatic, even though
it's the default.

In --output=json, no file names are output.  Instead, "depth":N tells
you how deep in the backing chain you must traverse to find the data.
"depth":0 is obvious: the file you mapped (other than the bug that
this patch is fixing where we mistakenly used "depth":0 also for
unallocated regions).  If you use "backing":null to force a 1-layer
depth, then "depth":1 unambiguously means the (non-present) backing
file.

Otherwise, you do have a point: "depth":1 in isolation is ambiguous
between "not allocated anywhere in this 1-element chain" and
"allocated at the first backing file in this chain of length 2 or
more".  At which point you can indeed use "qemu-img info" to determine
the backing chain depth.  How painful is that extra step?  Does it
justify the addition of a new optional "backing":true to any portion
of the file that was beyond the end of the chain (and omit that line
for all other regions, rather than printing "backing":false)?

> 
> Users will have to run "qemu-img info --backing-chain" to understand the
> output of qemu-img map.

At any rate, it should be easy enough to output an additional field,
followup patch coming soon...

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org
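
A hypothetical illustration of that ambiguity (made-up values): under the
'depth = n + 1 means unallocated' scheme, the same entry reads differently
depending on the chain length, and only qemu-img info reveals which case
applies:

# chain of two images: data allocated in the first backing file
{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, "offset": 327680}

# chain of one image: data allocated nowhere (depth = chain length + 1)
{ "start": 0, "length": 65536, "depth": 1, "zero": true, "data": false}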




Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map

2021-06-11 Thread Nir Soffer
On Fri, Jun 11, 2021 at 5:59 PM Eric Blake  wrote:
>
> On Fri, Jun 11, 2021 at 05:35:12PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > An obvious solution is to make 'qemu-img map --output=json'
> > > distinguish between clusters that have a local allocation from those
> > > that are found nowhere in the chain.  We already have a one-off
> > > mismatch between qemu-img map and NBD qemu:allocation-depth (the
> > > former chose 0, and the latter 1 for the local layer), so exposing the
> > > latter's choice of 0 for unallocated in the entire chain would mean
> > > using using "depth":-1 in the former, but a negative depth may confuse
> > > existing tools.  But there is an easy out: for any chain of length N,
> > > we can simply represent an unallocated cluster as "depth":N+1.  This
> > > does have a slight risk of confusing any tool that might try to
> > > dereference NULL when finding the backing image for the last file in
> > > the backing chain, but that risk seems worth the more precise output.
> > > The iotests have several examples where this distinction demonstrates
> > > the additional accuracy.
> > >
> > > Signed-off-by: Eric Blake 
> > > ---
> > >
> > > Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com
> > > (qemu-img: Use "depth":-1 to make backing probes obvious)
> > >
> > > Use N+1 instead of -1 for unallocated [Kevin]
> > >
> >
> > But in contrast with -1, or with a separate boolean flag, you lose the
> > possibility to distinguish the case when we have 3 layers and the cluster is
> > absent in all of them, and the case when we have 4 layers and the cluster
> > is absent in the top 3 but in the 4th it is a qcow2 UNALLOCATED_ZERO cluster.
>
> Using just 'qemu-img map --output-json', you only see depth numbers.
> You also have to use 'qemu-img info --backing-chain' to see what file
> those depth numbers correspond to, at which point it becomes obvious
> whether "depth":4 meant unallocated (because the chain was length 3)
> or allocated at depth 4 (because the chain was length 4 or longer).
> But that's no worse than pre-patch, where you had to use qemu-img info
> --backing-chain to learn which file a particular "depth" maps to.
>
> >
> > So, if someone uses this API to reconstruct the chain, then for the original
> > 3 empty layers he will create 3 empty layers and a 4th additional ZERO layer.
> > And such a reconstructed chain would not be equal to the original chain (if we
> > take these two chains and add an additional backing file as a new bottom
> > layer, the effect would be different). I'm not sure if it is a problem in the
> > task you are solving :\
>
> It should be fairly easy to optimize the case of a backing chain where
> EVERY listed cluster at the final depth was "data":false,"zero":true
> to omit that file after all.
>
> And in oVirt's case, Nir pointed out that we have one more tool at our
> disposal in recreating a backing chain: if you use
> json:{"driver":"qcow2", "backing":null, ...} as your image file, you
> don't have to worry about arbitrary files in the backing chain, only
> about recreating the top-most layer of a chain.  And in that case, it
> becomes very obvious that "depth":0 is something you must recreate,
> and "depth":1 would be a non-existent backing file because you just
> passed "backing":null.

Note that oVirt does not use qemu-img map, we use qemu-nbd to get
image extents, since it is used only in context we already connect to
qemu-nbd server or run qemu-nbd.

Management tools already know the image format (they should avoid
doing format probing anyway), and using a json uri allows a single command
to get the needed info when you inspect a single layer.

But this change introduces a risk that some program using qemu-img map
will interpret the result in the wrong way, assuming that there are N+1 layers.

I think adding a new flag for absent extents is better. It cannot break
any user and it is easier to understand and use.

Nir




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Nir Soffer
On Fri, Jun 11, 2021 at 4:28 PM Eric Blake  wrote:
>
> On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote:
> > > Yes, that might work as well.  But we didn't previously document
> > > depth to be optional.  Removing something from output risks breaking
> > > more downstream tools that expect it to be non-optional, compared to
> > > providing a new value.
> >
> > A negative value isn't any less unexpected than a missing key. I don't
> > think any existing tool would be able to handle it. Encoding different
> > meanings in a single value isn't very QAPI-like either. Usually strings
> > that are parsed are the problem, but negative integers really isn't that
> > much different. I don't really like this solution.
> >
> > Leaving out the depth feels like a better suggestion to me.
> >
> > But anyway, this seems to only happen at the end of the backing chain.
> > So if the backing chain consists of n images, why not report 'depth':
> > n + 1? So, in the above example, you would get 1. I think this has the
> > best chances of tools actually working correctly with the new output,
> > even though it's still not unlikely to break something.
>
> Ooh, I like that.  It is closer to reality - the file data really
> comes from the next depth, even if we have no filename at that depth.
> v2 of my patch coming up.

How do you know the number of layers? This info is not present in the
qemu-img map output.

Users will have to run "qemu-img info --backing-chain" to understand the
output of qemu-img map.
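
For example, with a hypothetical two-image chain (file names invented),
the two outputs have to be correlated; under the N+1 scheme, a "depth": 2
entry would then mean allocated nowhere in the chain:

  $ qemu-img info --backing-chain top.qcow2 | grep '^image:'
  image: top.qcow2
  image: base.raw
  $ qemu-img map --output=json -f qcow2 top.qcow2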




Re: [PATCH v4 00/32] block/nbd: rework client connection

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 18:55, Eric Blake wrote:

On Thu, Jun 10, 2021 at 01:07:30PM +0300, Vladimir Sementsov-Ogievskiy wrote:

v4:

Now based on Paolo's new patch:
Based-on: <20210609122234.544153-1-pbonz...@redhat.com>

Also, I've dropped patch 33 for now, it's too much for this series.
I'll resend it later on top of this.

The series is also available at tag up-nbd-client-connection-v4 in
git https://src.openvz.org/scm/~vsementsov/qemu.git


I think everything has R-b now, so I'll queue this through my NBD tree
including folding in the grammar tweaks where I spotted them.



Thanks a lot!

--
Best regards,
Vladimir



Re: [PATCH 3/7] net/rocker: use GDateTime for formatting timestamp in debug messages

2021-06-11 Thread Daniel P . Berrangé
ping: anyone willing to give a review of this one?

On Wed, May 05, 2021 at 11:36:58AM +0100, Daniel P. Berrangé wrote:
> The GDateTime APIs provided by GLib avoid portability pitfalls, such
> as some platforms where 'struct timeval.tv_sec' field is still 'long'
> instead of 'time_t'. When combined with automatic cleanup, GDateTime
> often results in simpler code too.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  hw/net/rocker/rocker.h | 11 +++
>  1 file changed, 3 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/net/rocker/rocker.h b/hw/net/rocker/rocker.h
> index 941c932265..412fa44d01 100644
> --- a/hw/net/rocker/rocker.h
> +++ b/hw/net/rocker/rocker.h
> @@ -25,14 +25,9 @@
>  #if defined(DEBUG_ROCKER)
>  #  define DPRINTF(fmt, ...) \
>  do {   \
> -struct timeval tv; \
> -char timestr[64];  \
> -time_t now;\
> -gettimeofday(&tv, NULL);   \
> -now = tv.tv_sec;   \
> -strftime(timestr, sizeof(timestr), "%T", localtime(&now)); \
> -fprintf(stderr, "%s.%06ld ", timestr, tv.tv_usec); \
> -fprintf(stderr, "ROCKER: " fmt, ## __VA_ARGS__);   \
> +g_autoptr(GDateTime) now = g_date_time_new_now_local();\
> +g_autofree char *nowstr = g_date_time_format(now, "%T.%f");\
> +fprintf(stderr, "%s ROCKER: " fmt, nowstr, ## __VA_ARGS__);\
>  } while (0)
>  #else
>  static inline GCC_FMT_ATTR(1, 2) int DPRINTF(const char *fmt, ...)
> -- 
> 2.31.1
> 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 5/7] io: use GDateTime for formatting timestamp for websock headers

2021-06-11 Thread Daniel P . Berrangé
ping: anyone willing to review this?

On Wed, May 05, 2021 at 11:37:00AM +0100, Daniel P. Berrangé wrote:
> The GDateTime APIs provided by GLib avoid portability pitfalls, such
> as some platforms where 'struct timeval.tv_sec' field is still 'long'
> instead of 'time_t'. When combined with automatic cleanup, GDateTime
> often results in simpler code too.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  io/channel-websock.c | 10 ++
>  1 file changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/io/channel-websock.c b/io/channel-websock.c
> index 03c1f7cb62..70889bb54d 100644
> --- a/io/channel-websock.c
> +++ b/io/channel-websock.c
> @@ -177,15 +177,9 @@ qio_channel_websock_handshake_send_res(QIOChannelWebsock 
> *ioc,
>  
>  static gchar *qio_channel_websock_date_str(void)
>  {
> -struct tm tm;
> -time_t now = time(NULL);
> -char datebuf[128];
> +g_autoptr(GDateTime) now = g_date_time_new_now_utc();
>  
> -gmtime_r(&now, &tm);
> -
> -strftime(datebuf, sizeof(datebuf), "%a, %d %b %Y %H:%M:%S GMT", &tm);
> -
> -return g_strdup(datebuf);
> +return g_date_time_format(now, "%a, %d %b %Y %H:%M:%S GMT");
>  }
>  
>  static void qio_channel_websock_handshake_send_res_err(QIOChannelWebsock 
> *ioc,
> -- 
> 2.31.1
> 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-11 Thread Eric Blake
On Fri, Jun 11, 2021 at 01:14:27PM +0200, Paolo Bonzini wrote:
> > > With this change, qemu_mutex_iothread_locked() must be changed from
> > 
> > Did you miss "qemu_mutex_iothread_locked() stub function"?
> 
> Yes, that comment refers to the stub.  Let me resubmit with a testcase and
> I'll fix that too.

Vladimir's v4 series reworking nbd connection depends on this one.
I'll wait for your v3 to land, but once it does, I'm happy to queue it
through my NBD tree along with Vladimir's work, if that makes it
easier.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 00/32] block/nbd: rework client connection

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:30PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> v4:
> 
> Now based on Paolo's new patch:
> Based-on: <20210609122234.544153-1-pbonz...@redhat.com>
> 
> Also, I've dropped patch 33 for now, it's too much for this series.
> I'll resend it later on top of this.
> 
> The series is also available at tag up-nbd-client-connection-v4 in
> git https://src.openvz.org/scm/~vsementsov/qemu.git

I think everything has R-b now, so I'll queue this through my NBD tree
including folding in the grammar tweaks where I spotted them.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 28/32] block/nbd: split nbd_co_do_establish_connection out of nbd_reconnect_attempt

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:58PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Split out the part that we want to reuse for nbd_open().
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 80 -
>  1 file changed, 42 insertions(+), 38 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 21/32] nbd/client-connection: shutdown connection on release

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:51PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Now, when a thread can do negotiation and retry, it may run relatively
> long. We need a mechanism to stop it when the user is no longer
> interested in the result. So, on nbd_client_connection_release() let's
> shut down the socket, and do not retry the connection if the thread is
> detached.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  nbd/client-connection.c | 20 +---
>  1 file changed, 17 insertions(+), 3 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 20/32] nbd/client-connection: implement connection retry

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:50PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Add an option for a thread to retry connection until succeeds. We'll

for a thread to retry connecting until it succeeds.

> use nbd/client-connection both for reconnect and for the initial
> connection in nbd_open(), so we need to be able to use the same
> NBDClientConnection instance to connect once in nbd_open() and then use
> retry semantics for reconnect.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/nbd.h |  2 ++
>  nbd/client-connection.c | 56 +++--
>  2 files changed, 45 insertions(+), 13 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 19/32] nbd/client-connection: add possibility of negotiation

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:49PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Add arguments and logic to support nbd negotiation in the same thread
> after successful connection.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/nbd.h |   9 +++-
>  block/nbd.c |   4 +-
>  nbd/client-connection.c | 105 ++--
>  3 files changed, 109 insertions(+), 9 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map

2021-06-11 Thread Eric Blake
On Fri, Jun 11, 2021 at 05:35:12PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > An obvious solution is to make 'qemu-img map --output=json'
> > distinguish clusters that have a local allocation from those
> > that are found nowhere in the chain.  We already have a one-off
> > mismatch between qemu-img map and NBD qemu:allocation-depth (the
> > former chose 0, and the latter 1 for the local layer), so exposing the
> > latter's choice of 0 for unallocated in the entire chain would mean
> > using "depth":-1 in the former, but a negative depth may confuse
> > existing tools.  But there is an easy out: for any chain of length N,
> > we can simply represent an unallocated cluster as "depth":N+1.  This
> > does have a slight risk of confusing any tool that might try to
> > dereference NULL when finding the backing image for the last file in
> > the backing chain, but that risk seems worth the more precise output.
> > The iotests have several examples where this distinction demonstrates
> > the additional accuracy.
> > 
> > Signed-off-by: Eric Blake 
> > ---
> > 
> > Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com
> > (qemu-img: Use "depth":-1 to make backing probes obvious)
> > 
> > Use N+1 instead of -1 for unallocated [Kevin]
> > 
> 
> But in contrast with -1, or with a separate boolean flag, you lose the 
> possibility to distinguish the case where we have 3 layers and the cluster 
> is absent in all of them from the case where we have 4 layers and the 
> cluster is absent in the top 3 but in the 4th it is a qcow2 
> UNALLOCATED_ZERO cluster.

Using just 'qemu-img map --output=json', you only see depth numbers.
You also have to use 'qemu-img info --backing-chain' to see what file
those depth numbers correspond to, at which point it becomes obvious
whether "depth":4 meant unallocated (because the chain was length 3)
or allocated at depth 4 (because the chain was length 4 or longer).
But that's no worse than pre-patch, where you had to use qemu-img info
--backing-chain to learn which file a particular "depth" maps to.

> 
> So, if someone uses this API to reconstruct the chain, then for the 
> original 3 empty layers he will create 3 empty layers plus a 4th 
> additional ZERO layer. And such a reconstructed chain would not be equal 
> to the original chain (if we take these two chains and add an additional 
> backing file as a new bottom layer, the effect would be different). I'm 
> not sure whether that is a problem in the task you are solving :\

It should be fairly easy to optimize the case of a backing chain where
EVERY listed cluster at the final depth was "data":false,"zero":true
to omit that file after all.
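
As a rough illustration, such a check could be scripted with jq (assuming
a two-image chain, so depth 2 is the final-plus-one value; this is only a
sketch, not part of the patch):

  $ qemu-img map --output=json -f qcow2 top.qcow2 \
      | jq '[.[] | select(.depth == 2)] | all(.data == false and .zero == true)'
  true

(Note that all() on an empty array is true, so a real tool would also want
to check that such entries exist at all.)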

And in oVirt's case, Nir pointed out that we have one more tool at our
disposal in recreating a backing chain: if you use
json:{"driver":"qcow2", "backing":null, ...} as your image file, you
don't have to worry about arbitrary files in the backing chain, only
about recreating the top-most layer of a chain.  And in that case, it
becomes very obvious that "depth":0 is something you must recreate,
and "depth":1 would be a non-existent backing file because you just
passed "backing":null.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 17:01, Eric Blake wrote:

The recently-added NBD context qemu:allocation-depth is able to
distinguish between locally-present data (even when that data is
sparse) [shown as depth 1 over NBD], and data that could not be found
anywhere in the backing chain [shown as depth 0].  But qemu-img map
--output=json predates that addition, and prior to this patch has the
unfortunate behavior that all portions of the backing chain that
resolve without finding a hit in any backing layer report the same
depth as the final backing layer.  This makes it harder to reconstruct
a qcow2 backing chain using just 'qemu-img map' output, especially
when using "backing":null to artificially limit a backing chain,
because it is impossible to distinguish between a
QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file)
and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any
backing file), since both types of clusters otherwise show as
"data":false,"zero":true" (but note that we can distinguish a
QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset":
listing).

The task of reconstructing a qcow2 chain was made harder in commit
0da9856851 (nbd: server: Report holes for raw images), because prior
to that point, it was possible to abuse NBD's block status command to
see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED
(showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain
(showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports
more accurate sparseness information over NBD.

An obvious solution is to make 'qemu-img map --output=json'
distinguish clusters that have a local allocation from those
that are found nowhere in the chain.  We already have a one-off
mismatch between qemu-img map and NBD qemu:allocation-depth (the
former chose 0, and the latter 1 for the local layer), so exposing the
latter's choice of 0 for unallocated in the entire chain would mean
using "depth":-1 in the former, but a negative depth may confuse
existing tools.  But there is an easy out: for any chain of length N,
we can simply represent an unallocated cluster as "depth":N+1.  This
does have a slight risk of confusing any tool that might try to
dereference NULL when finding the backing image for the last file in
the backing chain, but that risk seems worth the more precise output.
The iotests have several examples where this distinction demonstrates
the additional accuracy.

Signed-off-by: Eric Blake 
---

Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com
(qemu-img: Use "depth":-1 to make backing probes obvious)

Use N+1 instead of -1 for unallocated [Kevin]



But in contrast with -1, or with a separate boolean flag, you lose the 
possibility to distinguish the case where we have 3 layers and the cluster is 
absent in all of them from the case where we have 4 layers and the cluster is 
absent in the top 3 but in the 4th it is a qcow2 UNALLOCATED_ZERO cluster.

So, if someone uses this API to reconstruct the chain, then for the original 3 
empty layers he will create 3 empty layers plus a 4th additional ZERO layer. 
And such a reconstructed chain would not be equal to the original chain (if we 
take these two chains and add an additional backing file as a new bottom 
layer, the effect would be different). I'm not sure whether that is a problem 
in the task you are solving :\


--
Best regards,
Vladimir



Re: [PATCH v4 18/32] nbd/client-connection: use QEMU_LOCK_GUARD

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:48PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> We don't update connect_thread_func() to use QEMU_LOCK_GUARD, as it
> will get more complex critical-section logic in a further commit, where
> QEMU_LOCK_GUARD doesn't help.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  nbd/client-connection.c | 99 +++--
>  1 file changed, 45 insertions(+), 54 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 16/32] block/nbd: introduce nbd_client_connection_release()

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:46PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> This is the last step of creating a bs-independent nbd connection
> interface. With the next commit we can finally move it to a separate file.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 45 +++--
>  1 file changed, 27 insertions(+), 18 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 11/32] block/nbd: drop thr->state

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:41PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> We don't need all these states. The code refactored to use two boolean
> variables looks simpler.
> 
> While moving the comment in nbd_co_establish_connection(), rework it to
> give better information. Also, we are going to move the connection code
> to a separate file, and mentioning the drained section would be confusing.
> 
> Also improve the comment in NBDConnectThread, dropping the removed
> state names from it.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 139 +---
>  1 file changed, 44 insertions(+), 95 deletions(-)
> 

>  typedef struct NBDConnectThread {
>  /* Initialization constants */
>  SocketAddress *saddr; /* address to connect to */
>  
> +QemuMutex mutex;
> +
>  /*
> - * Result of last attempt. Valid in FAIL and SUCCESS states.
> - * If you want to steal error, don't forget to set pointer to NULL.
> + * @sioc and @err present a result of connection attempt.
> + * While running is true, they are used only by thread, mutex locking is 
> not
> + * needed. When thread is finished nbd_co_establish_connection steal 
> these
> + * pointers under mutex.

@sioc and @err represent a connection attempt.  While running is true,
they are only used by the connection thread, and mutex locking is not
needed.  Once the thread finishes, nbd_co_establish_connection then
steals these pointers under mutex.

>   */
>  QIOChannelSocket *sioc;
>  Error *err;
>  
> -QemuMutex mutex;
> -/* All further fields are protected by mutex */
> -NBDConnectThreadState state; /* current state of the thread */
> +/* All further fields are accessed only under mutex */
> +bool running; /* thread is running now */
> +bool detached; /* thread is detached and should cleanup the state */
>

Okay, I'm understanding this better than I did in v3.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 05/32] qemu-sockets: introduce socket_address_parse_named_fd()

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 16:22, Eric Blake wrote:

On Thu, Jun 10, 2021 at 01:07:35PM +0300, Vladimir Sementsov-Ogievskiy wrote:

Add a function that transforms a named fd inside a SocketAddress structure
into its number representation. This way it may then be used in a context
where the current monitor is not available.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  include/qemu/sockets.h | 14 ++
  util/qemu-sockets.c| 19 +++
  2 files changed, 33 insertions(+)

diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index 7d1f813576..1f4f18a44a 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -111,4 +111,18 @@ SocketAddress *socket_remote_address(int fd, Error **errp);
   */
  SocketAddress *socket_address_flatten(SocketAddressLegacy *addr);
  
+/**

+ * socket_address_parse_named_fd:
+ *
+ * Modify @addr, replacing named fd by corresponding number.
+ *
+ * Parsing named fd (by sockget_get_fd) is not possible in context where
+ * current monitor is not available. So, SocketAddress user may first call
+ * socket_parse_named_fd() to parse named fd in advance, and then pass @addr to
+ * the context where monitor is not available.


2 different wrong function names, and reads awkwardly.  How about this
shorter variant:

Modify @addr, replacing a named fd by its corresponding number.
Needed for callers that plan to pass @addr to a context where the
current monitor is not available.


A lot better, thanks!




+ *
+ * Return 0 on success.
+ */
+int socket_address_parse_named_fd(SocketAddress *addr, Error **errp);
+
  #endif /* QEMU_SOCKETS_H */
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c


But the code looks good.


Somehow, C is simpler for me than English )



Reviewed-by: Eric Blake 




--
Best regards,
Vladimir



Re: [PATCH v4 10/32] block/nbd: simplify waking of nbd_co_establish_connection()

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:40PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Instead of managing connect_bh, bh_ctx, and wait_connect fields, we
> can use a single link to the waiting coroutine with proper mutex
> protection.
> 
...
> Also, this commit reduces the dependence of
> nbd_co_establish_connection() on the internals of bs (we now use a
> generic pointer to the coroutine, instead of direct use of
> s->connection_co).  This is a step towards splitting the connection
> API out of nbd.c.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 55 +++--
>  1 file changed, 15 insertions(+), 40 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




[PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map

2021-06-11 Thread Eric Blake
The recently-added NBD context qemu:allocation-depth is able to
distinguish between locally-present data (even when that data is
sparse) [shown as depth 1 over NBD], and data that could not be found
anywhere in the backing chain [shown as depth 0].  But qemu-img map
--output=json predates that addition, and prior to this patch has the
unfortunate behavior that all portions of the backing chain that
resolve without finding a hit in any backing layer report the same
depth as the final backing layer.  This makes it harder to reconstruct
a qcow2 backing chain using just 'qemu-img map' output, especially
when using "backing":null to artificially limit a backing chain,
because it is impossible to distinguish between a
QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file)
and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any
backing file), since both types of clusters otherwise show as
"data":false,"zero":true" (but note that we can distinguish a
QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset":
listing).

The task of reconstructing a qcow2 chain was made harder in commit
0da9856851 (nbd: server: Report holes for raw images), because prior
to that point, it was possible to abuse NBD's block status command to
see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED
(showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain
(showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports
more accurate sparseness information over NBD.

An obvious solution is to make 'qemu-img map --output=json'
distinguish clusters that have a local allocation from those
that are found nowhere in the chain.  We already have a one-off
mismatch between qemu-img map and NBD qemu:allocation-depth (the
former chose 0, and the latter 1 for the local layer), so exposing the
latter's choice of 0 for unallocated in the entire chain would mean
using "depth":-1 in the former, but a negative depth may confuse
existing tools.  But there is an easy out: for any chain of length N,
we can simply represent an unallocated cluster as "depth":N+1.  This
does have a slight risk of confusing any tool that might try to
dereference NULL when finding the backing image for the last file in
the backing chain, but that risk seems worth the more precise output.
The iotests have several examples where this distinction demonstrates
the additional accuracy.

Signed-off-by: Eric Blake 
---

Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com
(qemu-img: Use "depth":-1 to make backing probes obvious)

Use N+1 instead of -1 for unallocated [Kevin]

 docs/tools/qemu-img.rst|   3 +-
 qemu-img.c |   3 ++
 tests/qemu-iotests/122.out |  34 ++---
 tests/qemu-iotests/154.out | 100 +++--
 tests/qemu-iotests/179.out |  67 +
 tests/qemu-iotests/223.out |  24 -
 tests/qemu-iotests/244.out |   7 +--
 tests/qemu-iotests/252.out |   4 +-
 tests/qemu-iotests/274.out |  16 +++---
 tests/qemu-iotests/291.out |   8 +--
 tests/qemu-iotests/309.out |   4 +-
 11 files changed, 162 insertions(+), 108 deletions(-)

diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index cfe11478791f..c155b1bf3cc8 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -599,7 +599,8 @@ Command description:
   - whether the data is known to read as zero (boolean field ``zero``);
   - in order to make the output shorter, the target file is expressed as
 a ``depth``; for example, a depth of 2 refers to the backing file
-of the backing file of *FILENAME*.
+of the backing file of *FILENAME*.  Depth will be one larger than
+the chain length if no file in the chain provides the data.

   In JSON format, the ``offset`` field is optional; it is absent in
   cases where ``human`` format would omit the entry or exit with an error.
diff --git a/qemu-img.c b/qemu-img.c
index a5993682aad4..33a5cd012b8b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3035,6 +3035,9 @@ static int get_block_status(BlockDriverState *bs, int64_t 
offset,
 bdrv_refresh_filename(file);
 filename = file->filename;
 }
+if (!(ret & BDRV_BLOCK_ALLOCATED)) {
+depth++;
+}

 *e = (MapEntry) {
 .start = offset,
diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index 3a3e121d579d..779dab4847f0 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -68,11 +68,11 @@ read 65536/65536 bytes at offset 4194304
 read 65536/65536 bytes at offset 8388608
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 [{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true},
-{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false},
+{ "start": 65536, "length": 4128768, "depth": 1, "zero": true, "data": false},
 { "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true},
-{ "start": 4259840, "length": 4128768, "dep

Re: [PATCH v4 06/32] block/nbd: call socket_address_parse_named_fd() in advance

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:36PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Detecting the monitor by the current coroutine works badly when we are
> not in coroutine context. And that's exactly the case in the nbd
> reconnect code, where qio_channel_socket_connect_sync() is called from a
> thread.
> 
> The monitor is needed only to parse a named file descriptor. So, let's
> just parse it during nbd_open(), so that all further users of s->saddr
> don't need to access the monitor.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 6 ++
>  1 file changed, 6 insertions(+)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Eric Blake
On Fri, Jun 11, 2021 at 01:21:45PM +0200, Kevin Wolf wrote:
> > Did you consider just adding a new field?
> > 
> > So, "depth" keeps its meaning "which level provides data".
> > 
> > And we add additional optional field like
> > 
> > absolutely-completely-absent: bool
> > 
> > Which is true if data is nowhere in the backing chain.
> 
> Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which
> I think is what the conclusion was already for NBD, so doing the same in
> 'qemu-img map' would be consistent.
> 
> This is, of course, almost the same as 'absolutely-completely-absent',
> just without negating the flag.

If we want to bikeshed on a new name, I think "allocated" is going to
cause more confusion than it solves.  And "hole" is wrong.  Better
would be "backing":true for portions of the file that would derive
from a backing file, if a backing file had been present.

But that still feels like more work than just exposing n+1 in depth.
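
To illustrate, map output for the clamped chain discussed earlier in this
thread might then render as follows (the "backing" field is only a
suggestion here, nothing implements it):

  [{ "start": 0, "length": 65536, "depth": 0, "backing": true, "zero": true, "data": false},
  { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680},
  { "start": 131072, "length": 131072, "depth": 0, "backing": true, "zero": true, "data": false}]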

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Eric Blake
On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote:
> > Yes, that might work as well.  But we didn't previously document
> > depth to be optional.  Removing something from output risks breaking
> > more downstream tools that expect it to be non-optional, compared to
> > providing a new value.
> 
> A negative value isn't any less unexpected than a missing key. I don't
> think any existing tool would be able to handle it. Encoding different
> meanings in a single value isn't very QAPI-like either. Usually strings
> that are parsed are the problem, but negative integers really aren't that
> much different. I don't really like this solution.
> 
> Leaving out the depth feels like a better suggestion to me.
> 
> But anyway, this seems to only happen at the end of the backing chain.
> So if the backing chain consists of n images, why not report 'depth':
> n + 1? So, in the above example, you would get 1. I think this has the
> best chances of tools actually working correctly with the new output,
> even though it's still not unlikely to break something.

Ooh, I like that.  It is closer to reality - the file data really
comes from the next depth, even if we have no filename at that depth.
v2 of my patch coming up.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 05/32] qemu-sockets: introduce socket_address_parse_named_fd()

2021-06-11 Thread Eric Blake
On Thu, Jun 10, 2021 at 01:07:35PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Add a function that transforms a named fd inside a SocketAddress structure
> into its number representation. This way it may then be used in a context
> where the current monitor is not available.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/qemu/sockets.h | 14 ++
>  util/qemu-sockets.c| 19 +++
>  2 files changed, 33 insertions(+)
> 
> diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
> index 7d1f813576..1f4f18a44a 100644
> --- a/include/qemu/sockets.h
> +++ b/include/qemu/sockets.h
> @@ -111,4 +111,18 @@ SocketAddress *socket_remote_address(int fd, Error 
> **errp);
>   */
>  SocketAddress *socket_address_flatten(SocketAddressLegacy *addr);
>  
> +/**
> + * socket_address_parse_named_fd:
> + *
> + * Modify @addr, replacing named fd by corresponding number.
> + *
> + * Parsing named fd (by sockget_get_fd) is not possible in context where
> + * current monitor is not available. So, SocketAddress user may first call
> + * socket_parse_named_fd() to parse named fd in advance, and then pass @addr 
> to
> + * the context where monitor is not available.

2 different wrong function names, and reads awkwardly.  How about this
shorter variant:

Modify @addr, replacing a named fd by its corresponding number.
Needed for callers that plan to pass @addr to a context where the
current monitor is not available.

> + *
> + * Return 0 on success.
> + */
> +int socket_address_parse_named_fd(SocketAddress *addr, Error **errp);
> +
>  #endif /* QEMU_SOCKETS_H */
> diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c

But the code looks good.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 14:21, Kevin Wolf wrote:

Am 11.06.2021 um 10:14 hat Vladimir Sementsov-Ogievskiy geschrieben:

11.06.2021 11:09, Kevin Wolf wrote:

Am 10.06.2021 um 22:46 hat Eric Blake geschrieben:

On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:

But:

$ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \
"file":{"driver":"file","filename":"top.qcow2"}}'
[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}]

also reports the entire file at "depth":0, which is misleading, since
we have just been arguing from the qemu:allocation-depth perspective
(and also from bdrv_block_status) that the qcow2 image is NOT 100%
allocated (in the sense where allocation == data comes locally).
Perhaps it might be better if we tweaked the above qemu-img map to
produce:

[{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}]


It will be more consistent with "offset" to drop "depth" from output
if we don't have it:

  [{ "start": 0, "length": 65536, "zero": true, "data": false},
   { "start": 65536, "length": 65536, "depth": 0, "zero": false,
"data": true, "offset": 327680},
   { "start": 131072, "length": 65536, "depth": 0, "zero": true,
"data": false},
   { "start": 196608, "length": 65536, "zero": true, "data": false}]


Yes, that might work as well.  But we didn't previously document
depth to be optional.  Removing something from output risks breaking
more downstream tools that expect it to be non-optional, compared to
providing a new value.


A negative value isn't any less unexpected than a missing key. I don't
think any existing tool would be able to handle it. Encoding different
meanings in a single value isn't very QAPI-like either. Usually strings
that are parsed are the problem, but negative integers really aren't that
much different. I don't really like this solution.

Leaving out the depth feels like a better suggestion to me.

But anyway, this seems to only happen at the end of the backing chain.
So if the backing chain consists of n images, why not report 'depth':
n + 1? So, in the above example, you would get 1. I think this has the
best chances of tools actually working correctly with the new output,
even though it's still not unlikely to break something.



Did you consider just adding a new field?

So, "depth" keeps its meaning "which level provides data".

And we add additional optional field like

absolutely-completely-absent: bool

Which is true if data is nowhere in the backing chain.


Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which
I think is what the conclusion was already for NBD, so doing the same in
'qemu-img map' would be consistent.


"allocated" is historically ambiguous: we never know exactly does it mean "occupy space on 
disk" or "data (or zeroes) taken from this qcow2 image, not from backing".

Eric recently sent a related patch to libnbd:

 [libnbd PATCH] info: Avoid ambiguous 'allocated' terminology in mapping



This is, of course, almost the same as 'absolutely-completely-absent',
just without negating the flag.

Kevin




--
Best regards,
Vladimir



[PATCH] block/nvme: Fix VFIO_MAP_DMA failed: No space left on device

2021-06-11 Thread Philippe Mathieu-Daudé
When the NVMe block driver was introduced (see commit bdd6a90a9e5,
January 2018), Linux VFIO_IOMMU_MAP_DMA ioctl was only returning
-ENOMEM in case of error. The driver was correctly handling the
error path to recycle its volatile IOVA mappings.

To fix CVE-2019-3882, Linux commit 492855939bdb ("vfio/type1: Limit
DMA mappings per container", April 2019) added the -ENOSPC error to
signal the user exhausted the DMA mappings available for a container.

The block driver started to misbehave:

  qemu-system-x86_64: VFIO_MAP_DMA failed: No space left on device
  (qemu)
  (qemu) info status
  VM status: paused (io-error)
  (qemu) c
  VFIO_MAP_DMA failed: No space left on device
  qemu-system-x86_64: block/block-backend.c:1968: blk_get_aio_context: 
Assertion `ctx == blk->ctx' failed.

Fix by handling the -ENOSPC error when DMA mappings are exhausted;
other errors (such as -ENOMEM) are still handled later in the same
function.

An easy way to reproduce this bug is to restrict the DMA mapping
limit (65535 by default) when loading the VFIO IOMMU module:

  # modprobe vfio_iommu_type1 dma_entry_limit=666
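
The active limit can then be read back (assuming the module parameter is
exposed via sysfs, as readable module parameters usually are):

  # cat /sys/module/vfio_iommu_type1/parameters/dma_entry_limit
  666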

Cc: qemu-sta...@nongnu.org
Reported-by: Michal Prívozník 
Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver")
Buglink: https://bugs.launchpad.net/qemu/+bug/186
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/65
Signed-off-by: Philippe Mathieu-Daudé 
---
Michal, is it still possible for you to test this (old bug)?

A functional test using viommu & nested VM is planned (suggested by
Stefan and Maxim).
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index 2b5421e7aa6..12f9dd5cce3 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1030,7 +1030,7 @@ try_map:
 r = qemu_vfio_dma_map(s->vfio,
   qiov->iov[i].iov_base,
   len, true, &iova);
-if (r == -ENOMEM && retry) {
+if (r == -ENOSPC && retry) {
 retry = false;
 trace_nvme_dma_flush_queue_wait(s);
 if (s->dma_map_count) {
-- 
2.31.1




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Kevin Wolf
Am 11.06.2021 um 10:14 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 11.06.2021 11:09, Kevin Wolf wrote:
> > Am 10.06.2021 um 22:46 hat Eric Blake geschrieben:
> > > On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:
> > > > > But:
> > > > > 
> > > > > $ qemu-img map --output=json -f qcow2 
> > > > > json:'{"driver":"qcow2","backing":null, \
> > > > >"file":{"driver":"file","filename":"top.qcow2"}}'
> > > > > [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": 
> > > > > false},
> > > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
> > > > > true, "offset": 327680},
> > > > > { "start": 131072, "length": 131072, "depth": 0, "zero": true, 
> > > > > "data": false}]
> > > > > 
> > > > > also reports the entire file at "depth":0, which is misleading, since
> > > > > we have just been arguing from the qemu:allocation-depth perspective
> > > > > (and also from bdrv_block_status) that the qcow2 image is NOT 100%
> > > > > allocated (in the sense where allocation == data comes locally).
> > > > > Perhaps it might be better if we tweaked the above qemu-img map to
> > > > > produce:
> > > > > 
> > > > > [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": 
> > > > > false},
> > > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
> > > > > true, "offset": 327680},
> > > > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
> > > > > false},
> > > > > { "start": 196608, "length": 65536, "depth": -1, "zero": true, 
> > > > > "data": false}]
> > > > 
> > > > It will be more consistent with "offset" to drop "depth" from output
> > > > if we don't have it:
> > > > 
> > > >  [{ "start": 0, "length": 65536, "zero": true, "data": false},
> > > >   { "start": 65536, "length": 65536, "depth": 0, "zero": false,
> > > > "data": true, "offset": 327680},
> > > >   { "start": 131072, "length": 65536, "depth": 0, "zero": true,
> > > > "data": false},
> > > >   { "start": 196608, "length": 65536, "zero": true, "data": false}]
> > > 
> > > Yes, that might work as well.  But we didn't previously document
> > > depth to be optional.  Removing something from output risks breaking
> > > more downstream tools that expect it to be non-optional, compared to
> > > providing a new value.
> > 
> > A negative value isn't any less unexpected than a missing key. I don't
> > think any existing tool would be able to handle it. Encoding different
> > meanings in a single value isn't very QAPI-like either. Usually strings
> > that are parsed are the problem, but negative integers really aren't that
> > much different. I don't really like this solution.
> > 
> > Leaving out the depth feels like a better suggestion to me.
> > 
> > But anyway, this seems to only happen at the end of the backing chain.
> > So if the backing chain consists of n images, why not report 'depth':
> > n + 1? So, in the above example, you would get 1. I think this has the
> > best chances of tools actually working correctly with the new output,
> > even though it's still not unlikely to break something.
> > 
> 
> Did you consider just adding a new field?
> 
> So, "depth" keeps its meaning "which level provides data".
> 
> And we add additional optional field like
> 
> absolutely-completely-absent: bool
> 
> Which is true if data is nowhere in the backing chain.

Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which
I think is what the conclusion was already for NBD, so doing the same in
'qemu-img map' would be consistent.

This is, of course, almost the same as 'absolutely-completely-absent',
just without negating the flag.

Kevin




Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-11 Thread Paolo Bonzini

On 09/06/21 17:01, Vladimir Sementsov-Ogievskiy wrote:

09.06.2021 15:22, Paolo Bonzini wrote:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

With this change, qemu_mutex_iothread_locked() must be changed from


Did you miss "qemu_mutex_iothread_locked() stub function"?


Yes, that comment refers to the stub.  Let me resubmit with a testcase 
and I'll fix that too.


Paolo




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 12:05, Nir Soffer wrote:




On 11 June 2021, at 11:14, Vladimir Sementsov-Ogievskiy wrote:

11.06.2021 11:09, Kevin Wolf wrote:

Am 10.06.2021 um 22:46 hat Eric Blake geschrieben:

On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:

But:

$ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \
  "file":{"driver":"file","filename":"top.qcow2"}}'
[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}]

also reports the entire file at "depth":0, which is misleading, since
we have just been arguing from the qemu:allocation-depth perspective
(and also from bdrv_block_status) that the qcow2 image is NOT 100%
allocated (in the sense where allocation == data comes locally).
Perhaps it might be better if we tweaked the above qemu-img map to
produce:

[{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}]


It will be more consistent with "offset" to drop "depth" from output
if we don't have it:

[{ "start": 0, "length": 65536, "zero": true, "data": false},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false,
"data": true, "offset": 327680},
 { "start": 131072, "length": 65536, "depth": 0, "zero": true,
"data": false},
 { "start": 196608, "length": 65536, "zero": true, "data": false}]


Yes, that might work as well.  But we didn't previously document
depth to be optional.  Removing something from output risks breaking
more downstream tools that expect it to be non-optional, compared to
providing a new value.

A negative value isn't any less unexpected than a missing key. I don't
think any existing tool would be able to handle it. Encoding different
meanings in a single value isn't very QAPI-like either. Usually strings
that are parsed are the problem, but negative integers really aren't that
much different. I don't really like this solution.
Leaving out the depth feels like a better suggestion to me.
But anyway, this seems to only happen at the end of the backing chain.
So if the backing chain consists of n images, why not report 'depth':
n + 1? So, in the above example, you would get 1. I think this has the
best chances of tools actually working correctly with the new output,
even though it's still not unlikely to break something.


Did you consider just adding a new field?

So, "depth" keeps its meaning "which level provides data".

And we add additional optional field like

absolutely-completely-absent: bool


hole: bool?



That clashes with file-posix holes, which are UNALLOCATED_ZERO.

I think we should somehow start to honestly report backing chains and use 
the "backing" concept in interfaces.

maybe
nobacking: bool

  May be true only together with data=false and zero=true, and means that all 
layers refer to the backing file for this region, but the last layer just 
doesn't have a backing image currently and therefore returns zeroes.


--
Best regards,
Vladimir



Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL

2021-06-11 Thread Paolo Bonzini

On 09/06/21 15:02, Kevin Wolf wrote:

Am 09.06.2021 um 14:22 hat Paolo Bonzini geschrieben:

If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work.  In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread.  aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().

The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different.  The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread().  It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.

Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication.  The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop.  Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().

With this change, qemu_mutex_iothread_locked() must be changed from
true to false.  The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Paolo Bonzini 


The commit message doesn't specify, but in the buggy case we are
talking about calling aio_co_wake() for a coroutine in the main context
specifically, right? Could we have a unit test for this scenario?


Yes, that's the scenario.  I will try to write a unit test indeed.

Paolo




Re: iotest 233 failing

2021-06-11 Thread Daniel P . Berrangé
On Thu, Jun 10, 2021 at 10:33:40PM +0100, Daniel P. Berrangé wrote:
> On Thu, Jun 10, 2021 at 10:31:14PM +0100, Daniel P. Berrangé wrote:
> > On Thu, Jun 10, 2021 at 03:34:46PM -0500, Eric Blake wrote:
> > > I'm now getting failures on iotest 233:
> > > 
> > > 233   fail   [15:26:01] [15:26:03]   2.1s   (last: 1.3s)  output 
> > > mismatch (see 233.out.bad)
> > > --- /home/eblake/qemu/tests/qemu-iotests/233.out
> > > +++ 233.out.bad
> > > @@ -65,6 +65,6 @@
> > >  == final server log ==
> > >  qemu-nbd: option negotiation failed: Verify failed: No certificate was 
> > > found.
> > >  qemu-nbd: option negotiation failed: Verify failed: No certificate was 
> > > found.
> > > -qemu-nbd: option negotiation failed: TLS x509 authz check for 
> > > CN=localhost,O=Cthulhu Dark Lord Enterprises client1,L=R'lyeh,C=South 
> > > Pacific is denied
> > > -qemu-nbd: option negotiation failed: TLS x509 authz check for 
> > > CN=localhost,O=Cthulhu Dark Lord Enterprises client3,L=R'lyeh,C=South 
> > > Pacific is denied
> > > +qemu-nbd: option negotiation failed: TLS x509 authz check for C=South 
> > > Pacific,L=R'lyeh,O=Cthulhu Dark Lord Enterprises client1,CN=localhost is 
> > > denied
> > > +qemu-nbd: option negotiation failed: TLS x509 authz check for C=South 
> > > Pacific,L=R'lyeh,O=Cthulhu Dark Lord Enterprises client3,CN=localhost is 
> > > denied
> > >  *** done
> > > Failures: 233
> > > Failed 1 of 1 iotests
> > > 
> > > Looks like I recently updated to gnutls-3.7.2-1.fc34 on June 1, could
> > > that be the culprit for the error message being reordered?
> > 
> > It is possible I guess. They have indeed made such a change in the past
> > and reverted it when I pointed out that this is effectively an ABI for
> > apps, because access control lists are based on matching the distinguished
> > name string, as an opaque string. The cause certainly needs investigating
> > as a matter of urgency because this is ABI for QEMU's authz access control
> > lists.
> 
> There is an ominous sounding NEWS item in 3.7.2
> 
> ** certtool: When producing certificates and certificate requests, subject DN
>components that are provided individually will now be ordered by
>assumed scale (e.g. Country before State, Organization before
>OrganizationalUnit).  This change also affects the order in which
>certtool prompts interactively.  Please rely on the template
>mechanism for automated use of certtool! (#1243)
> 
> This ordering change in certtool seems to correspond with the new order
> you see above in the distinguished name, so I wonder if the certtool
> change had accidental side effects.

Right, so iotest 233 of course creates a new certificate, and so it picks
up the new certtool behaviour, which means the certificate it generates
has the distinguished name in the new order. This is good because it
means the gnutls API for querying the distinguished name is still using
the same ordering as before. This is bad because the iotest is obviously
susceptible to changes in the way the certificate is created.

I think we might just need to apply a filter to strip the distinguished
name from the output.
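
Something along these lines, as a rough sketch (the helper name and regex
are invented, not the actual fix):

  _filter_x509_dname()
  {
      sed -e 's/x509 authz check for .* is denied/x509 authz check for DNAME is denied/'
  }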

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Nir Soffer


> On 11 June 2021, at 11:14, Vladimir Sementsov-Ogievskiy wrote:
> 
> 11.06.2021 11:09, Kevin Wolf wrote:
>> Am 10.06.2021 um 22:46 hat Eric Blake geschrieben:
 On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:
>> But:
>> 
>> $ qemu-img map --output=json -f qcow2 
>> json:'{"driver":"qcow2","backing":null, \
>>   "file":{"driver":"file","filename":"top.qcow2"}}'
>> [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
>> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
>> true, "offset": 327680},
>> { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": 
>> false}]
>> 
>> also reports the entire file at "depth":0, which is misleading, since
>> we have just been arguing from the qemu:allocation-depth perspective
>> (and also from bdrv_block_status) that the qcow2 image is NOT 100%
>> allocated (in the sense where allocation == data comes locally).
>> Perhaps it might be better if we tweaked the above qemu-img map to
>> produce:
>> 
>> [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": 
>> false},
>> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
>> true, "offset": 327680},
>> { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
>> false},
>> { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": 
>> false}]
> 
> It will be more consistent with "offset" to drop "depth" from output
> if we don't have it:
> 
> [{ "start": 0, "length": 65536, "zero": true, "data": false},
>  { "start": 65536, "length": 65536, "depth": 0, "zero": false,
> "data": true, "offset": 327680},
>  { "start": 131072, "length": 65536, "depth": 0, "zero": true,
> "data": false},
>  { "start": 196608, "length": 65536, "zero": true, "data": false}]
>>> 
>>> Yes, that might work as well.  But we didn't previously document
>>> depth to be optional.  Removing something from output risks breaking
>>> more downstream tools that expect it to be non-optional, compared to
>>> providing a new value.
>> A negative value isn't any less unexpected than a missing key. I don't
>> think any existing tool would be able to handle it. Encoding different
>> meanings in a single value isn't very QAPI-like either. Usually strings
>> that are parsed are the problem, but negative integers really aren't that
>> much different. I don't really like this solution.
>> Leaving out the depth feels like a better suggestion to me.
>> But anyway, this seems to only happen at the end of the backing chain.
>> So if the backing chain consists of n images, why not report 'depth':
>> n + 1? So, in the above example, you would get 1. I think this has the
>> best chances of tools actually working correctly with the new output,
>> even though it's still not unlikely to break something.
> 
> Did you consider just adding a new field?
> 
> So, "depth" keeps its meaning "which level provides data".
> 
> And we add additional optional field like
> 
> absolutely-completely-absent: bool

hole: bool?

> 
> Which is true if data is nowhere in the backing chain.
> 
> 
> -- 
> Best regards,
> Vladimir


Re: [PATCH] qemu-img: Use "depth":-1 to make backing probes obvious

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 11:08, Vladimir Sementsov-Ogievskiy wrote:

11.06.2021 00:39, Eric Blake wrote:

The recently-added NBD context qemu:allocation-depth makes an obvious
case for why it is important to distinguish between locally-present
data (even when that data is sparse) [shown as depth 1 over NBD], and
data that could not be found anywhere in the backing chain [shown as
depth 0].  But qemu-img map --output=json predates that addition, and
has the unfortunate behavior that all portions of the backing chain
that resolve without finding a hit in any backing layer report the
same depth as the final backing layer.  This makes it harder to
reconstruct a qcow2 backing chain using just 'qemu-img map' output
when using "backing":null to artificially limit a backing chain,
because it is impossible to distinguish between a
QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file)
and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any
backing file), since both types of clusters otherwise show as
"data":false,"zero":true" (but note that we can distinguish a
QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset":
listing).
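
To make the pre-patch ambiguity concrete, here is a minimal sketch of a
consumer classifying map entries by the "data"/"zero"/"offset" triple
(the helper name and labels are illustrative, not qemu code):

    def classify(entry):
        # entry is one element of 'qemu-img map --output=json' output
        if entry["zero"] and not entry["data"]:
            if "offset" in entry:
                # Zeroes backed by a host cluster: ZERO_ALLOCATED is
                # still distinguishable thanks to the "offset" key.
                return "zero, allocated"
            # Pre-patch, ZERO_PLAIN and UNALLOCATED both land here.
            return "zero, plain or unallocated (ambiguous)"
        return "data"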

The task of reconstructing a qcow2 chain was made harder in commit
0da9856851 (nbd: server: Report holes for raw images), because prior
to that point, it was possible to abuse NBD's block status command to
see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED
(showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain
(showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports
more accurate sparseness information over NBD.
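
For reference, the two flag combinations mentioned above decode as
follows; a short sketch using the base:allocation flag values from the
NBD protocol specification (the probe function is illustrative only):

    NBD_STATE_HOLE = 0x1  # no storage allocated for the extent
    NBD_STATE_ZERO = 0x2  # extent reads as zeroes

    def old_probe(flags):
        if flags == NBD_STATE_ZERO:
            return "allocated zero cluster (BDRV_BLOCK_ALLOCATED)"
        if flags == (NBD_STATE_ZERO | NBD_STATE_HOLE):
            return "missing from the chain"
        return "data"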

An obvious solution is to make 'qemu-img map --output=json' visually
distinguish between clusters that have a local allocation from those
that are found nowhere in the chain, by adding "depth":-1 as the new
witness of data that could not be tied to a specific backing image.
Several iotests are impacted, but glancing through the changes shows
that it is an improvement in that it shows more accurate details.
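
A consumer of the new output needs only one extra comparison; a hedged
sketch, assuming the top.qcow2 image from the demonstration below (the
printing is illustrative):

    import json
    import subprocess

    out = subprocess.run(
        ["qemu-img", "map", "--output=json", "-f", "qcow2", "top.qcow2"],
        capture_output=True, text=True, check=True)
    for entry in json.loads(out.stdout):
        if entry["depth"] == -1:
            # Proposed sentinel: no image in the (possibly clamped)
            # chain provides this range; it reads as zeroes by default.
            print(f"{entry['start']:#x}: absent from the whole chain")
        else:
            print(f"{entry['start']:#x}: from layer {entry['depth']}")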

Note that the documentation is specifically worded to allow qemu-img
to report "depth":-1 both in the scenario where the last file in the
backing chain still defers the cluster (corresponding to
BDRV_BLOCK_ALLOCATED not being set anywhere in the chain), and in the
scenario where qemu is unable to determine which backing chain element
(if any) provides the data.  The latter case does not exist now, but
I'm considering an upcoming patch to add a BDRV_BLOCK_BACKING that
would let a specific driver (such as NBD) inform the block layer that
it is known that a cluster comes from a backing layer, but where there
is insufficient data to determine which layer.

As a quick demonstration:

 # Create a qcow2 image with a raw backing file:
 $ qemu-img create base.raw $((4*64*1024))
 $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2

 # Write to first 3 clusters of base:
 $ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \
   -c "w -P 67 128k 64k" base.raw

 # Write to second and third clusters of top, hiding base:
 $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2

 # Examine the full backing chain
 $ qemu-img map --output=json -f qcow2 top.qcow2
 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, 
"offset": 0},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
false},
 { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, 
"offset": 196608}]

 # Repeat, but with the backing chain clamped. Pre-patch:
 $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \
   "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}'
 [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": 
false}]

 # Repeat, but post-patch:
 $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \
   "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}'
 [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
false},
 { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": 
false}]

Note that pre-patch, it was impossible to determine which portions of
the qcow2 file override the backing file because the "depth":0 regions
were combined, so even though qemu internally can tell the difference
between clusters 2 and 3, the command line user could not.  But
post-patch, the "depth":-1 markings match the "depth":1 markings when
the backing chain is intact, and it becomes obvious which clusters are
important.

Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 11:09, Kevin Wolf wrote:

On 10.06.2021 at 22:46, Eric Blake wrote:

On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:

But:

$ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \
   "file":{"driver":"file","filename":"top.qcow2"}}'
[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}]

also reports the entire file at "depth":0, which is misleading, since
we have just been arguing from the qemu:allocation-depth perspective
(and also from bdrv_block_status) that the qcow2 image is NOT 100%
allocated (in the sense where allocation == data comes locally).
Perhaps it might be better if we tweaked the above qemu-img map to
produce:

[{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
{ "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
{ "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false},
{ "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}]


It would be more consistent with "offset" to drop "depth" from the
output when we don't have it:

 [{ "start": 0, "length": 65536, "zero": true, "data": false},
  { "start": 65536, "length": 65536, "depth": 0, "zero": false,
"data": true, "offset": 327680},
  { "start": 131072, "length": 65536, "depth": 0, "zero": true,
"data": false},
  { "start": 196608, "length": 65536, "zero": true, "data": false}]


Yes, that might work as well.  But we didn't previously document
depth to be optional.  Removing something from output risks breaking
more downstream tools that expect it to be non-optional, compared to
providing a new value.
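
Tools written defensively can tolerate either choice; a sketch of the
tolerant access pattern (not code from any existing tool):

    def layer_of(entry):
        depth = entry.get("depth")  # tolerate a dropped key
        if depth is None or depth == -1:
            return None  # not provided by any layer in the chain
        return depth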


A negative value isn't any less unexpected than a missing key. I don't
think any existing tool would be able to handle it. Encoding different
meanings in a single value isn't very QAPI-like either. Usually strings
that are parsed are the problem, but negative integers really aren't that
much different. I don't really like this solution.

Leaving out the depth feels like a better suggestion to me.

But anyway, this seems to only happen at the end of the backing chain.
So if the backing chain consists of n images, why not report 'depth':
n + 1? So, in the above example, you would get 1. I think this has the
best chances of tools actually working correctly with the new output,
even though it's still not unlikely to break something.
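
Under that variant a tool would first have to learn the chain length; a
sketch, assuming the n-image proposal and using 'qemu-img info
--backing-chain' to count images:

    import json
    import subprocess

    def chain_length(image):
        # --backing-chain emits one JSON array element per image
        out = subprocess.run(
            ["qemu-img", "info", "--output=json", "--backing-chain", image],
            capture_output=True, text=True, check=True)
        return len(json.loads(out.stdout))

    # Real layers get depths 0..n-1; the first unused value would mark
    # "found nowhere" (1 for the chain clamped to a single image above),
    # i.e. entries with entry["depth"] == absent_depth are the absent ones.
    absent_depth = chain_length("top.qcow2")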



Did you consider just adding a new field?

So, "depth" keeps its meaning "which level provides data".

And we add an additional optional field like

absolutely-completely-absent: bool

Which is true if data is nowhere in the backing chain.
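
Whatever the final spelling, an optional boolean degrades gracefully
for existing consumers; a sketch with a hypothetical field name:

    def found_nowhere(entry):
        # Hypothetical optional key; current qemu never emits it, so
        # default to False ("some layer provides the data").
        return entry.get("absent", False)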


--
Best regards,
Vladimir



Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole

2021-06-11 Thread Kevin Wolf
On 10.06.2021 at 22:46, Eric Blake wrote:
> On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote:
> > > But:
> > >
> > > $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \
> > >   "file":{"driver":"file","filename":"top.qcow2"}}'
> > > [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
> > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
> > > true, "offset": 327680},
> > > { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": 
> > > false}]
> > >
> > > also reports the entire file at "depth":0, which is misleading, since
> > > we have just been arguing from the qemu:allocation-depth perspective
> > > (and also from bdrv_block_status) that the qcow2 image is NOT 100%
> > > allocated (in the sense where allocation == data comes locally).
> > > Perhaps it might be better if we tweaked the above qemu-img map to
> > > produce:
> > >
> > > [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
> > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": 
> > > true, "offset": 327680},
> > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
> > > false},
> > > { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": 
> > > false}]
> > 
> > It would be more consistent with "offset" to drop "depth" from the
> > output when we don't have it:
> > 
> > [{ "start": 0, "length": 65536, "zero": true, "data": false},
> >  { "start": 65536, "length": 65536, "depth": 0, "zero": false,
> > "data": true, "offset": 327680},
> >  { "start": 131072, "length": 65536, "depth": 0, "zero": true,
> > "data": false},
> >  { "start": 196608, "length": 65536, "zero": true, "data": false}]
> 
> Yes, that might work as well.  But we didn't previously document
> depth to be optional.  Removing something from output risks breaking
> more downstream tools that expect it to be non-optional, compared to
> providing a new value.

A negative value isn't any less unexpected than a missing key. I don't
think any existing tool would be able to handle it. Encoding different
meanings in a single value isn't very QAPI-like either. Usually strings
that are parsed are the problem, but negative integers really aren't that
much different. I don't really like this solution.

Leaving out the depth feels like a better suggestion to me.

But anyway, this seems to only happen at the end of the backing chain.
So if the backing chain consists of n images, why not report 'depth':
n + 1? So, in the above example, you would get 1. I think this has the
best chances of tools actually working correctly with the new output,
even though it's still not unlikely to break something.

Kevin




Re: [PATCH] qemu-img: Use "depth":-1 to make backing probes obvious

2021-06-11 Thread Vladimir Sementsov-Ogievskiy

11.06.2021 00:39, Eric Blake wrote:

The recently-added NBD context qemu:allocation-depth makes an obvious
case for why it is important to distinguish between locally-present
data (even when that data is sparse) [shown as depth 1 over NBD], and
data that could not be found anywhere in the backing chain [shown as
depth 0].  But qemu-img map --output=json predates that addition, and
has the unfortunate behavior that all portions of the backing chain
that resolve without finding a hit in any backing layer report the
same depth as the final backing layer.  This makes it harder to
reconstruct a qcow2 backing chain using just 'qemu-img map' output
when using "backing":null to artificially limit a backing chain,
because it is impossible to distinguish between a
QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file)
and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any
backing file), since both types of clusters otherwise show as
"data":false,"zero":true" (but note that we can distinguish a
QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset":
listing).

The task of reconstructing a qcow2 chain was made harder in commit
0da9856851 (nbd: server: Report holes for raw images), because prior
to that point, it was possible to abuse NBD's block status command to
see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED
(showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain
(showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports
more accurate sparseness information over NBD.

An obvious solution is to make 'qemu-img map --output=json' visually
distinguish between clusters that have a local allocation from those
that are found nowhere in the chain, by adding "depth":-1 as the new
witness of data that could not be tied to a specific backing image.
Several iotests are impacted, but glancing through the changes shows
that it is an improvement in that it shows more accurate details.

Note that the documentation is specifically worded to allow qemu-img
to report "depth":-1 both in the scenario where the last file in the
backing chain still defers the cluster (corresponding to
BDRV_BLOCK_ALLOCATED not being set anywhere in the chain), and in the
scenario where qemu is unable to determine which backing chain element
(if any) provides the data.  The latter case does not exist now, but
I'm considering an upcoming patch to add a BDRV_BLOCK_BACKING that
would let a specific driver (such as NBD) inform the block layer that
it is known that a cluster comes from a backing layer, but where there
is insufficient data to determine which layer.

As a quick demonstration:

 # Create a qcow2 image with a raw backing file:
 $ qemu-img create base.raw $((4*64*1024))
 $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2

 # Write to first 3 clusters of base:
 $ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \
   -c "w -P 67 128k 64k" base.raw

 # Write to second and third clusters of top, hiding base:
 $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2

 # Examine the full backing chain
 $ qemu-img map --output=json -f qcow2 top.qcow2
 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, 
"offset": 0},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
false},
 { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, 
"offset": 196608}]

 # Repeat, but with the backing chain clamped. Pre-patch:
 $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \
   "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}'
 [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": 
false}]

 # Repeat, but post-patch:
 $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \
   "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}'
 [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false},
 { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": 327680},
 { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": 
false},
 { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": 
false}]

Note that pre-patch, it was impossible to determine which portions of
the qcow2 file override the backing file because the "depth":0 regions
were combined, so even though qemu internally can tell the difference
between clusters 2 and 3, the command line user could not.  But
post-patch, the "depth":-1 markings match the "depth":1 markings when
the backing chain is intact, and it becomes obvious which clusters are
important.