[PATCH v2 2/3] hw/nvme: namespace parameter for EUI-64
The EUI-64 field is the only identifier for NVMe namespaces in UEFI device paths. Add a new namespace property "eui64", that provides the user the option to specify the EUI-64. Signed-off-by: Heinrich Schuchardt Acked-by: Klaus Jensen --- v2: fix typo %s/EUI64/EUI-64/ --- docs/system/nvme.rst | 4 +++ hw/nvme/ctrl.c | 58 ++-- hw/nvme/ns.c | 2 ++ hw/nvme/nvme.h | 1 + 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst index f7f63d6bf6..b5f8288d7c 100644 --- a/docs/system/nvme.rst +++ b/docs/system/nvme.rst @@ -81,6 +81,10 @@ There are a number of parameters available: Set the UUID of the namespace. This will be reported as a "Namespace UUID" descriptor in the Namespace Identification Descriptor List. +``eui64`` + Set the EUI-64 of the namespace. This will be reported as a "IEEE Extended + Unique Identifier" descriptor in the Namespace Identification Descriptor List. + ``bus`` If there are more ``nvme`` devices defined, this parameter may be used to attach the namespace to a specific ``nvme`` device (identified by an ``id`` diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8dd9cb2ccb..f37c4fd635 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4436,19 +4436,19 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; - -struct data { -struct { -NvmeIdNsDescr hdr; -uint8_t v[NVME_NIDL_UUID]; -} uuid; -struct { -NvmeIdNsDescr hdr; -uint8_t v; -} csi; -}; - -struct data *ns_descrs = (struct data *)list; +uint8_t *pos = list; +struct { +NvmeIdNsDescr hdr; +uint8_t v[NVME_NIDL_UUID]; +} QEMU_PACKED uuid; +struct { +NvmeIdNsDescr hdr; +uint64_t v; +} QEMU_PACKED eui64; +struct { +NvmeIdNsDescr hdr; +uint8_t v; +} QEMU_PACKED csi; trace_pci_nvme_identify_ns_descr_list(nsid); @@ -4462,17 +4462,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest 
*req) } /* - * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data - * structure, a Namespace UUID (nidt = 3h) must be reported in the - * Namespace Identification Descriptor. Add the namespace UUID here. + * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must + * provide a valid Namespace UUID in the Namespace Identification Descriptor + * data structure. QEMU does not yet support setting NGUID. */ -ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; -ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID; -memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); - -ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI; -ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; -ns_descrs->csi.v = ns->csi; +uuid.hdr.nidt = NVME_NIDT_UUID; +uuid.hdr.nidl = NVME_NIDL_UUID; +memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); +memcpy(pos, &uuid, sizeof(uuid)); +pos += sizeof(uuid); + +if (ns->params.eui64) { +eui64.hdr.nidt = NVME_NIDT_EUI64; +eui64.hdr.nidl = NVME_NIDL_EUI64; +eui64.v = cpu_to_be64(ns->params.eui64); +memcpy(pos, &eui64, sizeof(eui64)); +pos += sizeof(eui64); +} + +csi.hdr.nidt = NVME_NIDT_CSI; +csi.hdr.nidl = NVME_NIDL_CSI; +csi.v = ns->csi; +memcpy(pos, &csi, sizeof(csi)); +pos += sizeof(csi); return nvme_c2h(n, list, sizeof(list), req); } diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 3fec9c6273..45e457de6a 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -77,6 +77,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->mssrl = cpu_to_le16(ns->params.mssrl); id_ns->mcl = cpu_to_le32(ns->params.mcl); id_ns->msrc = ns->params.msrc; +id_ns->eui64 = cpu_to_be64(ns->params.eui64); ds = 31 - clz32(ns->blkconf.logical_block_size); ms = ns->params.ms; @@ -511,6 +512,7 @@ static Property nvme_ns_props[] = { DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), +DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0), 
DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0), DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0), DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0), diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 93a7e0e538..ac90e13d7b 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -83,6 +83,7 @@ typedef struct NvmeNamespaceParams { bool shared; uint32_t nsid; QemuUUID uuid; +uint64_t eui64; uint16_t ms; uint8_t mset; -- 2.30.2
[PATCH v2 0/3] hw/nvme: namespace parameter for EUI-64
The EUI-64 field is the only identifier for NVMe namespaces in UEFI device paths. Add a new namespace property "eui64", that provides the user the option to specify the EUI-64. v2: include patch for hw_compat_6_0 add a patch to implement default values for the EUI-64 Heinrich Schuchardt (3): hw: virt: consider hw_compat_6_0 hw/nvme: namespace parameter for EUI-64 hw/nvme: default for namespace EUI-64 docs/system/nvme.rst | 6 + hw/arm/virt.c| 2 ++ hw/core/machine.c| 1 + hw/nvme/ctrl.c | 58 ++-- hw/nvme/ns.c | 11 + hw/nvme/nvme.h | 3 +++ 6 files changed, 58 insertions(+), 23 deletions(-) -- 2.30.2
[PATCH v2 1/3] hw: virt: consider hw_compat_6_0
virt-6.0 must consider hw_compat_6_0. Fixes: da7e13c00b59 ("hw: add compat machines for 6.1") Signed-off-by: Heinrich Schuchardt Reviewed-by: Cornelia Huck --- v2: add missing Fixes: tag --- hw/arm/virt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 840758666d..8bc3b408fe 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2764,6 +2764,8 @@ DEFINE_VIRT_MACHINE_AS_LATEST(6, 1) static void virt_machine_6_0_options(MachineClass *mc) { +virt_machine_6_1_options(mc); +compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len); } DEFINE_VIRT_MACHINE(6, 0) -- 2.30.2
[PATCH v2 3/3] hw/nvme: default for namespace EUI-64
On machines with version > 6.0 replace a missing EUI-64 by a generated value. Signed-off-by: Heinrich Schuchardt --- v2: new patch --- docs/system/nvme.rst | 2 ++ hw/core/machine.c| 1 + hw/nvme/ns.c | 9 + hw/nvme/nvme.h | 2 ++ 4 files changed, 14 insertions(+) diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst index b5f8288d7c..33a15c7dbc 100644 --- a/docs/system/nvme.rst +++ b/docs/system/nvme.rst @@ -84,6 +84,8 @@ There are a number of parameters available: ``eui64`` Set the EUI-64 of the namespace. This will be reported as a "IEEE Extended Unique Identifier" descriptor in the Namespace Identification Descriptor List. + Since machine type 6.1 a non-zero default value is used if the parameter + is not provided. For earlier machine types the field defaults to 0. ``bus`` If there are more ``nvme`` devices defined, this parameter may be used to diff --git a/hw/core/machine.c b/hw/core/machine.c index 55b9bc7817..d0e934 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -39,6 +39,7 @@ GlobalProperty hw_compat_6_0[] = { { "gpex-pcihost", "allow-unmapped-accesses", "false" }, { "i8042", "extended-state", "false"}, +{ "nvme-ns", "eui64-default", "off"}, }; const size_t hw_compat_6_0_len = G_N_ELEMENTS(hw_compat_6_0); diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 45e457de6a..4275c3db63 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -56,6 +56,7 @@ void nvme_ns_init_format(NvmeNamespace *ns) static int nvme_ns_init(NvmeNamespace *ns, Error **errp) { +static uint64_t ns_count; NvmeIdNs *id_ns = &ns->id_ns; uint8_t ds; uint16_t ms; @@ -73,6 +74,12 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->nmic |= NVME_NMIC_NS_SHARED; } +/* Substitute a missing EUI-64 by an autogenerated one */ +++ns_count; +if (!ns->params.eui64 && ns->params.eui64_default) { +ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT; +} + /* simple copy */ id_ns->mssrl = cpu_to_le16(ns->params.mssrl); id_ns->mcl = cpu_to_le32(ns->params.mcl); @@ -533,6 +540,8 @@ static Property 
nvme_ns_props[] = { params.max_open_zones, 0), DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, params.zd_extension_size, 0), +DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default, + true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index ac90e13d7b..3fb869731d 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -26,6 +26,7 @@ #define NVME_MAX_CONTROLLERS 32 #define NVME_MAX_NAMESPACES 256 +#define NVME_EUI64_DEFAULT 0x27fed9272381cbd0UL typedef struct NvmeCtrl NvmeCtrl; typedef struct NvmeNamespace NvmeNamespace; @@ -84,6 +85,7 @@ typedef struct NvmeNamespaceParams { uint32_t nsid; QemuUUID uuid; uint64_t eui64; +bool eui64_default; uint16_t ms; uint8_t mset; -- 2.30.2
Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context
On Wed, Jun 9, 2021 at 9:01 PM Eric Blake wrote: > > When trying to reconstruct a qcow2 chain using information provided > over NBD, ovirt had been relying on an unsafe assumption that any > portion of the qcow2 file advertised as sparse would defer to the > backing image; this worked with what qemu 5.2 reports for a qcow2 BSD > loaded with "backing":null. However, in 6.0, commit 0da9856851 (nbd: > server: Report holes for raw images) also had a side-effect of > reporting unallocated zero clusters in qcow2 files as sparse. This > change is correct from the NBD spec perspective (advertising bits has > always been optional based on how much information the server has > available, and should only be used to optimize behavior when a bit is > set, while not assuming semantics merely because a bit is clear), but > means that a qcow2 file that uses an unallocated zero cluster to > override a backing file now shows up as sparse over NBD, and causes > ovirt to fail to reproduce that cluster (ie. ovirt was assuming it > only had to write clusters where the bit was clear, and the 6.0 > behavior change shows the flaw in that assumption). > > The correct fix is for ovirt to additionally use the > qemu:allocation-depth metadata context added in 5.2: after all, the > actual determination for what is needed to recreate a qcow2 file is > not whether a cluster is sparse, but whether the allocation-depth > shows the cluster to be local. But reproducing an image is more > efficient when handling known-zero clusters, which means that ovirt > has to track both base:allocation and qemu:allocation-depth metadata > contexts simultaneously. While NBD_CMD_BLOCK_STATUS is just fine > sending back information for two contexts in parallel, it comes with > some bookkeeping overhead at the client side: the two contexts need > not report the same length of replies, and it involves more network > traffic. 
Since this change is not simple, and the chance that we also get the dirty bitmap included in the result seems to be very low, I decided to check the direction of merging multiple extents. I started with merging "base:allocation" and "qemu:dirty-bitmap:xxx" since we already have both. It was not hard to do, although it is not completely tested yet. Here is the merging code: https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/ovirt_imageio/_internal/nbdutil.py To make merging easy and safe, we map the NBD_STATE_DIRTY bit to a private bit so it cannot clash with the NBD_STATE_HOLE bit: https://gerrit.ovirt.org/c/ovirt-imageio/+/115215/1/daemon/ovirt_imageio/_internal/nbd.py Here is a functional test using qemu-nbd showing that it works: https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/test/client_test.py I'll try to use "qemu:allocation-depth" in a similar way next week, probably mapping depth > 0 to EXTENT_EXISTS, to use when reporting holes in single qcow2 images. If this is successful, we can start using this in the next ovirt release, and we don't need "qemu:joint-allocation". Nir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Sat, Jun 12, 2021 at 12:23:06AM +0300, Nir Soffer wrote: > > Otherwise, you do have a point: "depth":1 in isolation is ambiguous > > between "not allocated anywhere in this 1-element chain" and > > "allocated at the first backing file in this chain of length 2 or > > more". At which point you can indeed use "qemu-img info" to determine > > the backing chain depth. How painful is that extra step? Does it > > justify the addition of a new optional "backing":true to any portion > > of the file that was beyond the end of the chain (and omit that line > > for all other regions, rather than printing "backing":false)? > > Dealing with depth: N + 1 is not that painful, but also not great. > > I think it is worth a little more effort, and it will save time in the long > term > for users and for developers. Better APIs need simpler and shorter > documentation and require less support. > > I'm not sure about backing: false, maybe absent: true to match libnbd? In the patch [1], I did "backing":true if the cluster was not found in the chain, and omitted the bool altogether when the cluster is present. If we like the name "absent":true better than "backing":true, that's an easy change. The libnbd change for nbdinfo to report 'absent' instead of 'unallocated' has not yet been released, so we have some leeway on naming choices. [1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg03067.html -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 9:34 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 08:35:01PM +0300, Nir Soffer wrote: > > On Fri, Jun 11, 2021 at 4:28 PM Eric Blake wrote: > > > > > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > > > > Yes, that might work as well. But we didn't previously document > > > > > depth to be optional. Removing something from output risks breaking > > > > > more downstream tools that expect it to be non-optional, compared to > > > > > providing a new value. > > > > > > > > A negative value isn't any less unexpected than a missing key. I don't > > > > think any existing tool would be able to handle it. Encoding different > > > > meanings in a single value isn't very QAPI-like either. Usually strings > > > > that are parsed are the problem, but negative integers really isn't that > > > > much different. I don't really like this solution. > > > > > > > > Leaving out the depth feels like a better suggestion to me. > > > > > > > > But anyway, this seems to only happen at the end of the backing chain. > > > > So if the backing chain consistents of n images, why not report 'depth': > > > > n + 1? So, in the above example, you would get 1. I think this has the > > > > best chances of tools actually working correctly with the new output, > > > > even though it's still not unlikely to break something. > > > > > > Ooh, I like that. It is closer to reality - the file data really > > > comes from the next depth, even if we have no filename at that depth. > > > v2 of my patch coming up. > > > > How do you know the number of the layer? this info is not presented in > > qemu-img map output. ... > Otherwise, you do have a point: "depth":1 in isolation is ambiguous > between "not allocated anywhere in this 1-element chain" and > "allocated at the first backing file in this chain of length 2 or > more". At which point you can indeed use "qemu-img info" to determine > the backing chain depth. How painful is that extra step? 
Does it > justify the addition of a new optional "backing":true to any portion > of the file that was beyond the end of the chain (and omit that line > for all other regions, rather than printing "backing":false)? Dealing with depth: N + 1 is not that painful, but also not great. I think it is worth a little more effort, and it will save time in the long term for users and for developers. Better APIs need simpler and shorter documentation and require less support. I'm not sure about backing: false, maybe absent: true to match libnbd? Nir
Re: [PATCH 7/7] vhost-user-blk: Implement reconnection during realize
On Wed, Jun 09, 2021 at 05:46:58PM +0200, Kevin Wolf wrote: > Commit dabefdd6 removed code that was supposed to try reconnecting > during .realize(), but actually just crashed and had several design > problems. > > This adds the feature back without the crash in simple cases while also > fixing some design problems: Reconnection is now only tried if there was > a problem with the connection and not an error related to the content > (which would fail again the same way in the next attempt). Reconnection > is limited to three attempts (four with the initial attempt) so that we > won't end up in an infinite loop if a problem is permanent. If the > backend restarts three times in the very short time window of device > initialisation, we have bigger problems and erroring out is the right > course of action. > > In the case that a connection error occurs and we reconnect, the error > message is printed using error_report_err(), but otherwise ignored. > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > hw/block/vhost-user-blk.c | 14 +- > 1 file changed, 13 insertions(+), 1 deletion(-) > > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index e49d2e4c83..f75a42bc62 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -455,8 +455,10 @@ static int vhost_user_blk_realize_connect(VHostUserBlk > *s, Error **errp) > > static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) > { > +ERRP_GUARD(); > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > VHostUserBlk *s = VHOST_USER_BLK(vdev); > +int retries; > int i, ret; > > if (!s->chardev.chr) { > @@ -498,7 +500,17 @@ static void vhost_user_blk_device_realize(DeviceState > *dev, Error **errp) > s->inflight = g_new0(struct vhost_inflight, 1); > s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues); > > -ret = vhost_user_blk_realize_connect(s, errp); > +retries = 3; > +assert(!*errp); > +do { > +if (*errp) { > +error_prepend(errp, "Reconnecting after error: 
"); > +error_report_err(*errp); > +*errp = NULL; > +} > +ret = vhost_user_blk_realize_connect(s, errp); > +} while (ret == -EPROTO && retries--); > + > if (ret < 0) { > goto virtio_err; > } > -- > 2.30.2 >
Re: [PATCH 6/7] vhost-user-blk: Factor out vhost_user_blk_realize_connect()
On Wed, Jun 09, 2021 at 05:46:57PM +0200, Kevin Wolf wrote: > This function is the part that we will want to retry if the connection > is lost during initialisation, so factor it out to keep the following > patch simpler. > > The error path for vhost_dev_get_config() forgot disconnecting the > chardev, add this while touching the code. > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > hw/block/vhost-user-blk.c | 48 ++- > 1 file changed, 32 insertions(+), 16 deletions(-) > > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index 3770f715da..e49d2e4c83 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -423,6 +423,36 @@ static void vhost_user_blk_event(void *opaque, > QEMUChrEvent event) > } > } > > +static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp) > +{ > +DeviceState *dev = &s->parent_obj.parent_obj; > +int ret; > + > +s->connected = false; > + > +ret = qemu_chr_fe_wait_connected(&s->chardev, errp); > +if (ret < 0) { > +return ret; > +} > + > +ret = vhost_user_blk_connect(dev, errp); > +if (ret < 0) { > +qemu_chr_fe_disconnect(&s->chardev); > +return ret; > +} > +assert(s->connected); > + > +ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg, > + sizeof(struct virtio_blk_config), errp); > +if (ret < 0) { > +qemu_chr_fe_disconnect(&s->chardev); > +vhost_dev_cleanup(&s->dev); > +return ret; > +} > + > +return 0; > +} > + > static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) > { > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > @@ -467,22 +497,10 @@ static void vhost_user_blk_device_realize(DeviceState > *dev, Error **errp) > > s->inflight = g_new0(struct vhost_inflight, 1); > s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues); > -s->connected = false; > - > -if (qemu_chr_fe_wait_connected(&s->chardev, errp) < 0) { > -goto virtio_err; > -} > > -if (vhost_user_blk_connect(dev, errp) < 0) { > -qemu_chr_fe_disconnect(&s->chardev); > -goto virtio_err; 
> -} > -assert(s->connected); > - > -ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg, > - sizeof(struct virtio_blk_config), errp); > +ret = vhost_user_blk_realize_connect(s, errp); > if (ret < 0) { > -goto vhost_err; > +goto virtio_err; > } > > /* we're fully initialized, now we can operate, so add the handler */ > @@ -491,8 +509,6 @@ static void vhost_user_blk_device_realize(DeviceState > *dev, Error **errp) > NULL, true); > return; > > -vhost_err: > -vhost_dev_cleanup(&s->dev); > virtio_err: > g_free(s->vhost_vqs); > s->vhost_vqs = NULL; > -- > 2.30.2 >
Re: [PATCH 5/7] vhost: Distinguish errors in vhost_dev_get_config()
On Wed, Jun 09, 2021 at 05:46:56PM +0200, Kevin Wolf wrote: > Instead of just returning 0/-1 and letting the caller make up a > meaningless error message, add an Error parameter to allow reporting the > real error and switch to 0/-errno so that different kind of errors can > be distinguished in the caller. > > Signed-off-by: Kevin Wolf Just one commmit message suggestion. Reviewed-by: Raphael Norwitz > --- > include/hw/virtio/vhost-backend.h | 2 +- > include/hw/virtio/vhost.h | 4 ++-- > hw/block/vhost-user-blk.c | 9 + > hw/display/vhost-user-gpu.c | 6 -- > hw/input/vhost-user-input.c | 6 -- > hw/net/vhost_net.c| 2 +- > hw/virtio/vhost-user-vsock.c | 9 + > hw/virtio/vhost-user.c| 24 > hw/virtio/vhost-vdpa.c| 2 +- > hw/virtio/vhost.c | 14 +++--- > 10 files changed, 46 insertions(+), 32 deletions(-) > > diff --git a/include/hw/virtio/vhost-backend.h > b/include/hw/virtio/vhost-backend.h > index 728ebb0ed9..8475c5a29d 100644 > --- a/include/hw/virtio/vhost-backend.h > +++ b/include/hw/virtio/vhost-backend.h > @@ -98,7 +98,7 @@ typedef int (*vhost_set_config_op)(struct vhost_dev *dev, > const uint8_t *data, > uint32_t offset, uint32_t size, > uint32_t flags); > typedef int (*vhost_get_config_op)(struct vhost_dev *dev, uint8_t *config, > - uint32_t config_len); > + uint32_t config_len, Error **errp); > > typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev, >void *session_info, > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > index 2d7aaad67b..045d0fd9f2 100644 > --- a/include/hw/virtio/vhost.h > +++ b/include/hw/virtio/vhost.h > @@ -130,8 +130,8 @@ int vhost_net_set_backend(struct vhost_dev *hdev, >struct vhost_vring_file *file); > > int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write); > -int vhost_dev_get_config(struct vhost_dev *dev, uint8_t *config, > - uint32_t config_len); > +int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, > + uint32_t config_len, Error **errp); > int 
vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data, > uint32_t offset, uint32_t size, uint32_t flags); > /* notifier callback in case vhost device config space changed > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index e9382e152a..3770f715da 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -91,11 +91,13 @@ static int vhost_user_blk_handle_config_change(struct > vhost_dev *dev) > int ret; > struct virtio_blk_config blkcfg; > VHostUserBlk *s = VHOST_USER_BLK(dev->vdev); > +Error *local_err = NULL; > > ret = vhost_dev_get_config(dev, (uint8_t *)&blkcfg, > - sizeof(struct virtio_blk_config)); > + sizeof(struct virtio_blk_config), > + &local_err); > if (ret < 0) { > -error_report("get config space failed"); > +error_report_err(local_err); > return -1; > } > > @@ -478,9 +480,8 @@ static void vhost_user_blk_device_realize(DeviceState > *dev, Error **errp) > assert(s->connected); > > ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg, > - sizeof(struct virtio_blk_config)); > + sizeof(struct virtio_blk_config), errp); > if (ret < 0) { > -error_setg(errp, "vhost-user-blk: get block config failed"); > goto vhost_err; > } > > diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c > index 6cdaa1c73b..389199e6ca 100644 > --- a/hw/display/vhost-user-gpu.c > +++ b/hw/display/vhost-user-gpu.c > @@ -415,14 +415,16 @@ vhost_user_gpu_get_config(VirtIODevice *vdev, uint8_t > *config_data) > VirtIOGPUBase *b = VIRTIO_GPU_BASE(vdev); > struct virtio_gpu_config *vgconfig = > (struct virtio_gpu_config *)config_data; > +Error *local_err = NULL; > int ret; > > memset(config_data, 0, sizeof(struct virtio_gpu_config)); > > ret = vhost_dev_get_config(&g->vhost->dev, > - config_data, sizeof(struct > virtio_gpu_config)); > + config_data, sizeof(struct virtio_gpu_config), > + &local_err); > if (ret) { > -error_report("vhost-user-gpu: get device config space failed"); > +error_report_err(local_err); > 
return; > } > > diff --git a/hw/input/vhost-user-input.c b/hw/input/vhost-user-input.c > index 63984a8ba7..273e96a7b1 100644
Re: [PATCH 4/7] vhost-user-blk: Add Error parameter to vhost_user_blk_start()
On Wed, Jun 09, 2021 at 05:46:55PM +0200, Kevin Wolf wrote: > Instead of letting the caller make up a meaningless error message, add > an Error parameter to allow reporting the real error. > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > hw/block/vhost-user-blk.c | 31 +++ > 1 file changed, 15 insertions(+), 16 deletions(-) > > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index 0cb56baefb..e9382e152a 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -113,7 +113,7 @@ const VhostDevConfigOps blk_ops = { > .vhost_dev_config_notifier = vhost_user_blk_handle_config_change, > }; > > -static int vhost_user_blk_start(VirtIODevice *vdev) > +static int vhost_user_blk_start(VirtIODevice *vdev, Error **errp) > { > VHostUserBlk *s = VHOST_USER_BLK(vdev); > BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > @@ -121,19 +121,19 @@ static int vhost_user_blk_start(VirtIODevice *vdev) > int i, ret; > > if (!k->set_guest_notifiers) { > -error_report("binding does not support guest notifiers"); > +error_setg(errp, "binding does not support guest notifiers"); > return -ENOSYS; > } > > ret = vhost_dev_enable_notifiers(&s->dev, vdev); > if (ret < 0) { > -error_report("Error enabling host notifiers: %d", -ret); > +error_setg_errno(errp, -ret, "Error enabling host notifiers"); > return ret; > } > > ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); > if (ret < 0) { > -error_report("Error binding guest notifier: %d", -ret); > +error_setg_errno(errp, -ret, "Error binding guest notifier"); > goto err_host_notifiers; > } > > @@ -141,27 +141,27 @@ static int vhost_user_blk_start(VirtIODevice *vdev) > > ret = vhost_dev_prepare_inflight(&s->dev, vdev); > if (ret < 0) { > -error_report("Error set inflight format: %d", -ret); > +error_setg_errno(errp, -ret, "Error setting inflight format"); > goto err_guest_notifiers; > } > > if (!s->inflight->addr) { > ret = vhost_dev_get_inflight(&s->dev, s->queue_size, 
s->inflight); > if (ret < 0) { > -error_report("Error get inflight: %d", -ret); > +error_setg_errno(errp, -ret, "Error getting inflight"); > goto err_guest_notifiers; > } > } > > ret = vhost_dev_set_inflight(&s->dev, s->inflight); > if (ret < 0) { > -error_report("Error set inflight: %d", -ret); > +error_setg_errno(errp, -ret, "Error setting inflight"); > goto err_guest_notifiers; > } > > ret = vhost_dev_start(&s->dev, vdev); > if (ret < 0) { > -error_report("Error starting vhost: %d", -ret); > +error_setg_errno(errp, -ret, "Error starting vhost"); > goto err_guest_notifiers; > } > s->started_vu = true; > @@ -214,6 +214,7 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, > uint8_t status) > { > VHostUserBlk *s = VHOST_USER_BLK(vdev); > bool should_start = virtio_device_started(vdev, status); > +Error *local_err = NULL; > int ret; > > if (!vdev->vm_running) { > @@ -229,10 +230,9 @@ static void vhost_user_blk_set_status(VirtIODevice > *vdev, uint8_t status) > } > > if (should_start) { > -ret = vhost_user_blk_start(vdev); > +ret = vhost_user_blk_start(vdev, &local_err); > if (ret < 0) { > -error_report("vhost-user-blk: vhost start failed: %s", > - strerror(-ret)); > +error_reportf_err(local_err, "vhost-user-blk: vhost start > failed: "); > qemu_chr_fe_disconnect(&s->chardev); > } > } else { > @@ -270,6 +270,7 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice > *vdev, > static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) > { > VHostUserBlk *s = VHOST_USER_BLK(vdev); > +Error *local_err = NULL; > int i, ret; > > if (!vdev->start_on_kick) { > @@ -287,10 +288,9 @@ static void vhost_user_blk_handle_output(VirtIODevice > *vdev, VirtQueue *vq) > /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start > * vhost here instead of waiting for .set_status(). 
> */ > -ret = vhost_user_blk_start(vdev); > +ret = vhost_user_blk_start(vdev, &local_err); > if (ret < 0) { > -error_report("vhost-user-blk: vhost start failed: %s", > - strerror(-ret)); > +error_reportf_err(local_err, "vhost-user-blk: vhost start failed: "); > qemu_chr_fe_disconnect(&s->chardev); > return; > } > @@ -340,9 +340,8 @@ static int vhost_user_blk_connect(DeviceState *dev, Error > **errp) > > /* restore vhost state */ > if (virtio_device_started(vdev, vdev->status)) { > -ret = vhost_user_b
Re: [PATCH 3/7] vhost: Return 0/-errno in vhost_dev_init()
On Wed, Jun 09, 2021 at 05:46:54PM +0200, Kevin Wolf wrote: > Instead of just returning 0/-1 and letting the caller make up a > meaningless error message, switch to 0/-errno so that different kinds of > errors can be distinguished in the caller. > > This involves changing a few more callbacks in VhostOps to return > 0/-errno: .vhost_set_owner(), .vhost_get_features() and > .vhost_virtqueue_set_busyloop_timeout(). The implementations of these > functions are trivial as they generally just send a message to the > backend. > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > hw/virtio/vhost-backend.c | 4 +++- > hw/virtio/vhost-user.c| 10 +++--- > hw/virtio/vhost-vdpa.c| 4 +++- > hw/virtio/vhost.c | 8 > 4 files changed, 17 insertions(+), 9 deletions(-) > > diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c > index f4f71cf58a..594d770b75 100644 > --- a/hw/virtio/vhost-backend.c > +++ b/hw/virtio/vhost-backend.c > @@ -24,10 +24,12 @@ static int vhost_kernel_call(struct vhost_dev *dev, > unsigned long int request, > void *arg) > { > int fd = (uintptr_t) dev->opaque; > +int ret; > > assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); > > -return ioctl(fd, request, arg); > +ret = ioctl(fd, request, arg); > +return ret < 0 ? 
-errno : ret; > } > > static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error > **errp) > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index 024cb201bb..889559d86a 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -1353,7 +1353,11 @@ static int vhost_user_get_u64(struct vhost_dev *dev, > int request, uint64_t *u64) > > static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features) > { > -return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features); > +if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) { > +return -EPROTO; > +} > + > +return 0; > } > > static int vhost_user_set_owner(struct vhost_dev *dev) > @@ -1364,7 +1368,7 @@ static int vhost_user_set_owner(struct vhost_dev *dev) > }; > > if (vhost_user_write(dev, &msg, NULL, 0) < 0) { > -return -1; > +return -EPROTO; > } > > return 0; > @@ -1872,7 +1876,7 @@ static int vhost_user_backend_init(struct vhost_dev > *dev, void *opaque, > > err = vhost_user_get_features(dev, &features); > if (err < 0) { > -return -EPROTO; > +return err; > } > > if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > index c2aadb57cb..71897c1a01 100644 > --- a/hw/virtio/vhost-vdpa.c > +++ b/hw/virtio/vhost-vdpa.c > @@ -253,10 +253,12 @@ static int vhost_vdpa_call(struct vhost_dev *dev, > unsigned long int request, > { > struct vhost_vdpa *v = dev->opaque; > int fd = v->device_fd; > +int ret; > > assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); > > -return ioctl(fd, request, arg); > +ret = ioctl(fd, request, arg); > +return ret < 0 ? 
-errno : ret; > } > > static void vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status) > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > index fd13135706..c7f9d8bb06 100644 > --- a/hw/virtio/vhost.c > +++ b/hw/virtio/vhost.c > @@ -1309,13 +1309,13 @@ int vhost_dev_init(struct vhost_dev *hdev, void > *opaque, > > r = hdev->vhost_ops->vhost_set_owner(hdev); > if (r < 0) { > -error_setg(errp, "vhost_set_owner failed"); > +error_setg_errno(errp, -r, "vhost_set_owner failed"); > goto fail; > } > > r = hdev->vhost_ops->vhost_get_features(hdev, &features); > if (r < 0) { > -error_setg(errp, "vhost_get_features failed"); > +error_setg_errno(errp, -r, "vhost_get_features failed"); > goto fail; > } > > @@ -1332,7 +1332,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + > i, > busyloop_timeout); > if (r < 0) { > -error_setg(errp, "Failed to set busyloop timeout"); > +error_setg_errno(errp, -r, "Failed to set busyloop timeout"); > goto fail_busyloop; > } > } > @@ -1391,7 +1391,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) > { > error_setg(errp, "vhost backend memory slots limit is less" > " than current number of present memory slots"); > -r = -1; > +r = -EINVAL; > goto fail_busyloop; > } > > -- > 2.30.2 >
Re: [PATCH 2/7] vhost: Distinguish errors in vhost_backend_init()
On Wed, Jun 09, 2021 at 05:46:53PM +0200, Kevin Wolf wrote: > Instead of just returning 0/-1 and letting the caller make up a > meaningless error message, add an Error parameter to allow reporting the > real error and switch to 0/-errno so that different kind of errors can > be distinguished in the caller. > > Specifically, in vhost-user, EPROTO is used for all errors that relate > to the connection itself, whereas other error codes are used for errors > relating to the content of the connection. This will allow us later to > automatically reconnect when the connection goes away, without ending up > in an endless loop if it's a permanent error in the configuration. > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > include/hw/virtio/vhost-backend.h | 3 ++- > hw/virtio/vhost-backend.c | 2 +- > hw/virtio/vhost-user.c| 41 --- > hw/virtio/vhost-vdpa.c| 2 +- > hw/virtio/vhost.c | 13 +- > 5 files changed, 32 insertions(+), 29 deletions(-) > > diff --git a/include/hw/virtio/vhost-backend.h > b/include/hw/virtio/vhost-backend.h > index 8a6f8e2a7a..728ebb0ed9 100644 > --- a/include/hw/virtio/vhost-backend.h > +++ b/include/hw/virtio/vhost-backend.h > @@ -37,7 +37,8 @@ struct vhost_scsi_target; > struct vhost_iotlb_msg; > struct vhost_virtqueue; > > -typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque); > +typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque, > + Error **errp); > typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev); > typedef int (*vhost_backend_memslots_limit)(struct vhost_dev *dev); > > diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c > index 31b33bde37..f4f71cf58a 100644 > --- a/hw/virtio/vhost-backend.c > +++ b/hw/virtio/vhost-backend.c > @@ -30,7 +30,7 @@ static int vhost_kernel_call(struct vhost_dev *dev, > unsigned long int request, > return ioctl(fd, request, arg); > } > > -static int vhost_kernel_init(struct vhost_dev *dev, void *opaque) > +static int 
vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error > **errp) > { > assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); > > diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c > index ee57abe045..024cb201bb 100644 > --- a/hw/virtio/vhost-user.c > +++ b/hw/virtio/vhost-user.c > @@ -1856,7 +1856,8 @@ static int > vhost_user_postcopy_notifier(NotifierWithReturn *notifier, > return 0; > } > > -static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque) > +static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, > + Error **errp) > { > uint64_t features, protocol_features, ram_slots; > struct vhost_user *u; > @@ -1871,7 +1872,7 @@ static int vhost_user_backend_init(struct vhost_dev > *dev, void *opaque) > > err = vhost_user_get_features(dev, &features); > if (err < 0) { > -return err; > +return -EPROTO; > } > > if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { > @@ -1880,7 +1881,7 @@ static int vhost_user_backend_init(struct vhost_dev > *dev, void *opaque) > err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES, > &protocol_features); > if (err < 0) { > -return err; > +return -EPROTO; > } > > dev->protocol_features = > @@ -1891,14 +1892,14 @@ static int vhost_user_backend_init(struct vhost_dev > *dev, void *opaque) > dev->protocol_features &= ~(1ULL << > VHOST_USER_PROTOCOL_F_CONFIG); > } else if (!(protocol_features & > (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))) { > -error_report("Device expects VHOST_USER_PROTOCOL_F_CONFIG " > -"but backend does not support it."); > -return -1; > +error_setg(errp, "Device expects VHOST_USER_PROTOCOL_F_CONFIG " > + "but backend does not support it."); > +return -EINVAL; > } > > err = vhost_user_set_protocol_features(dev, dev->protocol_features); > if (err < 0) { > -return err; > +return -EPROTO; > } > > /* query the max queues we support if backend supports Multiple > Queue */ > @@ -1906,12 +1907,12 @@ static int vhost_user_backend_init(struct 
vhost_dev > *dev, void *opaque) > err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM, > &dev->max_queues); > if (err < 0) { > -return err; > +return -EPROTO; > } > } > if (dev->num_queues && dev->max_queues < dev->num_queues) { > -error_report("The maximum number of queues supported by the " > - "backe
Re: [PATCH 1/7] vhost: Add Error parameter to vhost_dev_init()
On Wed, Jun 09, 2021 at 05:46:52PM +0200, Kevin Wolf wrote: > This allows callers to return better error messages instead of making > one up while the real error ends up on stderr. Most callers can > immediately make use of this because they already have an Error > parameter themselves. The others just keep printing the error with > error_report_err(). > > Signed-off-by: Kevin Wolf Reviewed-by: Raphael Norwitz > --- > include/hw/virtio/vhost.h| 2 +- > backends/cryptodev-vhost.c | 5 - > backends/vhost-user.c| 4 ++-- > hw/block/vhost-user-blk.c| 4 ++-- > hw/net/vhost_net.c | 6 +- > hw/scsi/vhost-scsi.c | 4 +--- > hw/scsi/vhost-user-scsi.c| 4 +--- > hw/virtio/vhost-user-fs.c| 3 +-- > hw/virtio/vhost-user-vsock.c | 3 +-- > hw/virtio/vhost-vsock.c | 3 +-- > hw/virtio/vhost.c| 16 ++-- > 11 files changed, 29 insertions(+), 25 deletions(-) > > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h > index 21a9a52088..2d7aaad67b 100644 > --- a/include/hw/virtio/vhost.h > +++ b/include/hw/virtio/vhost.h > @@ -104,7 +104,7 @@ struct vhost_net { > > int vhost_dev_init(struct vhost_dev *hdev, void *opaque, > VhostBackendType backend_type, > - uint32_t busyloop_timeout); > + uint32_t busyloop_timeout, Error **errp); > void vhost_dev_cleanup(struct vhost_dev *hdev); > int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); > void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev); > diff --git a/backends/cryptodev-vhost.c b/backends/cryptodev-vhost.c > index 8231e7f1bc..bc13e466b4 100644 > --- a/backends/cryptodev-vhost.c > +++ b/backends/cryptodev-vhost.c > @@ -52,6 +52,7 @@ cryptodev_vhost_init( > { > int r; > CryptoDevBackendVhost *crypto; > +Error *local_err = NULL; > > crypto = g_new(CryptoDevBackendVhost, 1); > crypto->dev.max_queues = 1; > @@ -66,8 +67,10 @@ cryptodev_vhost_init( > /* vhost-user needs vq_index to initiate a specific queue pair */ > crypto->dev.vq_index = crypto->cc->queue_index * crypto->dev.nvqs; > > -r = 
vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, > 0); > +r = vhost_dev_init(&crypto->dev, options->opaque, options->backend_type, > 0, > + &local_err); > if (r < 0) { > +error_report_err(local_err); > goto fail; > } > > diff --git a/backends/vhost-user.c b/backends/vhost-user.c > index b366610e16..10b39992d2 100644 > --- a/backends/vhost-user.c > +++ b/backends/vhost-user.c > @@ -48,9 +48,9 @@ vhost_user_backend_dev_init(VhostUserBackend *b, > VirtIODevice *vdev, > b->dev.nvqs = nvqs; > b->dev.vqs = g_new0(struct vhost_virtqueue, nvqs); > > -ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, > 0); > +ret = vhost_dev_init(&b->dev, &b->vhost_user, VHOST_BACKEND_TYPE_USER, 0, > + errp); > if (ret < 0) { > -error_setg_errno(errp, -ret, "vhost initialization failed"); > return -1; > } > > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index c6210fad0c..0cb56baefb 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -332,9 +332,9 @@ static int vhost_user_blk_connect(DeviceState *dev, Error > **errp) > > vhost_dev_set_config_notifier(&s->dev, &blk_ops); > > -ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, > 0); > +ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0, > + errp); > if (ret < 0) { > -error_setg_errno(errp, -ret, "vhost initialization failed"); > return ret; > } > > diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c > index 44c1ed92dc..447b119f85 100644 > --- a/hw/net/vhost_net.c > +++ b/hw/net/vhost_net.c > @@ -22,6 +22,7 @@ > #include "standard-headers/linux/vhost_types.h" > #include "hw/virtio/virtio-net.h" > #include "net/vhost_net.h" > +#include "qapi/error.h" > #include "qemu/error-report.h" > #include "qemu/main-loop.h" > > @@ -157,6 +158,7 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) > bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; > struct vhost_net *net = g_new0(struct 
vhost_net, 1); > uint64_t features = 0; > +Error *local_err = NULL; > > if (!options->net_backend) { > fprintf(stderr, "vhost-net requires net backend to be setup\n"); > @@ -187,8 +189,10 @@ struct vhost_net *vhost_net_init(VhostNetOptions > *options) > } > > r = vhost_dev_init(&net->dev, options->opaque, > - options->backend_type, options->busyloop_timeout); > + options->backend_type, options->busyloop_timeout, > + &local_err); > if (r < 0) { > +error_report_err(local_
[PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
To save the user from having to check 'qemu-img info --backing-chain' or other followup command to determine which "depth":n goes beyond the chain, add a boolean field "backing" that is set only for unallocated portions of the disk. Signed-off-by: Eric Blake --- Touches the same iotest output as 1/1. If we decide that switching to "depth":n+1 is too risky, and that the mere addition of "backing":true while keeping "depth":n is good enough, then we'd have just one patch, instead of this double churn. Preferences? docs/tools/qemu-img.rst| 3 ++ qapi/block-core.json | 7 ++- qemu-img.c | 15 +- tests/qemu-iotests/122.out | 34 +++--- tests/qemu-iotests/154.out | 96 +++--- tests/qemu-iotests/179.out | 66 +- tests/qemu-iotests/223.out | 24 +- tests/qemu-iotests/244.out | 6 +-- tests/qemu-iotests/252.out | 4 +- tests/qemu-iotests/274.out | 16 +++ tests/qemu-iotests/291.out | 8 ++-- tests/qemu-iotests/309.out | 4 +- 12 files changed, 150 insertions(+), 133 deletions(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index c155b1bf3cc8..fbc623b645c3 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -601,6 +601,9 @@ Command description: a ``depth``; for example, a depth of 2 refers to the backing file of the backing file of *FILENAME*. Depth will be one larger than the chain length if no file in the chain provides the data. + - an optional ``backing`` field is present with value true if no +file in the backing chain provides the data (making it easier to +identify when ``depth`` exceeds the chain length). In JSON format, the ``offset`` field is optional; it is absent in cases where ``human`` format would omit the entry or exit with an error. 
diff --git a/qapi/block-core.json b/qapi/block-core.json index 2ea294129e08..cebe12ba16a0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -264,6 +264,9 @@ # @offset: if present, the image file stores the data for this range # in raw format at the given (host) offset # +# @backing: if present, the range is not allocated within the backing +# chain (since 6.1) +# # @filename: filename that is referred to by @offset # # Since: 2.6 @@ -271,8 +274,8 @@ ## { 'struct': 'MapEntry', 'data': {'start': 'int', 'length': 'int', 'data': 'bool', - 'zero': 'bool', 'depth': 'int', '*offset': 'int', - '*filename': 'str' } } + 'zero': 'bool', 'depth': 'int', '*backing': 'bool', + '*offset': 'int', '*filename': 'str' } } ## # @BlockdevCacheInfo: diff --git a/qemu-img.c b/qemu-img.c index 33a5cd012b8b..4d357f534803 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2977,8 +2977,13 @@ static int dump_map_entry(OutputFormat output_format, MapEntry *e, break; case OFORMAT_JSON: printf("{ \"start\": %"PRId64", \"length\": %"PRId64"," - " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s", - e->start, e->length, e->depth, + " \"depth\": %"PRId64, e->start, e->length, e->depth); +if (e->has_backing) { +/* Backing should only be set at the end of the chain */ +assert(e->backing && e->depth > 0); +printf(", \"backing\": true"); +} +printf(", \"zero\": %s, \"data\": %s", e->zero ? "true" : "false", e->data ? 
"true" : "false"); if (e->has_offset) { @@ -2999,6 +3004,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, { int ret; int depth; +bool backing = false; BlockDriverState *file; bool has_offset; int64_t map; @@ -3037,6 +3043,7 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, } if (!(ret & BDRV_BLOCK_ALLOCATED)) { depth++; +backing = true; } *e = (MapEntry) { @@ -3047,6 +3054,8 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, .offset = map, .has_offset = has_offset, .depth = depth, +.has_backing = backing, +.backing = backing, .has_filename = filename, .filename = filename, }; @@ -3072,6 +3081,8 @@ static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next) if (curr->has_offset && curr->offset + curr->length != next->offset) { return false; } +/* backing should only ever be set for identical depth */ +assert(curr->backing == next->backing); return true; } diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out index 779dab4847f0..c5aa2c9866f1 100644 --- a/tests/qemu-iotests/122.out +++ b/tests/qemu-iotests/122.out @@ -68,11 +68,11 @@ read 65536/65536 bytes at offset 4194304 read 65536/65536 bytes at offset 8388608 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) [{ "start": 0, "length": 65536, "depth"
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 08:35:01PM +0300, Nir Soffer wrote: > On Fri, Jun 11, 2021 at 4:28 PM Eric Blake wrote: > > > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > > > Yes, that might work as well. But we didn't previously document > > > > depth to be optional. Removing something from output risks breaking > > > > more downstream tools that expect it to be non-optional, compared to > > > > providing a new value. > > > > > > A negative value isn't any less unexpected than a missing key. I don't > > > think any existing tool would be able to handle it. Encoding different > > > meanings in a single value isn't very QAPI-like either. Usually strings > > > that are parsed are the problem, but negative integers really isn't that > > > much different. I don't really like this solution. > > > > > > Leaving out the depth feels like a better suggestion to me. > > > > > > But anyway, this seems to only happen at the end of the backing chain. > > > So if the backing chain consistents of n images, why not report 'depth': > > > n + 1? So, in the above example, you would get 1. I think this has the > > > best chances of tools actually working correctly with the new output, > > > even though it's still not unlikely to break something. > > > > Ooh, I like that. It is closer to reality - the file data really > > comes from the next depth, even if we have no filename at that depth. > > v2 of my patch coming up. > > How do you know the number of the layer? this info is not presented in > qemu-img map output. qemu-img map has two output formats. In --output=human, areas of the disk reading as zero are elided (and this happens to include ALL areas that were not allocated anywhere in the chain); all other areas list the filename of the element in the chain where the data was found. This mode also fails if compression or encryption prevents easy access to actual data. 
In other words, it's fragile, so no one uses it for anything programmatic, even though it's the default. In --output=json, no file names are output. Instead, "depth":N tells you how deep in the backing chain you must traverse to find the data. "depth":0 is obvious: the file you mapped (other than the bug that this patch is fixing where we mistakenly used "depth":0 also for unallocated regions). If you use "backing":null to force a 1-layer depth, then "depth":1 is unambiguous meaning the (non-present) backing file. Otherwise, you do have a point: "depth":1 in isolation is ambiguous between "not allocated anywhere in this 1-element chain" and "allocated at the first backing file in this chain of length 2 or more". At which point you can indeed use "qemu-img info" to determine the backing chain depth. How painful is that extra step? Does it justify the addition of a new optional "backing":true to any portion of the file that was beyond the end of the chain (and omit that line for all other regions, rather than printing "backing":false)? > > Users will have to run "qemu-img info --backing-chain" to understand the > output of qemu-img map. At any rate, it should be easy enough to output an additional field, followup patch coming soon... -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map
On Fri, Jun 11, 2021 at 5:59 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 05:35:12PM +0300, Vladimir Sementsov-Ogievskiy wrote: > > > An obvious solution is to make 'qemu-img map --output=json' > > > distinguish between clusters that have a local allocation from those > > > that are found nowhere in the chain. We already have a one-off > > > mismatch between qemu-img map and NBD qemu:allocation-depth (the > > > former chose 0, and the latter 1 for the local layer), so exposing the > > > latter's choice of 0 for unallocated in the entire chain would mean > > > using using "depth":-1 in the former, but a negative depth may confuse > > > existing tools. But there is an easy out: for any chain of length N, > > > we can simply represent an unallocated cluster as "depth":N+1. This > > > does have a slight risk of confusing any tool that might try to > > > dereference NULL when finding the backing image for the last file in > > > the backing chain, but that risk sseems worth the more precise output. > > > The iotests have several examples where this distinction demonstrates > > > the additional accuracy. > > > > > > Signed-off-by: Eric Blake > > > --- > > > > > > Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com > > > (qemu-img: Use "depth":-1 to make backing probes obvious) > > > > > > Use N+1 instead of -1 for unallocated [Kevin] > > > > > > > Bit in contrast with -1, or with separate boolean flag, you lose the > > possibility to distinguish case when we have 3 layers and the cluster is > > absent in all of them, and the case when we have 4 layers and the cluster > > is absent in top 3 but in 4 it is qcow2 UNALLOCATED_ZERO cluster. > > Using just 'qemu-img map --output-json', you only see depth numbers. 
> You also have to use 'qemu-img info --backing-chain' to see what file > those depth numbers correspond to, at which point it becomes obvious > whether "depth":4 meant unallocated (because the chain was length 3) > or allocated at depth 4 (because the chain was length 4 or longer). > But that's no worse than pre-patch, where you had to use qemu-img info > --backing-chain to learn which file a particular "depth" maps to. > > > > > So, if someone use this API to reconstruct the chain, then for original 3 > > empty layers he will create 3 empty layers and 4rd additional ZERO layer. > > And such reconstructed chain would not be equal to original chain (as if we > > take these two chains and add additional backing file as a new bottom > > layer, effect would be different).. I'm not sure is it a problem in the > > task you are solving :\ > > It should be fairly easy to optimize the case of a backing chain where > EVERY listed cluster at the final depth was "data":false,"zero":true > to omit that file after all. > > And in oVirt's case, Nir pointed out that we have one more tool at our > disposal in recreating a backing chain: if you use > json:{"driver":"qcow2", "backing":null, ...} as your image file, you > don't have to worry about arbitrary files in the backing chain, only > about recreating the top-most layer of a chain. And in that case, it > becomes very obvious that "depth":0 is something you must recreate, > and "depth":1 would be a non-existent backing file because you just > passed "backing":null. Note that oVirt does not use qemu-img map, we use qemu-nbd to get image extents, since it is used only in context we already connect to qemu-nbd server or run qemu-nbd. Management tools already know the image format (they should avoid doing format probing anyway), and using a json uri allows single command to get the needed info when you inspect a single layer. 
But this change introduces a risk that some program using qemu-img map will interpret the result in the wrong way, assuming that there are N+1 layers. I think adding a new flag for absent extents is better. It cannot break any user and it is easier to understand and use. Nir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 4:28 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > > Yes, that might work as well. But we didn't previously document > > > depth to be optional. Removing something from output risks breaking > > > more downstream tools that expect it to be non-optional, compared to > > > providing a new value. > > > > A negative value isn't any less unexpected than a missing key. I don't > > think any existing tool would be able to handle it. Encoding different > > meanings in a single value isn't very QAPI-like either. Usually strings > > that are parsed are the problem, but negative integers really isn't that > > much different. I don't really like this solution. > > > > Leaving out the depth feels like a better suggestion to me. > > > > But anyway, this seems to only happen at the end of the backing chain. > > So if the backing chain consistents of n images, why not report 'depth': > > n + 1? So, in the above example, you would get 1. I think this has the > > best chances of tools actually working correctly with the new output, > > even though it's still not unlikely to break something. > > Ooh, I like that. It is closer to reality - the file data really > comes from the next depth, even if we have no filename at that depth. > v2 of my patch coming up. How do you know the number of the layer? this info is not presented in qemu-img map output. Users will have to run "qemu-img info --backing-chain" to understand the output of qemu-img map.
Re: [PATCH v4 00/32] block/nbd: rework client connection
11.06.2021 18:55, Eric Blake wrote: On Thu, Jun 10, 2021 at 01:07:30PM +0300, Vladimir Sementsov-Ogievskiy wrote: v4: Now based on new Paolo's patch: Based-on: <20210609122234.544153-1-pbonz...@redhat.com> Also, I've dropped patch 33 for now, it's too much for this series. I'll resend it later on top of this. The series is also available at tag up-nbd-client-connection-v4 in git https://src.openvz.org/scm/~vsementsov/qemu.git I think everything has R-b now, so I'll queue this through my NBD tree including folding in the grammar tweaks where I spotted them. Thanks a lot! -- Best regards, Vladimir
Re: [PATCH 3/7] net/rocker: use GDateTime for formatting timestamp in debug messages
ping: anyone willing to give a review of this one On Wed, May 05, 2021 at 11:36:58AM +0100, Daniel P. Berrangé wrote: > The GDateTime APIs provided by GLib avoid portability pitfalls, such > as some platforms where 'struct timeval.tv_sec' field is still 'long' > instead of 'time_t'. When combined with automatic cleanup, GDateTime > often results in simpler code too. > > Signed-off-by: Daniel P. Berrangé > --- > hw/net/rocker/rocker.h | 11 +++ > 1 file changed, 3 insertions(+), 8 deletions(-) > > diff --git a/hw/net/rocker/rocker.h b/hw/net/rocker/rocker.h > index 941c932265..412fa44d01 100644 > --- a/hw/net/rocker/rocker.h > +++ b/hw/net/rocker/rocker.h > @@ -25,14 +25,9 @@ > #if defined(DEBUG_ROCKER) > # define DPRINTF(fmt, ...) \ > do { \ > -struct timeval tv; \ > -char timestr[64]; \ > -time_t now;\ > -gettimeofday(&tv, NULL); \ > -now = tv.tv_sec; \ > -strftime(timestr, sizeof(timestr), "%T", localtime(&now)); \ > -fprintf(stderr, "%s.%06ld ", timestr, tv.tv_usec); \ > -fprintf(stderr, "ROCKER: " fmt, ## __VA_ARGS__); \ > +g_autoptr(GDateTime) now = g_date_time_new_now_local();\ > +g_autofree char *nowstr = g_date_time_format(now, "%T.%f");\ > +fprintf(stderr, "%s ROCKER: " fmt, nowstr, ## __VA_ARGS__);\ > } while (0) > #else > static inline GCC_FMT_ATTR(1, 2) int DPRINTF(const char *fmt, ...) > -- > 2.31.1 > Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH 5/7] io: use GDateTime for formatting timestamp for websock headers
ping: anyone willing to review this On Wed, May 05, 2021 at 11:37:00AM +0100, Daniel P. Berrangé wrote: > The GDateTime APIs provided by GLib avoid portability pitfalls, such > as some platforms where 'struct timeval.tv_sec' field is still 'long' > instead of 'time_t'. When combined with automatic cleanup, GDateTime > often results in simpler code too. > > Signed-off-by: Daniel P. Berrangé > --- > io/channel-websock.c | 10 ++ > 1 file changed, 2 insertions(+), 8 deletions(-) > > diff --git a/io/channel-websock.c b/io/channel-websock.c > index 03c1f7cb62..70889bb54d 100644 > --- a/io/channel-websock.c > +++ b/io/channel-websock.c > @@ -177,15 +177,9 @@ qio_channel_websock_handshake_send_res(QIOChannelWebsock > *ioc, > > static gchar *qio_channel_websock_date_str(void) > { > -struct tm tm; > -time_t now = time(NULL); > -char datebuf[128]; > +g_autoptr(GDateTime) now = g_date_time_new_now_utc(); > > -gmtime_r(&now, &tm); > - > -strftime(datebuf, sizeof(datebuf), "%a, %d %b %Y %H:%M:%S GMT", &tm); > - > -return g_strdup(datebuf); > +return g_date_time_format(now, "%a, %d %b %Y %H:%M:%S GMT"); > } > > static void qio_channel_websock_handshake_send_res_err(QIOChannelWebsock > *ioc, > -- > 2.31.1 > Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL
On Fri, Jun 11, 2021 at 01:14:27PM +0200, Paolo Bonzini wrote: > > > With this change, qemu_mutex_iothread_locked() must be changed from > > > > Did you miss "qemu_mutex_iothread_locked() stub function"? > > Yes, that comment refers to the stub. Let me resubmit with a testcase and > I'll fix that too. Vladimir's v4 series reworking nbd connection depends on this one. I'll wait for your v3 to land, but once it does, I'm happy to queue it through my NBD tree along with Vladimir's work, if that makes it easier. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 00/32] block/nbd: rework client connection
On Thu, Jun 10, 2021 at 01:07:30PM +0300, Vladimir Sementsov-Ogievskiy wrote: > v4: > > Now based on new Paolo's patch: > Based-on: <20210609122234.544153-1-pbonz...@redhat.com> > > Also, I've dropped patch 33 for now, it's too much for this series. > I'll resend it later on top of this. > > The series is also available at tag up-nbd-client-connection-v4 in > git https://src.openvz.org/scm/~vsementsov/qemu.git I think everything has R-b now, so I'll queue this through my NBD tree including folding in the grammar tweaks where I spotted them. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 28/32] block/nbd: split nbd_co_do_establish_connection out of nbd_reconnect_attempt
On Thu, Jun 10, 2021 at 01:07:58PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Split out the part that we want to reuse for nbd_open(). > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 80 - > 1 file changed, 42 insertions(+), 38 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 21/32] nbd/client-connection: shutdown connection on release
On Thu, Jun 10, 2021 at 01:07:51PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Now, when a thread can do negotiation and retry, it may run relatively > long. We need a mechanism to stop it, when the user is not interested > in a result any more. So, on nbd_client_connection_release() let's > shutdown the socket, and do not retry connection if thread is detached. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > nbd/client-connection.c | 20 +--- > 1 file changed, 17 insertions(+), 3 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 20/32] nbd/client-connection: implement connection retry
On Thu, Jun 10, 2021 at 01:07:50PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Add an option for a thread to retry connection until succeeds. We'll for a thread to retry connecting until it succeeds. > use nbd/client-connection both for reconnect and for initial connection > in nbd_open(), so we need a possibility to use same NBDClientConnection > instance to connect once in nbd_open() and then use retry semantics for > reconnect. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > include/block/nbd.h | 2 ++ > nbd/client-connection.c | 56 +++-- > 2 files changed, 45 insertions(+), 13 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 19/32] nbd/client-connection: add possibility of negotiation
On Thu, Jun 10, 2021 at 01:07:49PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Add arguments and logic to support nbd negotiation in the same thread > after successful connection. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > include/block/nbd.h | 9 +++- > block/nbd.c | 4 +- > nbd/client-connection.c | 105 ++-- > 3 files changed, 109 insertions(+), 9 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map
On Fri, Jun 11, 2021 at 05:35:12PM +0300, Vladimir Sementsov-Ogievskiy wrote: > > An obvious solution is to make 'qemu-img map --output=json' > > distinguish between clusters that have a local allocation from those > > that are found nowhere in the chain. We already have a one-off > > mismatch between qemu-img map and NBD qemu:allocation-depth (the > > former chose 0, and the latter 1 for the local layer), so exposing the > > latter's choice of 0 for unallocated in the entire chain would mean > > using using "depth":-1 in the former, but a negative depth may confuse > > existing tools. But there is an easy out: for any chain of length N, > > we can simply represent an unallocated cluster as "depth":N+1. This > > does have a slight risk of confusing any tool that might try to > > dereference NULL when finding the backing image for the last file in > > the backing chain, but that risk sseems worth the more precise output. > > The iotests have several examples where this distinction demonstrates > > the additional accuracy. > > > > Signed-off-by: Eric Blake > > --- > > > > Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com > > (qemu-img: Use "depth":-1 to make backing probes obvious) > > > > Use N+1 instead of -1 for unallocated [Kevin] > > > > Bit in contrast with -1, or with separate boolean flag, you lose the > possibility to distinguish case when we have 3 layers and the cluster is > absent in all of them, and the case when we have 4 layers and the cluster is > absent in top 3 but in 4 it is qcow2 UNALLOCATED_ZERO cluster. Using just 'qemu-img map --output-json', you only see depth numbers. You also have to use 'qemu-img info --backing-chain' to see what file those depth numbers correspond to, at which point it becomes obvious whether "depth":4 meant unallocated (because the chain was length 3) or allocated at depth 4 (because the chain was length 4 or longer). 
But that's no worse than pre-patch, where you had to use qemu-img info --backing-chain to learn which file a particular "depth" maps to. > > So, if someone use this API to reconstruct the chain, then for original 3 > empty layers he will create 3 empty layers and 4rd additional ZERO layer. And > such reconstructed chain would not be equal to original chain (as if we take > these two chains and add additional backing file as a new bottom layer, > effect would be different).. I'm not sure is it a problem in the task you are > solving :\ It should be fairly easy to optimize the case of a backing chain where EVERY listed cluster at the final depth was "data":false,"zero":true to omit that file after all. And in oVirt's case, Nir pointed out that we have one more tool at our disposal in recreating a backing chain: if you use json:{"driver":"qcow2", "backing":null, ...} as your image file, you don't have to worry about arbitrary files in the backing chain, only about recreating the top-most layer of a chain. And in that case, it becomes very obvious that "depth":0 is something you must recreate, and "depth":1 would be a non-existent backing file because you just passed "backing":null. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map
11.06.2021 17:01, Eric Blake wrote: The recently-added NBD context qemu:allocation-depth is able to distinguish between locally-present data (even with that data is sparse) [shown as depth 1 over NBD], and data that could not be found anywhere in the backing chain [shown as depth 0]. But qemu-img map --output=json predates that addition, and prior to this patch has the unfortunate behavior that all portions of the backing chain that resolve without finding a hit in any backing layer report the same depth as the final backing layer. This makes it harder to reconstruct a qcow2 backing chain using just 'qemu-img map' output, especially when using "backing":null to artificially limit a backing chain, because it is impossible to distinguish between a QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file) and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any backing file), since both types of clusters otherwise show as "data":false,"zero":true" (but note that we can distinguish a QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset": listing). The task of reconstructing a qcow2 chain was made harder in commit 0da9856851 (nbd: server: Report holes for raw images), because prior to that point, it was possible to abuse NBD's block status command to see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED (showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain (showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports more accurate sparseness information over NBD. An obvious solution is to make 'qemu-img map --output=json' distinguish between clusters that have a local allocation from those that are found nowhere in the chain. 
We already have a one-off mismatch between qemu-img map and NBD qemu:allocation-depth (the former chose 0, and the latter 1 for the local layer), so exposing the latter's choice of 0 for unallocated in the entire chain would mean using "depth":-1 in the former, but a negative depth may confuse existing tools. But there is an easy out: for any chain of length N, we can simply represent an unallocated cluster as "depth":N+1. This does have a slight risk of confusing any tool that might try to dereference NULL when finding the backing image for the last file in the backing chain, but that risk seems worth the more precise output. The iotests have several examples where this distinction demonstrates the additional accuracy. Signed-off-by: Eric Blake --- Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com (qemu-img: Use "depth":-1 to make backing probes obvious) Use N+1 instead of -1 for unallocated [Kevin] Bit in contrast with -1, or with separate boolean flag, you lose the possibility to distinguish case when we have 3 layers and the cluster is absent in all of them, and the case when we have 4 layers and the cluster is absent in top 3 but in 4 it is qcow2 UNALLOCATED_ZERO cluster. So, if someone use this API to reconstruct the chain, then for original 3 empty layers he will create 3 empty layers and a 4th additional ZERO layer. And such reconstructed chain would not be equal to original chain (as if we take these two chains and add additional backing file as a new bottom layer, effect would be different).. I'm not sure is it a problem in the task you are solving :\ -- Best regards, Vladimir
Re: [PATCH v4 18/32] nbd/client-connection: use QEMU_LOCK_GUARD
On Thu, Jun 10, 2021 at 01:07:48PM +0300, Vladimir Sementsov-Ogievskiy wrote: > We don't update connect_thread_func() to use QEMU_LOCK_GUARD, as it > will get more complex critical sections logic in further commit, where > QEMU_LOCK_GUARD doesn't help. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > nbd/client-connection.c | 99 +++-- > 1 file changed, 45 insertions(+), 54 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 16/32] block/nbd: introduce nbd_client_connection_release()
On Thu, Jun 10, 2021 at 01:07:46PM +0300, Vladimir Sementsov-Ogievskiy wrote: > This is a last step of creating bs-independent nbd connection > interface. With next commit we can finally move it to separate file. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 45 +++-- > 1 file changed, 27 insertions(+), 18 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 11/32] block/nbd: drop thr->state
On Thu, Jun 10, 2021 at 01:07:41PM +0300, Vladimir Sementsov-Ogievskiy wrote: > We don't need all these states. The code refactored to use two boolean > variables looks simpler. > > While moving the comment in nbd_co_establish_connection() rework it to > give better information. Also, we are going to move the connection code > to separate file and mentioning drained section would be confusing. > > Improve also the comment in NBDConnectThread, while dropping removed > state names from it. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 139 +--- > 1 file changed, 44 insertions(+), 95 deletions(-) > > typedef struct NBDConnectThread { > /* Initialization constants */ > SocketAddress *saddr; /* address to connect to */ > > +QemuMutex mutex; > + > /* > - * Result of last attempt. Valid in FAIL and SUCCESS states. > - * If you want to steal error, don't forget to set pointer to NULL. > + * @sioc and @err present a result of connection attempt. > + * While running is true, they are used only by thread, mutex locking is > not > + * needed. When thread is finished nbd_co_establish_connection steal > these > + * pointers under mutex. @sioc and @err represent a connection attempt. While running is true, they are only used by the connection thread, and mutex locking is not needed. Once the thread finishes, nbd_co_establish_connection then steals these pointers under mutex. > */ > QIOChannelSocket *sioc; > Error *err; > > -QemuMutex mutex; > -/* All further fields are protected by mutex */ > -NBDConnectThreadState state; /* current state of the thread */ > +/* All further fields are accessed only under mutex */ > +bool running; /* thread is running now */ > +bool detached; /* thread is detached and should cleanup the state */ > Okay, I'm understanding this better than I did in v3. Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 05/32] qemu-sockets: introduce socket_address_parse_named_fd()
11.06.2021 16:22, Eric Blake wrote: On Thu, Jun 10, 2021 at 01:07:35PM +0300, Vladimir Sementsov-Ogievskiy wrote: Add function that transforms named fd inside SocketAddress structure into number representation. This way it may be then used in a context where current monitor is not available. Signed-off-by: Vladimir Sementsov-Ogievskiy --- include/qemu/sockets.h | 14 ++ util/qemu-sockets.c| 19 +++ 2 files changed, 33 insertions(+) diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h index 7d1f813576..1f4f18a44a 100644 --- a/include/qemu/sockets.h +++ b/include/qemu/sockets.h @@ -111,4 +111,18 @@ SocketAddress *socket_remote_address(int fd, Error **errp); */ SocketAddress *socket_address_flatten(SocketAddressLegacy *addr); +/** + * socket_address_parse_named_fd: + * + * Modify @addr, replacing named fd by corresponding number. + * + * Parsing named fd (by sockget_get_fd) is not possible in context where + * current monitor is not available. So, SocketAddress user may first call + * socket_parse_named_fd() to parse named fd in advance, and then pass @addr to + * the context where monitor is not available. 2 different wrong function names, and reads awkwardly. How about this shorter variant: Modify @addr, replacing a named fd by its corresponding number. Needed for callers that plan to pass @addr to a context where the current monitor is not available. A lot better, thanks! + * + * Return 0 on success. + */ +int socket_address_parse_named_fd(SocketAddress *addr, Error **errp); + #endif /* QEMU_SOCKETS_H */ diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c But the code looks good. Somehow, C is simpler for me than English ) Reviewed-by: Eric Blake -- Best regards, Vladimir
Re: [PATCH v4 10/32] block/nbd: simplify waking of nbd_co_establish_connection()
On Thu, Jun 10, 2021 at 01:07:40PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Instead of managing connect_bh, bh_ctx, and wait_connect fields, we > can use a single link to the waiting coroutine with proper mutex > protection. > ... > Also, this commit reduces the dependence of > nbd_co_establish_connection() on the internals of bs (we now use a > generic pointer to the coroutine, instead of direct use of > s->connection_co). This is a step towards splitting the connection > API out of nbd.c. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 55 +++-- > 1 file changed, 15 insertions(+), 40 deletions(-) > Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
[PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map
The recently-added NBD context qemu:allocation-depth is able to distinguish between locally-present data (even when that data is sparse) [shown as depth 1 over NBD], and data that could not be found anywhere in the backing chain [shown as depth 0]. But qemu-img map --output=json predates that addition, and prior to this patch has the unfortunate behavior that all portions of the backing chain that resolve without finding a hit in any backing layer report the same depth as the final backing layer. This makes it harder to reconstruct a qcow2 backing chain using just 'qemu-img map' output, especially when using "backing":null to artificially limit a backing chain, because it is impossible to distinguish between a QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file) and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any backing file), since both types of clusters otherwise show as "data":false,"zero":true" (but note that we can distinguish a QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset": listing). The task of reconstructing a qcow2 chain was made harder in commit 0da9856851 (nbd: server: Report holes for raw images), because prior to that point, it was possible to abuse NBD's block status command to see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED (showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain (showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports more accurate sparseness information over NBD. An obvious solution is to make 'qemu-img map --output=json' distinguish between clusters that have a local allocation from those that are found nowhere in the chain. We already have a one-off mismatch between qemu-img map and NBD qemu:allocation-depth (the former chose 0, and the latter 1 for the local layer), so exposing the latter's choice of 0 for unallocated in the entire chain would mean using "depth":-1 in the former, but a negative depth may confuse existing tools. 
But there is an easy out: for any chain of length N, we can simply represent an unallocated cluster as "depth":N+1. This does have a slight risk of confusing any tool that might try to dereference NULL when finding the backing image for the last file in the backing chain, but that risk seems worth the more precise output. The iotests have several examples where this distinction demonstrates the additional accuracy. Signed-off-by: Eric Blake --- Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com (qemu-img: Use "depth":-1 to make backing probes obvious) Use N+1 instead of -1 for unallocated [Kevin] docs/tools/qemu-img.rst| 3 +- qemu-img.c | 3 ++ tests/qemu-iotests/122.out | 34 ++--- tests/qemu-iotests/154.out | 100 +++-- tests/qemu-iotests/179.out | 67 + tests/qemu-iotests/223.out | 24 - tests/qemu-iotests/244.out | 7 +-- tests/qemu-iotests/252.out | 4 +- tests/qemu-iotests/274.out | 16 +++--- tests/qemu-iotests/291.out | 8 +-- tests/qemu-iotests/309.out | 4 +- 11 files changed, 162 insertions(+), 108 deletions(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index cfe11478791f..c155b1bf3cc8 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -599,7 +599,8 @@ Command description: - whether the data is known to read as zero (boolean field ``zero``); - in order to make the output shorter, the target file is expressed as a ``depth``; for example, a depth of 2 refers to the backing file -of the backing file of *FILENAME*. +of the backing file of *FILENAME*. Depth will be one larger than +the chain length if no file in the chain provides the data. In JSON format, the ``offset`` field is optional; it is absent in cases where ``human`` format would omit the entry or exit with an error. 
diff --git a/qemu-img.c b/qemu-img.c index a5993682aad4..33a5cd012b8b 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3035,6 +3035,9 @@ static int get_block_status(BlockDriverState *bs, int64_t offset, bdrv_refresh_filename(file); filename = file->filename; } +if (!(ret & BDRV_BLOCK_ALLOCATED)) { +depth++; +} *e = (MapEntry) { .start = offset, diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out index 3a3e121d579d..779dab4847f0 100644 --- a/tests/qemu-iotests/122.out +++ b/tests/qemu-iotests/122.out @@ -68,11 +68,11 @@ read 65536/65536 bytes at offset 4194304 read 65536/65536 bytes at offset 8388608 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) [{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, -{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 65536, "length": 4128768, "depth": 1, "zero": true, "data": false}, { "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true}, -{ "start": 4259840, "length": 4128768, "dep
Re: [PATCH v4 06/32] block/nbd: call socket_address_parse_named_fd() in advance
On Thu, Jun 10, 2021 at 01:07:36PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Detecting monitor by current coroutine works bad when we are not in > coroutine context. And that's exactly so in nbd reconnect code, where > qio_channel_socket_connect_sync() is called from thread. > > Monitor is needed only to parse named file descriptor. So, let's just > parse it during nbd_open(), so that all further users of s->saddr don't > need to access monitor. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > block/nbd.c | 6 ++ > 1 file changed, 6 insertions(+) Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 01:21:45PM +0200, Kevin Wolf wrote: > > Did you consider just add a new field? > > > > So, "depth" keeps its meaning "which level provides data". > > > > And we add additional optional field like > > > > absolutely-completely-absent: bool > > > > Which is true if data is nowhere in the backing chain. > > Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which > I think is what the conclusion was already for NBD, so doing the same in > 'qemu-img map' would be consistent. > > This is, of course, almost the same as 'absolutely-completely-absent', > just without the negating the flag. If we want to bikeshed on a new name, I think "allocated" is going to cause more confusion than it solves. And "hole" is wrong. Better would be "backing":true for portions of the file that would derive from a backing file, if a backing file had been present. But that still feels like more work than just exposing n+1 in depth. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > Yes, that might work as well. But we didn't previously document > > depth to be optional. Removing something from output risks breaking > > more downstream tools that expect it to be non-optional, compared to > > providing a new value. > > A negative value isn't any less unexpected than a missing key. I don't > think any existing tool would be able to handle it. Encoding different > meanings in a single value isn't very QAPI-like either. Usually strings > that are parsed are the problem, but negative integers really isn't that > much different. I don't really like this solution. > > Leaving out the depth feels like a better suggestion to me. > > But anyway, this seems to only happen at the end of the backing chain. > So if the backing chain consistents of n images, why not report 'depth': > n + 1? So, in the above example, you would get 1. I think this has the > best chances of tools actually working correctly with the new output, > even though it's still not unlikely to break something. Ooh, I like that. It is closer to reality - the file data really comes from the next depth, even if we have no filename at that depth. v2 of my patch coming up. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 05/32] qemu-sockets: introduce socket_address_parse_named_fd()
On Thu, Jun 10, 2021 at 01:07:35PM +0300, Vladimir Sementsov-Ogievskiy wrote: > Add function that transforms named fd inside SocketAddress structure > into number representation. This way it may be then used in a context > where current monitor is not available. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > include/qemu/sockets.h | 14 ++ > util/qemu-sockets.c| 19 +++ > 2 files changed, 33 insertions(+) > > diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h > index 7d1f813576..1f4f18a44a 100644 > --- a/include/qemu/sockets.h > +++ b/include/qemu/sockets.h > @@ -111,4 +111,18 @@ SocketAddress *socket_remote_address(int fd, Error > **errp); > */ > SocketAddress *socket_address_flatten(SocketAddressLegacy *addr); > > +/** > + * socket_address_parse_named_fd: > + * > + * Modify @addr, replacing named fd by corresponding number. > + * > + * Parsing named fd (by sockget_get_fd) is not possible in context where > + * current monitor is not available. So, SocketAddress user may first call > + * socket_parse_named_fd() to parse named fd in advance, and then pass @addr > to > + * the context where monitor is not available. 2 different wrong function names, and reads awkwardly. How about this shorter variant: Modify @addr, replacing a named fd by its corresponding number. Needed for callers that plan to pass @addr to a context where the current monitor is not available. > + * > + * Return 0 on success. > + */ > +int socket_address_parse_named_fd(SocketAddress *addr, Error **errp); > + > #endif /* QEMU_SOCKETS_H */ > diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c But the code looks good. Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
11.06.2021 14:21, Kevin Wolf wrote: Am 11.06.2021 um 10:14 hat Vladimir Sementsov-Ogievskiy geschrieben: 11.06.2021 11:09, Kevin Wolf wrote: Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: But: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \ "file":{"driver":"file","filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] also reports the entire file at "depth":0, which is misleading, since we have just been arguing from the qemu:allocation-depth perspective (and also from bdrv_block_status) that the qcow2 image is NOT 100% allocated (in the sense where allocation == data comes locally). Perhaps it might be better if we tweaked the above qemu-img map to produce: [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] It will be more consistent with "offset" to drop "depth" from output if we don't have it: [{ "start": 0, "length": 65536, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "zero": true, "data": false}] Yes, that might work as well. But we didn't previously document depth to be optional. Removing something from output risks breaking more downstream tools that expect it to be non-optional, compared to providing a new value. A negative value isn't any less unexpected than a missing key. 
I don't think any existing tool would be able to handle it. Encoding different meanings in a single value isn't very QAPI-like either. Usually strings that are parsed are the problem, but negative integers really isn't that much different. I don't really like this solution. Leaving out the depth feels like a better suggestion to me. But anyway, this seems to only happen at the end of the backing chain. So if the backing chain consistents of n images, why not report 'depth': n + 1? So, in the above example, you would get 1. I think this has the best chances of tools actually working correctly with the new output, even though it's still not unlikely to break something. Did you consider just add a new field? So, "depth" keeps its meaning "which level provides data". And we add additional optional field like absolutely-completely-absent: bool Which is true if data is nowhere in the backing chain. Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which I think is what the conclusion was already for NBD, so doing the same in 'qemu-img map' would be consistent. "allocated" is historically ambiguous: we never know exactly does it mean "occupy space on disk" or "data (or zeroes) taken from this qcow2 image, not from backing". Eric recently sent related patch to libnbd: [libnbd PATCH] info: Avoid ambiguous 'allocated' terminology in mapping This is, of course, almost the same as 'absolutely-completely-absent', just without the negating the flag. Kevin -- Best regards, Vladimir
[PATCH] block/nvme: Fix VFIO_MAP_DMA failed: No space left on device
When the NVMe block driver was introduced (see commit bdd6a90a9e5, January 2018), Linux VFIO_IOMMU_MAP_DMA ioctl was only returning -ENOMEM in case of error. The driver was correctly handling the error path to recycle its volatile IOVA mappings. To fix CVE-2019-3882, Linux commit 492855939bdb ("vfio/type1: Limit DMA mappings per container", April 2019) added the -ENOSPC error to signal the user exhausted the DMA mappings available for a container. The block driver started to mis-behave: qemu-system-x86_64: VFIO_MAP_DMA failed: No space left on device (qemu) (qemu) info status VM status: paused (io-error) (qemu) c VFIO_MAP_DMA failed: No space left on device qemu-system-x86_64: block/block-backend.c:1968: blk_get_aio_context: Assertion `ctx == blk->ctx' failed. Fix by handling the -ENOSPC error when DMA mappings are exhausted; other errors (such -ENOMEM) are still handled later in the same function. An easy way to reproduce this bug is to restrict the DMA mapping limit (65535 by default) when loading the VFIO IOMMU module: # modprobe vfio_iommu_type1 dma_entry_limit=666 Cc: qemu-sta...@nongnu.org Reported-by: Michal Prívozník Fixes: bdd6a90a9e5 ("block: Add VFIO based NVMe driver") Buglink: https://bugs.launchpad.net/qemu/+bug/186 Resolves: https://gitlab.com/qemu-project/qemu/-/issues/65 Signed-off-by: Philippe Mathieu-Daudé --- Michal, is it still possible for you to test this (old bug)? A functional test using viommu & nested VM is planned (suggested by Stefan and Maxim). --- block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/nvme.c b/block/nvme.c index 2b5421e7aa6..12f9dd5cce3 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1030,7 +1030,7 @@ try_map: r = qemu_vfio_dma_map(s->vfio, qiov->iov[i].iov_base, len, true, &iova); -if (r == -ENOMEM && retry) { +if (r == -ENOSPC && retry) { retry = false; trace_nvme_dma_flush_queue_wait(s); if (s->dma_map_count) { -- 2.31.1
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
Am 11.06.2021 um 10:14 hat Vladimir Sementsov-Ogievskiy geschrieben: > 11.06.2021 11:09, Kevin Wolf wrote: > > Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: > > > On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: > > > > > But: > > > > > > > > > > $ qemu-img map --output=json -f qcow2 > > > > > json:'{"driver":"qcow2","backing":null, \ > > > > >"file":{"driver":"file","filename":"top.qcow2"}}' > > > > > [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": > > > > > false}, > > > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": > > > > > true, "offset": 327680}, > > > > > { "start": 131072, "length": 131072, "depth": 0, "zero": true, > > > > > "data": false}] > > > > > > > > > > also reports the entire file at "depth":0, which is misleading, since > > > > > we have just been arguing from the qemu:allocation-depth perspective > > > > > (and also from bdrv_block_status) that the qcow2 image is NOT 100% > > > > > allocated (in the sense where allocation == data comes locally). 
> > > > > Perhaps it might be better if we tweaked the above qemu-img map to > > > > > produce: > > > > > > > > > > [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": > > > > > false}, > > > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": > > > > > true, "offset": 327680}, > > > > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": > > > > > false}, > > > > > { "start": 196608, "length": 65536, "depth": -1, "zero": true, > > > > > "data": false}] > > > > > > > > It will be more consistent with "offset" to drop "depth" from output > > > > if we don't have it: > > > > > > > > [{ "start": 0, "length": 65536, "zero": true, "data": false}, > > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, > > > > "data": true, "offset": 327680}, > > > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, > > > > "data": false}, > > > > { "start": 196608, "length": 65536, "zero": true, "data": false}] > > > > > > Yes, that might work as well. But we didn't previously document > > > depth to be optional. Removing something from output risks breaking > > > more downstream tools that expect it to be non-optional, compared to > > > providing a new value. > > > > A negative value isn't any less unexpected than a missing key. I don't > > think any existing tool would be able to handle it. Encoding different > > meanings in a single value isn't very QAPI-like either. Usually strings > > that are parsed are the problem, but negative integers really isn't that > > much different. I don't really like this solution. > > > > Leaving out the depth feels like a better suggestion to me. > > > > But anyway, this seems to only happen at the end of the backing chain. > > So if the backing chain consistents of n images, why not report 'depth': > > n + 1? So, in the above example, you would get 1. 
I think this has the > > best chances of tools actually working correctly with the new output, > > even though it's still not unlikely to break something. > > > > Did you consider just add a new field? > > So, "depth" keeps its meaning "which level provides data". > > And we add additional optional field like > > absolutely-completely-absent: bool > > Which is true if data is nowhere in the backing chain. Or how about exposing BDRV_BLOCK_ALLOCATED as 'allocated': 'bool'? Which I think is what the conclusion was already for NBD, so doing the same in 'qemu-img map' would be consistent. This is, of course, almost the same as 'absolutely-completely-absent', just without the negating the flag. Kevin
Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL
On 09/06/21 17:01, Vladimir Sementsov-Ogievskiy wrote: 09.06.2021 15:22, Paolo Bonzini wrote: If we want to wake up a coroutine from a worker thread, aio_co_wake() currently does not work. In that scenario, aio_co_wake() calls aio_co_enter(), but there is no current AioContext and therefore qemu_get_current_aio_context() returns the main thread. aio_co_wake() then attempts to call aio_context_acquire() instead of going through aio_co_schedule(). The default case of qemu_get_current_aio_context() was added to cover synchronous I/O started from the vCPU thread, but the main and vCPU threads are quite different. The main thread is an I/O thread itself, only running a more complicated event loop; the vCPU thread instead is essentially a worker thread that occasionally calls qemu_mutex_lock_iothread(). It is only in those critical sections that it acts as if it were the home thread of the main AioContext. Therefore, this patch detaches qemu_get_current_aio_context() from iothreads, which is a useless complication. The AioContext pointer is stored directly in the thread-local variable, including for the main loop. Worker threads (including vCPU threads) optionally behave as temporary home threads if they have taken the big QEMU lock, but if that is not the case they will always schedule coroutines on remote threads via aio_co_schedule(). With this change, qemu_mutex_iothread_locked() must be changed from Did you miss "qemu_mutex_iothread_locked() stub function"? Yes, that comment refers to the stub. Let me resubmit with a testcase and I'll fix that too. Paolo
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
11.06.2021 12:05, Nir Soffer wrote: ב-11 ביוני 2021, בשעה 11:14, Vladimir Sementsov-Ogievskiy כתב/ה: 11.06.2021 11:09, Kevin Wolf wrote: Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: But: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \ "file":{"driver":"file","filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] also reports the entire file at "depth":0, which is misleading, since we have just been arguing from the qemu:allocation-depth perspective (and also from bdrv_block_status) that the qcow2 image is NOT 100% allocated (in the sense where allocation == data comes locally). Perhaps it might be better if we tweaked the above qemu-img map to produce: [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] It will be more consistent with "offset" to drop "depth" from output if we don't have it: [{ "start": 0, "length": 65536, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "zero": true, "data": false}] Yes, that might work as well. But we didn't previously document depth to be optional. Removing something from output risks breaking more downstream tools that expect it to be non-optional, compared to providing a new value. A negative value isn't any less unexpected than a missing key. 
I don't think any existing tool would be able to handle it. Encoding different meanings in a single value isn't very QAPI-like either. Usually strings that are parsed are the problem, but negative integers really isn't that much different. I don't really like this solution. Leaving out the depth feels like a better suggestion to me. But anyway, this seems to only happen at the end of the backing chain. So if the backing chain consists of n images, why not report 'depth': n + 1? So, in the above example, you would get 1. I think this has the best chances of tools actually working correctly with the new output, even though it's still not unlikely to break something. Did you consider just adding a new field? So, "depth" keeps its meaning "which level provides data". And we add additional optional field like absolutely-completely-absent: bool hole: bool? That messes up with file-posix holes which are UNALLOCATED_ZERO.. I think, we should somehow start to honestly report backing chains and use "backing" concept in interfaces.. maybe nobacking: bool May be true only together with data=false and zero=true, and means that all layers refer to backing for this region, but last layer just doesn't have backing image currently and therefore returns zeroes. -- Best regards, Vladimir
Re: [PATCH v2] async: the main AioContext is only "current" if under the BQL
On 09/06/21 15:02, Kevin Wolf wrote: Am 09.06.2021 um 14:22 hat Paolo Bonzini geschrieben: If we want to wake up a coroutine from a worker thread, aio_co_wake() currently does not work. In that scenario, aio_co_wake() calls aio_co_enter(), but there is no current AioContext and therefore qemu_get_current_aio_context() returns the main thread. aio_co_wake() then attempts to call aio_context_acquire() instead of going through aio_co_schedule(). The default case of qemu_get_current_aio_context() was added to cover synchronous I/O started from the vCPU thread, but the main and vCPU threads are quite different. The main thread is an I/O thread itself, only running a more complicated event loop; the vCPU thread instead is essentially a worker thread that occasionally calls qemu_mutex_lock_iothread(). It is only in those critical sections that it acts as if it were the home thread of the main AioContext. Therefore, this patch detaches qemu_get_current_aio_context() from iothreads, which is a useless complication. The AioContext pointer is stored directly in the thread-local variable, including for the main loop. Worker threads (including vCPU threads) optionally behave as temporary home threads if they have taken the big QEMU lock, but if that is not the case they will always schedule coroutines on remote threads via aio_co_schedule(). With this change, qemu_mutex_iothread_locked() must be changed from true to false. The previous value of true was needed because the main thread did not have an AioContext in the thread-local variable, but now it does have one. Reported-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Paolo Bonzini The commit message doesn't specify, but in the buggy case, are we talking about calling aio_co_wake() for a coroutine in the main context specifically, right? Could we have a unit test for this scenario? Yes, that's the scenario. I will try to write a unit test indeed. Paolo
Re: iotest 233 failing
On Thu, Jun 10, 2021 at 10:33:40PM +0100, Daniel P. Berrangé wrote: > On Thu, Jun 10, 2021 at 10:31:14PM +0100, Daniel P. Berrangé wrote: > > On Thu, Jun 10, 2021 at 03:34:46PM -0500, Eric Blake wrote: > > > I'm now getting failures on iotest 233: > > > > > > 233 fail [15:26:01] [15:26:03] 2.1s (last: 1.3s) output > > > mismatch (see 233.out.bad) > > > --- /home/eblake/qemu/tests/qemu-iotests/233.out > > > +++ 233.out.bad > > > @@ -65,6 +65,6 @@ > > > == final server log == > > > qemu-nbd: option negotiation failed: Verify failed: No certificate was > > > found. > > > qemu-nbd: option negotiation failed: Verify failed: No certificate was > > > found. > > > -qemu-nbd: option negotiation failed: TLS x509 authz check for > > > CN=localhost,O=Cthulhu Dark Lord Enterprises client1,L=R'lyeh,C=South > > > Pacific is denied > > > -qemu-nbd: option negotiation failed: TLS x509 authz check for > > > CN=localhost,O=Cthulhu Dark Lord Enterprises client3,L=R'lyeh,C=South > > > Pacific is denied > > > +qemu-nbd: option negotiation failed: TLS x509 authz check for C=South > > > Pacific,L=R'lyeh,O=Cthulhu Dark Lord Enterprises client1,CN=localhost is > > > denied > > > +qemu-nbd: option negotiation failed: TLS x509 authz check for C=South > > > Pacific,L=R'lyeh,O=Cthulhu Dark Lord Enterprises client3,CN=localhost is > > > denied > > > *** done > > > Failures: 233 > > > Failed 1 of 1 iotests > > > > > > Looks like I recently updated to gnutls-3.7.2-1.fc34 on June 1, could > > > that be the culprit for the error message being reordered? > > > > It is possible I guess. They have indeed made such a change in the past > > and reverted it when I pointed out that this is effectively an ABI for > > apps, because access control lists are based on matching the distinguish > > name string, as an opaque string. The cause certainly needs investigating > > as a matter of urgency because this is ABI for QEMU's authz access control > > lists. 
> > There is an ominous sounding NEWS item in 3.7.2 > > ** certtool: When producing certificates and certificate requests, subject DN >components that are provided individually will now be ordered by >assumed scale (e.g. Country before State, Organization before >OrganizationalUnit). This change also affects the order in which >certtool prompts interactively. Please rely on the template >mechanism for automated use of certtool! (#1243) > > This ordering change in certtool seems to correspond with the new order > you see above in the distinguished name, so I wonder if the certtool > change had accidental side effects. Right so iotest 233 of course creates a new certificate, and so it picks up the new certtool behaviour, which means the certificate it generates has the distinguished name in the new order. This is good because it means the gnutls API for querying the distinguished name is still using the same ordering as before. This is bad because the iotest is obviously susceptible to changes in the way the certificate is created. I think we might just need to apply a filter to strip the distinguished name from the output. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
> ב-11 ביוני 2021, בשעה 11:14, Vladimir Sementsov-Ogievskiy > כתב/ה: > > 11.06.2021 11:09, Kevin Wolf wrote: >> Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: >> But: >> >> $ qemu-img map --output=json -f qcow2 >> json:'{"driver":"qcow2","backing":null, \ >> "file":{"driver":"file","filename":"top.qcow2"}}' >> [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, >> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": >> true, "offset": 327680}, >> { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": >> false}] >> >> also reports the entire file at "depth":0, which is misleading, since >> we have just been arguing from the qemu:allocation-depth perspective >> (and also from bdrv_block_status) that the qcow2 image is NOT 100% >> allocated (in the sense where allocation == data comes locally). >> Perhaps it might be better if we tweaked the above qemu-img map to >> produce: >> >> [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": >> false}, >> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": >> true, "offset": 327680}, >> { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": >> false}, >> { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": >> false}] > > It will be more consistent with "offset" to drop "depth" from output > if we don't have it: > > [{ "start": 0, "length": 65536, "zero": true, "data": false}, > { "start": 65536, "length": 65536, "depth": 0, "zero": false, > "data": true, "offset": 327680}, > { "start": 131072, "length": 65536, "depth": 0, "zero": true, > "data": false}, > { "start": 196608, "length": 65536, "zero": true, "data": false}] >>> >>> Yes, that might work as well. But we didn't previously document >>> depth to be optional. 
Removing something from output risks breaking >>> more downstream tools that expect it to be non-optional, compared to >>> providing a new value. >> A negative value isn't any less unexpected than a missing key. I don't >> think any existing tool would be able to handle it. Encoding different >> meanings in a single value isn't very QAPI-like either. Usually strings >> that are parsed are the problem, but negative integers really isn't that >> much different. I don't really like this solution. >> Leaving out the depth feels like a better suggestion to me. >> But anyway, this seems to only happen at the end of the backing chain. >> So if the backing chain consists of n images, why not report 'depth': >> n + 1? So, in the above example, you would get 1. I think this has the >> best chances of tools actually working correctly with the new output, >> even though it's still not unlikely to break something. > > Did you consider just adding a new field? > > So, "depth" keeps its meaning "which level provides data". > > And we add additional optional field like > > absolutely-completely-absent: bool hole: bool? > > Which is true if data is nowhere in the backing chain. > > > -- > Best regards, > Vladimir
Re: [PATCH] qemu-img: Use "depth":-1 to make backing probes obvious
11.06.2021 11:08, Vladimir Sementsov-Ogievskiy wrote: 11.06.2021 00:39, Eric Blake wrote: The recently-added NBD context qemu:allocation-depth makes an obvious case for why it is important to distinguish between locally-present data (even with that data is sparse) [shown as depth 1 over NBD], and data that could not be found anywhere in the backing chain [shown as depth 0]. But qemu-img map --output=json predates that addition, and has the unfortunate behavior that all portions of the backing chain that resolve without finding a hit in any backing layer report the same depth as the final backing layer. This makes it harder to reconstruct a qcow2 backing chain using just 'qemu-img map' output when using "backing":null to artificially limit a backing chain, because it is impossible to distinguish between a QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file) and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any backing file), since both types of clusters otherwise show as "data":false,"zero":true" (but note that we can distinguish a QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset": listing). The task of reconstructing a qcow2 chain was made harder in commit 0da9856851 (nbd: server: Report holes for raw images), because prior to that point, it was possible to abuse NBD's block status command to see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED (showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain (showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports more accurate sparseness information over NBD. An obvious solution is to make 'qemu-img map --output=json' visually distinguish between clusters that have a local allocation from those that are found nowhere in the chain, by adding "depth":-1 as the new witness of data that could not be tied to a specific backing image. 
Several iotests are impacted, but glancing through the changes shows that it is an improvement in that it shows more accurate details. Note that the documentation is specifically worded to allow qemu-img to report "depth":-1 both in the scenario where the last file in the backing chain still defers the cluster (corresponding to BDRV_BLOCK_ALLOCATED not being set anywhere in the chain), and in the scenario where qemu is unable to determine which backing chain element (if any) provides the data. The latter case does not exist now, but I'm considering an upcoming patch to add a BDRV_BLOCK_BACKING that would let a specific driver (such as NBD) inform the block layer that it is known that a cluster comes from a backing layer, but where there is insufficient data to determine which layer. As a quick demonstration: # Create a qcow2 image with a raw backing file: $ qemu-img create base.raw $((4*64*1024)) $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2 # Write to first 3 clusters of base: $ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \ -c "w -P 67 128k 64k" base.raw # Write to second and third clusters of top, hiding base: $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2 # Examine the full backing chain $ qemu-img map --output=json -f qcow2 top.qcow2 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, "offset": 0}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, "offset": 196608}] # Repeat, but with the backing chain clamped. 
Pre-patch: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \ "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] # Repeat, but post-patch: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \ "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] Note that pre-patch, it was impossible to determine which portions of the qcow2 file override the backing file because the "depth":0 regions were combined, so even though qemu internally can tell the difference between sclusters 2 and 3, the command line user could not. But post-patch, the "depth":-1 markings match the "depth":1 markings when the backing chain is in
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
11.06.2021 11:09, Kevin Wolf wrote: Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: But: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, \ "file":{"driver":"file","filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] also reports the entire file at "depth":0, which is misleading, since we have just been arguing from the qemu:allocation-depth perspective (and also from bdrv_block_status) that the qcow2 image is NOT 100% allocated (in the sense where allocation == data comes locally). Perhaps it might be better if we tweaked the above qemu-img map to produce: [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] It will be more consistent with "offset" to drop "depth" from output if we don't have it: [{ "start": 0, "length": 65536, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "zero": true, "data": false}] Yes, that might work as well. But we didn't previously document depth to be optional. Removing something from output risks breaking more downstream tools that expect it to be non-optional, compared to providing a new value. A negative value isn't any less unexpected than a missing key. I don't think any existing tool would be able to handle it. 
Encoding different meanings in a single value isn't very QAPI-like either. Usually strings that are parsed are the problem, but negative integers really isn't that much different. I don't really like this solution. Leaving out the depth feels like a better suggestion to me. But anyway, this seems to only happen at the end of the backing chain. So if the backing chain consists of n images, why not report 'depth': n + 1? So, in the above example, you would get 1. I think this has the best chances of tools actually working correctly with the new output, even though it's still not unlikely to break something. Did you consider just adding a new field? So, "depth" keeps its meaning "which level provides data". And we add additional optional field like absolutely-completely-absent: bool Which is true if data is nowhere in the backing chain. -- Best regards, Vladimir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: > On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: > > > But: > > > > > > $ qemu-img map --output=json -f qcow2 > > > json:'{"driver":"qcow2","backing":null, \ > > > "file":{"driver":"file","filename":"top.qcow2"}}' > > > [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": > > > true, "offset": 327680}, > > > { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": > > > false}] > > > > > > also reports the entire file at "depth":0, which is misleading, since > > > we have just been arguing from the qemu:allocation-depth perspective > > > (and also from bdrv_block_status) that the qcow2 image is NOT 100% > > > allocated (in the sense where allocation == data comes locally). > > > Perhaps it might be better if we tweaked the above qemu-img map to > > > produce: > > > > > > [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, > > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": > > > true, "offset": 327680}, > > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": > > > false}, > > > { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": > > > false}] > > > > It will be more consistent with "offset" to drop "depth" from output > > if we don't have it: > > > > [{ "start": 0, "length": 65536, "zero": true, "data": false}, > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, > > "data": true, "offset": 327680}, > > { "start": 131072, "length": 65536, "depth": 0, "zero": true, > > "data": false}, > > { "start": 196608, "length": 65536, "zero": true, "data": false}] > > Yes, that might work as well. But we didn't previously document > depth to be optional. Removing something from output risks breaking > more downstream tools that expect it to be non-optional, compared to > providing a new value. 
A negative value isn't any less unexpected than a missing key. I don't think any existing tool would be able to handle it. Encoding different meanings in a single value isn't very QAPI-like either. Usually strings that are parsed are the problem, but negative integers really aren't that much different. I don't really like this solution. Leaving out the depth feels like a better suggestion to me. But anyway, this seems to only happen at the end of the backing chain. So if the backing chain consists of n images, why not report 'depth': n + 1? So, in the above example, you would get 1. I think this has the best chances of tools actually working correctly with the new output, even though it's still not unlikely to break something. Kevin
Re: [PATCH] qemu-img: Use "depth":-1 to make backing probes obvious
11.06.2021 00:39, Eric Blake wrote: The recently-added NBD context qemu:allocation-depth makes an obvious case for why it is important to distinguish between locally-present data (even with that data is sparse) [shown as depth 1 over NBD], and data that could not be found anywhere in the backing chain [shown as depth 0]. But qemu-img map --output=json predates that addition, and has the unfortunate behavior that all portions of the backing chain that resolve without finding a hit in any backing layer report the same depth as the final backing layer. This makes it harder to reconstruct a qcow2 backing chain using just 'qemu-img map' output when using "backing":null to artificially limit a backing chain, because it is impossible to distinguish between a QCOW2_CLUSTER_UNALLOCATED (which defers to a [missing] backing file) and a QCOW2_CLUSTER_ZERO_PLAIN cluster (which would override any backing file), since both types of clusters otherwise show as "data":false,"zero":true" (but note that we can distinguish a QCOW2_CLUSTER_ZERO_ALLOCATED, which would also have an "offset": listing). The task of reconstructing a qcow2 chain was made harder in commit 0da9856851 (nbd: server: Report holes for raw images), because prior to that point, it was possible to abuse NBD's block status command to see which portions of a qcow2 file resulted in BDRV_BLOCK_ALLOCATED (showing up as NBD_STATE_ZERO in isolation) vs. missing from the chain (showing up as NBD_STATE_ZERO|NBD_STATE_HOLE); but now qemu reports more accurate sparseness information over NBD. An obvious solution is to make 'qemu-img map --output=json' visually distinguish between clusters that have a local allocation from those that are found nowhere in the chain, by adding "depth":-1 as the new witness of data that could not be tied to a specific backing image. Several iotests are impacted, but glancing through the changes shows that it is an improvement in that it shows more accurate details. 
Note that the documentation is specifically worded to allow qemu-img to report "depth":-1 both in the scenario where the last file in the backing chain still defers the cluster (corresponding to BDRV_BLOCK_ALLOCATED not being set anywhere in the chain), and in the scenario where qemu is unable to determine which backing chain element (if any) provides the data. The latter case does not exist now, but I'm considering an upcoming patch to add a BDRV_BLOCK_BACKING that would let a specific driver (such as NBD) inform the block layer that it is known that a cluster comes from a backing layer, but where there is insufficient data to determine which layer. As a quick demonstration: # Create a qcow2 image with a raw backing file: $ qemu-img create base.raw $((4*64*1024)) $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2 # Write to first 3 clusters of base: $ qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \ -c "w -P 67 128k 64k" base.raw # Write to second and third clusters of top, hiding base: $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2 # Examine the full backing chain $ qemu-img map --output=json -f qcow2 top.qcow2 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, "offset": 0}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, "offset": 196608}] # Repeat, but with the backing chain clamped. 
Pre-patch: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \ "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] # Repeat, but post-patch: $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2", \ "backing":null, "file":{"driver":"file", "filename":"top.qcow2"}}' [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] Note that pre-patch, it was impossible to determine which portions of the qcow2 file override the backing file because the "depth":0 regions were combined, so even though qemu internally can tell the difference between sclusters 2 and 3, the command line user could not. But post-patch, the "depth":-1 markings match the "depth":1 markings when the backing chain is intact, and it becomes obvious which clusters are importa