Re: [PATCH 2/3] util: support detailed error reporting for qemu_open

2020-07-01 Thread Markus Armbruster
Daniel P. Berrangé  writes:

> Create a "qemu_open_err" method which does the same as "qemu_open",
> but with a "Error **errp" for error reporting. There should be no
> behavioural difference for existing callers at this stage.
>
> Signed-off-by: Daniel P. Berrangé 
> ---
>  include/qemu/osdep.h |  1 +
>  util/osdep.c | 71 +++-
>  2 files changed, 58 insertions(+), 14 deletions(-)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index 0d26a1b9bd..e41701a308 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -483,6 +483,7 @@ int qemu_madvise(void *addr, size_t len, int advice);
>  int qemu_mprotect_rwx(void *addr, size_t size);
>  int qemu_mprotect_none(void *addr, size_t size);
>  
> +int qemu_open_err(const char *name, int flags, Error **errp, ...);
>  int qemu_open(const char *name, int flags, ...);
>  int qemu_close(int fd);
>  int qemu_unlink(const char *name);
> diff --git a/util/osdep.c b/util/osdep.c
> index 4bdbe81cec..450b3a5da3 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -22,6 +22,7 @@
>   * THE SOFTWARE.
>   */
>  #include "qemu/osdep.h"
> +#include "qapi/error.h"
>  
>  /* Needed early for CONFIG_BSD etc. */
>  
> @@ -282,7 +283,7 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, 
> bool exclusive)
>  /*
>   * Opens a file with FD_CLOEXEC set
>   */
> -int qemu_open(const char *name, int flags, ...)
> +static int qemu_openv(const char *name, int flags, Error **errp, va_list ap)
>  {
>  int ret;
>  int mode = 0;
> @@ -297,24 +298,31 @@ int qemu_open(const char *name, int flags, ...)
>  
>  fdset_id = qemu_parse_fdset(fdset_id_str);
>  if (fdset_id == -1) {
> +error_setg(errp, "Unable to parse fdset %s", name);
>  errno = EINVAL;
>  return -1;
>  }
>  
>  fd = monitor_fdset_get_fd(fdset_id, flags);
>  if (fd < 0) {
> +error_setg_errno(errp, -fd, "Unable acquire FD for %s flags %x",
> + name, flags);
>  errno = -fd;
>  return -1;
>  }
>  
>  dupfd = qemu_dup_flags(fd, flags);
>  if (dupfd == -1) {
> +error_setg_errno(errp, errno, "Unable dup FD for %s flags %x",
> + name, flags);
>  return -1;
>  }
>  
>  ret = monitor_fdset_dup_fd_add(fdset_id, dupfd);
>  if (ret == -1) {
>  close(dupfd);
> +error_setg(errp, "Unable save FD for %s flags %x",
> +   name, flags);
>  errno = EINVAL;
>  return -1;
>  }
> @@ -324,11 +332,7 @@ int qemu_open(const char *name, int flags, ...)
>  #endif
>  
>  if (flags & O_CREAT) {
> -va_list ap;
> -
> -va_start(ap, flags);
>  mode = va_arg(ap, int);
> -va_end(ap);
>  }
>  
>  #ifdef O_CLOEXEC
> @@ -340,25 +344,64 @@ int qemu_open(const char *name, int flags, ...)
>  }
>  #endif
>  
> +if (ret == -1) {
> +const char *action = "open";
> +if (flags & O_CREAT) {
> +action = "create";
> +}
>  #ifdef O_DIRECT
> -if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
> -int newflags = flags & ~O_DIRECT;
> +if (errno == EINVAL && (flags & O_DIRECT)) {
> +int newflags = flags & ~O_DIRECT;
>  # ifdef O_CLOEXEC
> -ret = open(name, newflags | O_CLOEXEC, mode);
> +ret = open(name, newflags | O_CLOEXEC, mode);
>  # else
> -ret = open(name, newflags, mode);
> +ret = open(name, newflags, mode);
>  # endif
> -if (ret != -1) {
> -close(ret);
> -error_report("file system does not support O_DIRECT");
> -errno = EINVAL;
> +if (ret != -1) {
> +close(ret);
> +error_setg(errp, "Unable to %s '%s' flags 0x%x: "
> +   "filesystem does not support O_DIRECT",
> +   action, name, flags);
> +if (!errp) {

If the caller ignores errors, ...

> +error_report("file system does not support O_DIRECT");

... we report this error to stderr (but not any of the other ones).
This is weird.  I figure you do it here to reproduce the weirdness of
qemu_open() before the patch.  Goes back to

commit a5813077aac7865f69b7ee46a594f3705429f7cd
Author: Stefan Hajnoczi 
Date:   Thu Aug 22 11:29:03 2013 +0200

osdep: warn if open(O_DIRECT) on fails with EINVAL

Print a warning when opening a file O_DIRECT fails with EINVAL.  This
saves users a lot of time trying to figure out the EINVAL error, which
is typical when attempting to open a file O_DIRECT on Linux tmpfs.

Reported-by: Deepak C Shetty 
Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 

The message isn't phrased as a warning, though.  Should it use

Re: [PATCH v3 3/4] hw/block/nvme: Fix pmrmsc register size

2020-07-01 Thread Andrzej Jakowski
On 6/30/20 9:45 AM, Klaus Jensen wrote:
> On Jun 30 17:16, Philippe Mathieu-Daudé wrote:
>> On 6/30/20 5:10 PM, Andrzej Jakowski wrote:
>>> On 6/30/20 4:04 AM, Philippe Mathieu-Daudé wrote:
 The Persistent Memory Region Controller Memory Space Control
 register is 64-bit wide. See 'Figure 68: Register Definition'
 of the 'NVM Express Base Specification Revision 1.4'.

 Fixes: 6cf9413229 ("introduce PMR support from NVMe 1.4 spec")
 Reported-by: Klaus Jensen 
 Reviewed-by: Klaus Jensen 
 Signed-off-by: Philippe Mathieu-Daudé 
 ---
 Cc: Andrzej Jakowski 
 ---
  include/block/nvme.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/include/block/nvme.h b/include/block/nvme.h
 index 71c5681912..82c384614a 100644
 --- a/include/block/nvme.h
 +++ b/include/block/nvme.h
 @@ -21,7 +21,7 @@ typedef struct QEMU_PACKED NvmeBar {
  uint32_tpmrsts;
  uint32_tpmrebs;
  uint32_tpmrswtp;
 -uint32_tpmrmsc;
 +uint64_tpmrmsc;
  } NvmeBar;
  
  enum NvmeCapShift {
 -- 2.21.3
>>>
>>> This is good catch, though I wanted to highlight that this will still 
>>> need to change as this register is not aligned properly and thus not in
>>> compliance with spec.
>>
>> I was wondering the odd alignment too. So you are saying at some time
>> in the future the spec will be updated to correct the alignment?
Yep I think so.
So PMRMSC currently is 64-bit register that is defined at E14h offset.
It is in conflict with spec because spec requires 64-bit registers to 
be 64-bit aligned.
I anticipate that changes will be needed.

>>
>> Should we use this instead?
>>
>>   uint32_tpmrmsc;
>>  +uint32_tpmrmsc_upper32; /* the spec define this, but *
>>  + * only low 32-bit are used  */
>>
>> Or eventually an unnamed struct:
>>
>>  -uint32_tpmrmsc;
>>  +struct {
>>  +uint32_t pmrmsc;
>>  +uint32_t pmrmsc_upper32; /* the spec define this, but *
>>  +  * only low 32-bit are used  */
>>  +};
>>
>>>
>>> Reviewed-by Andrzej Jakowski 
>>>
>>
> 
> I'm also not sure what you mean Andrzej. The odd alignment is exactly
> what the spec (v1.4) says?
> 




Re: [PATCH] block: Raise an error when backing file parameter is an empty string

2020-07-01 Thread Connor Kuehl

Hi Kevin & Max,

Just pinging this patch for your consideration.

Thank you,

Connor

On 6/17/20 11:27 AM, Connor Kuehl wrote:

Providing an empty string for the backing file parameter like so:

qemu-img create -f qcow2 -b '' /tmp/foo

allows the flow of control to reach and subsequently fail an assert
statement because passing an empty string to

bdrv_get_full_backing_filename_from_filename()

simply results in NULL being returned without an error being raised.

To fix this, let's check for an empty string when getting the value from
the opts list.

Reported-by: Attila Fazekas 
Fixes: https://bugzilla.redhat.com/1809553
Signed-off-by: Connor Kuehl 
---
  block.c|  4 
  tests/qemu-iotests/298 | 47 ++
  tests/qemu-iotests/298.out |  5 
  tests/qemu-iotests/group   |  1 +
  4 files changed, 57 insertions(+)
  create mode 100755 tests/qemu-iotests/298
  create mode 100644 tests/qemu-iotests/298.out

diff --git a/block.c b/block.c
index 6dbcb7e083..b335d6bcb2 100644
--- a/block.c
+++ b/block.c
@@ -6116,6 +6116,10 @@ void bdrv_img_create(const char *filename, const char 
*fmt,
   "same filename as the backing file");
  goto out;
  }
+if (backing_file[0] == '\0') {
+error_setg(errp, "Expected backing file name, got empty string");
+goto out;
+}
  }
  
  backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);

diff --git a/tests/qemu-iotests/298 b/tests/qemu-iotests/298
new file mode 100755
index 00..1e30caebec
--- /dev/null
+++ b/tests/qemu-iotests/298
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+#
+# Copyright (C) 2020 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+
+
+
+# Regression test for avoiding an assertion when the 'backing file'
+# parameter (-b) is set to an empty string for qemu-img create
+#
+#   qemu-img create -f qcow2 -b '' /tmp/foo
+#
+# This ensures the invalid parameter is handled with a user-
+# friendly message instead of a failed assertion.
+
+import iotests
+
+class TestEmptyBackingFilename(iotests.QMPTestCase):
+
+
+def test_empty_backing_file_name(self):
+actual = iotests.qemu_img_pipe(
+'create',
+'-f', 'qcow2',
+'-b', '',
+'/tmp/foo'
+)
+expected = 'qemu-img: /tmp/foo: Expected backing file name,' \
+   ' got empty string'
+
+self.assertEqual(actual.strip(), expected.strip())
+
+
+if __name__ == '__main__':
+iotests.main(supported_fmts=['raw', 'qcow2'])
diff --git a/tests/qemu-iotests/298.out b/tests/qemu-iotests/298.out
new file mode 100644
index 00..ae1213e6f8
--- /dev/null
+++ b/tests/qemu-iotests/298.out
@@ -0,0 +1,5 @@
+.
+--
+Ran 1 tests
+
+OK
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index d886fa0cb3..4bca2d9e05 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -302,3 +302,4 @@
  291 rw quick
  292 rw auto quick
  297 meta
+298 img auto quick






[PATCH v4 1/2] nvme: indicate CMB support through controller capabilities register

2020-07-01 Thread Andrzej Jakowski
This patch sets CMBS bit in controller capabilities register when user
configures NVMe driver with CMB support, so capabilites are correctly
reported to guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 +-
 include/block/nvme.h | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1aee042d4c..9f11f3e9da 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1582,6 +1582,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = 0x00010200;
 n->bar.intmc = n->bar.intms = 0;
@@ -1591,7 +1592,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
 Error *local_err = NULL;
-
 int i;
 
 nvme_check_constraints(n, &local_err);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1720ee1d51..14cf398dfa 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & 
CAP_MPSMAX_MASK)\
 << 
CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   
\
 << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   
\
+   << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-01 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively either of PMR or CMB. This
patch places CMB at BAR4 offset so it not conflicts with MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 101 +--
 hw/block/nvme.h  |   1 +
 include/block/nvme.h |   4 +-
 3 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9f11f3e9da..f5156bcfe5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
+ * to be resident in BAR4 at certain offset - this is because BAR4 is also
+ * used for storing MSI-X table that is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
+ * whole BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,8 +57,8 @@
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_REG_SIZE 0x1000
 #define NVME_DB_SIZE  4
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
+#define NVME_MSIX_BIR 4
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -1395,7 +1395,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 if (host_memory_backend_is_mapped(n->pmrdev)) {
 char *path = 
object_get_canonical_path_component(OBJECT(n->pmrdev));
 error_setg(errp, "can't use already busy memdev: %s", path);
@@ -1453,33 +1453,70 @@ static void nvme_init_namespace(NvmeCtrl *n, 
NvmeNamespace *ns, Error **errp)
 id_ns->nuse = id_ns->ncap;
 }
 
-static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
+static void nvme_bar4_init(PCIDevice *pci_dev, Error **errp)
 {
-NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
-NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
-
-NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
-NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
-NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
-NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
-
-n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
-  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+NvmeCtrl *n = NVME(pci_dev);
+int status;
+uint64_t bar_size, cmb_offset = 0;
+uint32_t msix_vectors;
+uint32_t nvme_pba_offset;
+uint32_t cmb_size_units;
+
+msix_vectors = n->params.max_ioqpairs + 1;
+nvme_pba_offset = PCI_MSIX_ENTRY_SIZE * msix_vectors;
+bar_size = nvme_pba_offset + QEMU_ALIGN_UP(msix_vectors, 64) / 8;
+
+if (n->params.cmb_size_mb) {
+NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
+NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+
+cmb_size_units = NVME_CMBSZ_GETSIZEUNITS(n->bar.cmbsz);
+cmb_offset = QEMU_ALIGN_UP(bar_size, cmb_size_units);
+
+NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_MSIX_BIR);
+NVME_CMBLOC_SET_OFST(n->bar.cmbloc, cmb_offset / cmb_size_units);
+
+n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+
+bar_size += cmb_offset;
+bar_size += NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
+}
+
+bar_size = pow2ceil(bar_size);
+
+/* Create memory region for BAR4, then overlap cmb, msix and pba
+ * tables on top of it.*/
+memory_region_init(&n->bar4, OBJECT(n), "nvme-bar4", bar_size);
+
+if (n->params.cmb_size_mb) {
+memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
+  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+
+memory_region_add_subregion(&n->bar4, cmb_offset, &n->ctrl_mem);
+}
+
+status = msix_init(pci_dev, n->params.msix_qsize,
+   &n->bar4, NVME_MSIX_BIR, 0,
+   &n->bar4, NVME_MSIX_BIR, nvme_pba_offset,
+   0, errp);
+
+if (status) {
+return;
+}
+
+pci_register_bar(pci_dev, NVME_

[PATCH v4] nvme: allow cmb and pmr emulation on same device

2020-07-01 Thread Andrzej Jakowski
Hi All, 

Resending series recently posted on mailing list related to nvme device
extension with couple of fixes after review.

This patch series does following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

v4:
 - modified BAR4 initialization, so now it consists of CMB, MSIX and
   PBA memory regions overlapping on top of it. This reduces patch
   complexity significantly (Klaus [5])

v3:
 - code style fixes including: removal of spurious line break, moving
   define into define section and code alignment (Klaus [4])
 - removed unintended code reintroduction (Klaus [4])

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/
[5]: 
https://lore.kernel.org/qemu-devel/20200626055033.6vxqvi4s5pll7som@apples.localdomain/





Re: [PATCH v2 17/18] hw/block/nvme: Use zone metadata file for persistence

2020-07-01 Thread Klaus Jensen
On Jun 18 06:34, Dmitry Fomichev wrote:
> A ZNS drive that is emulated by this driver is currently initialized
> with all zones Empty upon startup. However, actual ZNS SSDs save the
> state and condition of all zones in their internal NVRAM in the event
> of power loss. When such a drive is powered up again, it closes or
> finishes all zones that were open at the moment of shutdown. Besides
> that, the write pointer position as well as the state and condition
> of all zones is preserved across power-downs.
> 
> This commit adds the capability to have a persistent zone metadata
> to the driver. The new optional driver property, "zone_file",
> is introduced. If added to the command line, this property specifies
> the name of the file that stores the zone metadata. If "zone_file" is
> omitted, the driver will initialize with all zones empty, the same as
> before.
> 
> If zone metadata is configured to be persistent, then zone descriptor
> extensions also persist across controller shutdowns.
> 
> Signed-off-by: Dmitry Fomichev 

Stefan, before I review this in depth, can you comment on if mmap'ing a
file from a device model and issuing regular msync's is an acceptable
approach to storing state persistently across QEMU invocations?

I could not find any examples of this in hw/, so I am unsure. I
implemented something like this using an additional blockdev on the
device and doing blk_aio's, but just mmaping a file seems much simpler,
but at the cost of portability? On the other hand, I can't find any
examples of using an additional blockdev either.

Can you shed any light on the preferred approach?

> ---
>  hw/block/nvme.c | 371 +---
>  hw/block/nvme.h |  38 +
>  2 files changed, 388 insertions(+), 21 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 14d5f1d155..63e7a6352e 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -69,6 +69,8 @@
>  } while (0)
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_sync_zone_file(NvmeCtrl *n, NvmeNamespace *ns,
> +NvmeZone *zone, int len);
>  
>  /*
>   * Add a zone to the tail of a zone list.
> @@ -90,6 +92,7 @@ static void nvme_add_zone_tail(NvmeCtrl *n, NvmeNamespace 
> *ns, NvmeZoneList *zl,
>  zl->tail = idx;
>  }
>  zl->size++;
> +nvme_set_zone_meta_dirty(n, ns, true);
>  }
>  
>  /*
> @@ -106,12 +109,15 @@ static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace 
> *ns, NvmeZoneList *zl,
>  if (zl->size == 0) {
>  zl->head = NVME_ZONE_LIST_NIL;
>  zl->tail = NVME_ZONE_LIST_NIL;
> +nvme_set_zone_meta_dirty(n, ns, true);
>  } else if (idx == zl->head) {
>  zl->head = zone->next;
>  ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
> +nvme_set_zone_meta_dirty(n, ns, true);
>  } else if (idx == zl->tail) {
>  zl->tail = zone->prev;
>  ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL;
> +nvme_set_zone_meta_dirty(n, ns, true);
>  } else {
>  ns->zone_array[zone->next].prev = zone->prev;
>  ns->zone_array[zone->prev].next = zone->next;
> @@ -138,6 +144,7 @@ static NvmeZone *nvme_remove_zone_head(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
>  }
>  zone->prev = zone->next = 0;
> +nvme_set_zone_meta_dirty(n, ns, true);
>  }
>  
>  return zone;
> @@ -476,6 +483,7 @@ static void nvme_assign_zone_state(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  case NVME_ZONE_STATE_READ_ONLY:
>  zone->tstamp = 0;
>  }
> +nvme_sync_zone_file(n, ns, zone, sizeof(NvmeZone));
>  }
>  
>  static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> @@ -2976,9 +2984,114 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> -static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns,
> +static int nvme_validate_zone_file(NvmeCtrl *n, NvmeNamespace *ns,
>  uint64_t capacity)
>  {
> +NvmeZoneMeta *meta = ns->zone_meta;
> +NvmeZone *zone = ns->zone_array;
> +uint64_t start = 0, zone_size = n->params.zone_size;
> +int i, n_imp_open = 0, n_exp_open = 0, n_closed = 0, n_full = 0;
> +
> +if (meta->magic != NVME_ZONE_META_MAGIC) {
> +return 1;
> +}
> +if (meta->version != NVME_ZONE_META_VER) {
> +return 2;
> +}
> +if (meta->zone_size != zone_size) {
> +return 3;
> +}
> +if (meta->zone_capacity != n->params.zone_capacity) {
> +return 4;
> +}
> +if (meta->nr_offline_zones != n->params.nr_offline_zones) {
> +return 5;
> +}
> +if (meta->nr_rdonly_zones != n->params.nr_rdonly_zones) {
> +return 6;
> +}
> +if (meta->lba_size != n->conf.logical_block_size) {
> +return 7;
> +}
> +if (meta->zd_extension_size != n->params.zd_extension_size) {
> +return 8;
> +}
> +
> +for (i = 0; i < n->num_zones; i++, zone++) {
> +i

Re: [PATCH v4 00/16] python: add mypy support to python/qemu

2020-07-01 Thread John Snow



On 6/29/20 10:30 AM, John Snow wrote:
> 
> 
> On 6/29/20 4:26 AM, Philippe Mathieu-Daudé wrote:
>> +Cleber/Wainer.
>>
>> On 6/26/20 10:41 PM, John Snow wrote:
>>> Based-on: 20200626202350.11060-1-js...@redhat.com
>>>
>>> This series modifies the python/qemu library to comply with mypy --strict.
>>> This requires my "refactor shutdown" patch as a pre-requisite.
>>>
>>> v4:
>>> 001/16:[] [--] 'python/qmp.py: Define common types'
>>> 002/16:[] [--] 'iotests.py: use qemu.qmp type aliases'
>>> 003/16:[] [--] 'python/qmp.py: re-absorb MonitorResponseError'
>>> 004/16:[] [--] 'python/qmp.py: Do not return None from cmd_obj'
>>> 005/16:[] [--] 'python/qmp.py: add casts to JSON deserialization'
>>> 006/16:[] [--] 'python/qmp.py: add QMPProtocolError'
>>> 007/16:[] [--] 'python/machine.py: Fix monitor address typing'
>>> 008/16:[] [--] 'python/machine.py: reorder __init__'
>>> 009/16:[] [--] 'python/machine.py: Don't modify state in _base_args()'
>>> 010/16:[] [--] 'python/machine.py: Handle None events in events_wait'
>>> 011/16:[] [--] 'python/machine.py: use qmp.command'
>>> 012/16:[0010] [FC] 'python/machine.py: Add _qmp access shim'
>>> 013/16:[] [-C] 'python/machine.py: fix _popen access'
>>> 014/16:[] [--] 'python/qemu: make 'args' style arguments immutable'
>>> 015/16:[] [--] 'iotests.py: Adjust HMP kwargs typing'
>>> 016/16:[] [-C] 'python/qemu: Add mypy type annotations'
>>>
>>>  - Rebased on "refactor shutdown" v4
>>>  - Fixed _qmp access for scripts that disable QMP
>>
>> See:
>>
>> https://travis-ci.org/github/philmd/qemu/jobs/702507625#L5439
>>
>> / # uname -a
>> Linux buildroot 4.5.0-2-4kc-malta #1 Debian 4.5.5-1 (2016-05-29) mips
>> GNU/Linux
>> / # reboot
>> / # reboot: Restarting system
> {'execute': 'quit'}
>> qemu received signal 9; command: "mips-softmmu/qemu-system-mips -display
>> none -vga none -chardev
>> socket,id=mon,path=/tmp/tmpcojsc5u3/qemu-14540-monitor.sock -mon
>> chardev=mon,mode=control -machine malta -chardev
>> socket,id=console,path=/tmp/tmpcojsc5u3/qemu-14540-console.sock,server,nowait
>> -serial chardev:console -kernel
>> /tmp/avocado_xgmck2k5/avocado_job_83wkzs2f/12-tests_acceptance_boot_linux_console.py_BootLinuxConsole.test_mips_malta_cpio/boot/vmlinux-4.5.0-2-4kc-malta
>> -initrd
>> /tmp/avocado_xgmck2k5/avocado_job_83wkzs2f/12-tests_acceptance_boot_linux_console.py_BootLinuxConsole.test_mips_malta_cpiorootfs.cpio
>> -append printk.time=0 console=ttyS0 console=tty rdinit=/sbin/init
>> noreboot -no-reboot"
>>
>> Reproduced traceback from:
>> /home/travis/build/philmd/qemu/build/tests/venv/lib/python3.6/site-packages/avocado/core/test.py:886
>> Traceback (most recent call last):
>>   File
>> "/home/travis/build/philmd/qemu/build/tests/acceptance/avocado_qemu/__init__.py",
>> line 195, in tearDown
>> vm.shutdown()
>>   File "/home/travis/build/philmd/qemu/python/qemu/machine.py", line
>> 457, in shutdown
>> self._do_shutdown(has_quit)
>>   File "/home/travis/build/philmd/qemu/python/qemu/machine.py", line
>> 434, in _do_shutdown
>> self._soft_shutdown(has_quit, timeout)
>>   File "/home/travis/build/philmd/qemu/python/qemu/machine.py", line
>> 414, in _soft_shutdown
>> self._qmp.cmd('quit')
>>   File "/home/travis/build/philmd/qemu/python/qemu/qmp.py", line 271, in cmd
>> return self.cmd_obj(qmp_cmd)
>>   File "/home/travis/build/philmd/qemu/python/qemu/qmp.py", line 249, in
>> cmd_obj
>> self.__sock.sendall(json.dumps(qmp_cmd).encode('utf-8'))
>> BrokenPipeError: [Errno 32] Broken pipe
>>
> 
> Might be racing between QMP going away and Python not being aware that
> the process has closed yet. It's the only explanation I have left.
> 
> I wish I could reproduce this, though. When I submit jobs to Travis I am
> not seeing these failures.
> 
> I'll see what I can do, but at this point I'll not re-send the patch
> series until I can conclusively fix your build testing.
> 
> --js
> 

OK, this is a very big mea culpa. There are two problems.

1. I should catch ConnectionReset and not ConnectionResetError; one is a
catch-all for socket problems and the other is a very specific errno
that does not include BrokenPipeError.

2. Even if I do, it can still race, actually. QEMU might be in the
middle of shutting down, but has already lost the control channel.

Solving the second problem as the interface is currently designed is
hard. You can wait, but then if the wait failed, you need to re-raise
the control channel error instead of the wait failure. IOW, you need to
wait in order to learn if the control channel error is "important" or not.

So, the architecture of this is starting to look wrong; _soft_shutdown
should keep clarity of purpose. Either it was able to to a nice, soft
shutdown -- or it wasn't.

I'm starting to think that the real problem is that machine.py shouldn't
try to hide connection errors on shutdown at all if we expected to be
able to issue a quit command.

Changing my mind rapi

Re: [PATCH 0/3] block: improve error reporting for unsupported O_DIRECT

2020-07-01 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200701160509.1523847-1-berra...@redhat.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

+++ /tmp/qemu-test/build/tests/qemu-iotests/226.out.bad 2020-07-01 
16:58:11.321029717 +
@@ -6,7 +6,7 @@
 qemu-io: can't open: A regular file was expected by the 'file' driver, but 
something else was given
 qemu-io: warning: Opening a character device as a file using the 'file' driver 
is deprecated
 == Testing RW ==
-qemu-io: can't open: Could not open 'TEST_DIR/t.IMGFMT': Is a directory
+qemu-io: can't open: Unable to open 'TEST_DIR/t.IMGFMT' flags 0x2: Is a 
directory
 qemu-io: warning: Opening a character device as a file using the 'file' driver 
is deprecated
 
 === Testing with driver:host_device ===
@@ -14,13 +14,13 @@
---
Not run: 259
Failures: 061 069 111 226 244
Failed 5 of 119 iotests
make: *** [check-tests/check-block.sh] Error 1
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 669, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=67228a70e7844d70b07a864fef5c4569', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-y2quzqfb/src/docker-src.2020-07-01-12.41.12.23538:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=67228a70e7844d70b07a864fef5c4569
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-y2quzqfb/src'
make: *** [docker-run-test-quick@centos7] Error 2

real17m41.892s
user0m5.790s


The full log is available at
http://patchew.org/logs/20200701160509.1523847-1-berra...@redhat.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 15/18] hw/block/nvme: Support Zone Descriptor Extensions

2020-07-01 Thread Klaus Jensen
On Jun 18 06:34, Dmitry Fomichev wrote:
> Zone Descriptor Extension is a label that can be assigned to a zone.
> It can be set to an Empty zone and it stays assigned until the zone
> is reset.
> 
> This commit adds a new optional property, "zone_descr_ext_size", to
> the driver. Its value must be a multiple of 64 bytes. If this value
> is non-zero, it becomes possible to assign extensions of that size
> to any Empty zones. The default value for this property is 0,
> therefore setting extensions is disabled by default.
> 
> Signed-off-by: Hans Holmberg 
> Signed-off-by: Dmitry Fomichev 

Reviewed-by: Klaus Jensen 

> ---
>  hw/block/nvme.c | 76 ++---
>  hw/block/nvme.h |  8 ++
>  2 files changed, 80 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index b9135a6b1f..eb41081627 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1360,6 +1360,26 @@ static bool nvme_cond_offline_all(uint8_t state)
>  return state == NVME_ZONE_STATE_READ_ONLY;
>  }
>  
> +static uint16_t nvme_set_zd_ext(NvmeCtrl *n, NvmeNamespace *ns,
> +NvmeZone *zone, uint8_t state)
> +{
> +uint16_t status;
> +
> +if (state == NVME_ZONE_STATE_EMPTY) {
> +nvme_auto_transition_zone(n, ns, false, true);
> +status = nvme_aor_check(n, ns, 1, 0);
> +if (status != NVME_SUCCESS) {
> +return status;
> +}
> +nvme_aor_inc_active(n, ns);
> +zone->d.za |= NVME_ZA_ZD_EXT_VALID;
> +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_CLOSED);
> +return NVME_SUCCESS;
> +}
> +
> +return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
>  static uint16_t name_do_zone_op(NvmeCtrl *n, NvmeNamespace *ns,
>  NvmeZone *zone, uint8_t state, bool all,
>  uint16_t (*op_hndlr)(NvmeCtrl *, NvmeNamespace *, NvmeZone *,
> @@ -1388,13 +1408,16 @@ static uint16_t name_do_zone_op(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeNamespace *ns,
>  NvmeCmd *cmd, NvmeRequest *req)
>  {
> +NvmeRwCmd *rw;
>  uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> +uint64_t prp1, prp2;
>  uint64_t slba = 0;
>  uint64_t zone_idx = 0;
>  uint16_t status;
>  uint8_t action, state;
>  bool all;
>  NvmeZone *zone;
> +uint8_t *zd_ext;
>  
>  action = dw13 & 0xff;
>  all = dw13 & 0x100;
> @@ -1449,7 +1472,25 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  
>  case NVME_ZONE_ACTION_SET_ZD_EXT:
>  trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
> -return NVME_INVALID_FIELD | NVME_DNR;
> +if (all || !n->params.zd_extension_size) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +zd_ext = nvme_get_zd_extension(n, ns, zone_idx);
> +rw = (NvmeRwCmd *)cmd;
> +prp1 = le64_to_cpu(rw->prp1);
> +prp2 = le64_to_cpu(rw->prp2);
> +status = nvme_dma_write_prp(n, zd_ext, n->params.zd_extension_size,
> +prp1, prp2);
> +if (status) {
> +trace_pci_nvme_err_zd_extension_map_error(zone_idx);
> +return status;
> +}
> +
> +status = nvme_set_zd_ext(n, ns, zone, state);
> +if (status == NVME_SUCCESS) {
> +trace_pci_nvme_zd_extension_set(zone_idx);
> +return status;
> +}
>  break;
>  
>  default:
> @@ -1528,7 +1569,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -if (zra == NVME_ZONE_REPORT_EXTENDED) {
> +if (zra == NVME_ZONE_REPORT_EXTENDED && !n->params.zd_extension_size) {
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> @@ -1540,6 +1581,9 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  partial = (dw13 >> 16) & 0x01;
>  
>  zone_entry_sz = sizeof(NvmeZoneDescr);
> +if (zra == NVME_ZONE_REPORT_EXTENDED) {
> +zone_entry_sz += n->params.zd_extension_size;
> +}
>  
>  max_zones = (len - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
>  buf = g_malloc0(len);
> @@ -1571,6 +1615,14 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  z->wp = cpu_to_le64(~0ULL);
>  }
>  
> +if (zra == NVME_ZONE_REPORT_EXTENDED) {
> +if (zs->d.za & NVME_ZA_ZD_EXT_VALID) {
> +memcpy(buf_p, nvme_get_zd_extension(n, ns, zone_index),
> +   n->params.zd_extension_size);
> +}
> +buf_p += n->params.zd_extension_size;
> +}
> +
>  zone_index++;
>  }
>  
> @@ -2337,7 +2389,7 @@ static uint16_t nvme_handle_changed_zone_log(NvmeCtrl 
> *n, NvmeCmd *cmd,
>  continue;
>  }
>  num_aen_zones++;
> -if (zone->d.za) {
> +if (zone->d.za & ~NVME_ZA_ZD_EXT_VALID) {
>  tr

Re: [PATCH v9 14/34] qcow2: Add QCow2SubclusterType and qcow2_get_subcluster_type()

2020-07-01 Thread Alberto Garcia
On Wed 01 Jul 2020 02:52:14 PM CEST, Max Reitz wrote:
>>  if (l2_entry & QCOW_OFLAG_COMPRESSED) {
>>  return QCOW2_CLUSTER_COMPRESSED;
>> -} else if (l2_entry & QCOW_OFLAG_ZERO) {
>> +} else if ((l2_entry & QCOW_OFLAG_ZERO) && !has_subclusters(s)) {
>
> OK, so now qcow2_get_cluster_type() reports zero clusters to be normal
> or unallocated clusters when there are subclusters.  Seems weird to
> me, because zero clusters are invalid clusters then.

I'm actually hesitant about this.

In extended L2 entries QCOW_OFLAG_ZERO does not have any meaning so
technically it doesn't need to be checked any more than the other
reserved bits (1 to 8).

The reason why we would want to check it is, of course, because that bit
does have a meaning in regular L2 entries.

But that bit is ignored in images with subclusters so the only reason
why we would check it is to report corruption, not because we need to
know its value.

It's true that we do check it in v2 images, although in that case the
entries are otherwise identical and there is a way to convert between
both types.

> I preferred just reporting them as zero clusters and letting the
> caller deal with it, because it does mean an error in the image and so
> it should be reported.

Another alternative would be to add QCOW2_CLUSTER_INVALID and we could
even include there other cases like unaligned offsets and things like
that. But that would also affect the code that repairs corrupted images.

Berto



Re: [PATCH v2 13/18] hw/block/nvme: Set Finish/Reset Zone Recommended attributes

2020-07-01 Thread Klaus Jensen
On Jun 18 06:34, Dmitry Fomichev wrote:
> Added logic to set and reset FZR and RZR zone attributes. Four new
> driver properties are added to control the timing of setting and
> resetting these attributes. FZR/RZR delay lasts from the zone
> operation and until when the corresponding zone attribute is set.
> FZR/RZR limits set the time period between setting FZR or RZR
> attribute and resetting it simulating the internal controller action
> on that zone.
> 
> Signed-off-by: Dmitry Fomichev 

Please correct me if I am wrong here, but I want to raise a question
about the use of QEMU_CLOCK_REALTIME here. I agree that it makes sense
that the limits are "absolute", but does this hold for emulation? In my
view, when emulation is stopped, the world is stopped. Should we emulate
the need for background operations in this case? I don't think so.

> ---
>  hw/block/nvme.c | 99 +
>  hw/block/nvme.h | 13 ++-
>  2 files changed, 111 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index a29cbfcc96..c3898448c7 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -201,6 +201,84 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, 
> NvmeNamespace *ns)
>  assert(ns->nr_active_zones >= 0);
>  }
>  
> +static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone)
> +{
> +assert(zone->flags & NVME_ZFLAGS_SET_RZR);
> +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +zone->flags &= ~NVME_ZFLAGS_TS_DELAY;
> +zone->d.za |= NVME_ZA_RESET_RECOMMENDED;
> +zone->flags &= ~NVME_ZFLAGS_SET_RZR;
> +trace_pci_nvme_zone_reset_recommended(zone->d.zslba);
> +}
> +
> +static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace *ns,
> +NvmeZone *zone, bool notify)
> +{
> +if (n->params.rrl_usec) {
> +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY);
> +notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED);
> +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED;
> +zone->tstamp = 0;
> +}
> +}
> +
> +static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone)
> +{
> +assert(zone->flags & NVME_ZFLAGS_SET_FZR);
> +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +zone->flags &= ~NVME_ZFLAGS_TS_DELAY;
> +zone->d.za |= NVME_ZA_FINISH_RECOMMENDED;
> +zone->flags &= ~NVME_ZFLAGS_SET_FZR;
> +trace_pci_nvme_zone_finish_recommended(zone->d.zslba);
> +}
> +
> +static void nvme_clear_fzr(NvmeCtrl *n, NvmeNamespace *ns,
> +NvmeZone *zone, bool notify)
> +{
> +if (n->params.frl_usec) {
> +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY);
> +notify = notify && (zone->d.za & NVME_ZA_FINISH_RECOMMENDED);
> +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED;
> +zone->tstamp = 0;
> +}
> +}
> +
> +static void nvme_schedule_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone)
> +{
> +if (n->params.frl_usec) {
> +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY);
> +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED;
> +zone->tstamp = 0;
> +}
> +if (n->params.rrl_usec) {
> +zone->flags |= NVME_ZFLAGS_SET_RZR;
> +if (n->params.rzr_delay_usec) {
> +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +zone->flags |= NVME_ZFLAGS_TS_DELAY;
> +} else {
> +nvme_set_rzr(n, ns, zone);
> +}
> +}
> +}
> +
> +static void nvme_schedule_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone)
> +{
> +if (n->params.rrl_usec) {
> +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY);
> +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED;
> +zone->tstamp = 0;
> +}
> +if (n->params.frl_usec) {
> +zone->flags |= NVME_ZFLAGS_SET_FZR;
> +if (n->params.fzr_delay_usec) {
> +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +zone->flags |= NVME_ZFLAGS_TS_DELAY;
> +} else {
> +nvme_set_fzr(n, ns, zone);
> +}
> +}
> +}
> +
>  static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns,
>  NvmeZone *zone, uint8_t state)
>  {
> @@ -208,15 +286,19 @@ static void nvme_assign_zone_state(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  switch (nvme_get_zone_state(zone)) {
>  case NVME_ZONE_STATE_EXPLICITLY_OPEN:
>  nvme_remove_zone(n, ns, ns->exp_open_zones, zone);
> +nvme_clear_fzr(n, ns, zone, false);
>  break;
>  case NVME_ZONE_STATE_IMPLICITLY_OPEN:
>  nvme_remove_zone(n, ns, ns->imp_open_zones, zone);
> +nvme_clear_fzr(n, ns, zone, false);
>  break;
>  case NVME_ZONE_STATE_CLOSED:
>  nvme_remove_zone(n, ns, ns->closed_zones, zone);
> +nvme_clear_fzr(n, ns, zone, false);
>  break;
>  case NVME_ZONE_STATE_FULL:
>  nvme_remove_zone(n, ns, ns->f

Re: [PATCH] MAINTAINERS: Cover "block/nvme.h" file

2020-07-01 Thread Klaus Jensen
On Jul  1 16:06, Philippe Mathieu-Daudé wrote:
> The "block/nvme.h" header is shared by both the NVMe block
> driver and the NVMe emulated device. Add the 'F:' entry on
> both sections, so all maintainers/reviewers are notified
> when it is changed.
> 
> Signed-off-by: Philippe Mathieu-Daudé 

Reviewed-by: Klaus Jensen 

> ---
> Cc: Fam Zheng 
> Cc: Keith Busch 
> Cc: Kevin Wolf 
> Cc: Max Reitz 
> Cc: Niklas Cassel 
> Cc: Klaus Jensen 
> Cc: Klaus Jensen 
> Cc: Maxim Levitsky 
> Cc: Dmitry Fomichev 
> Cc: Andrzej Jakowski 
> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index dec252f38b..9995cdc805 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1795,6 +1795,7 @@ M: Keith Busch 
>  L: qemu-block@nongnu.org
>  S: Supported
>  F: hw/block/nvme*
> +F: include/block/nvme.h
>  F: tests/qtest/nvme-test.c
>  
>  megasas
> @@ -2804,6 +2805,7 @@ M: Fam Zheng 
>  L: qemu-block@nongnu.org
>  S: Supported
>  F: block/nvme*
> +F: include/block/nvme.h
>  
>  Bootdevice
>  M: Gonglei 
> -- 
> 2.21.3
> 
> 



[PATCH 2/3] util: support detailed error reporting for qemu_open

2020-07-01 Thread Daniel P . Berrangé
Create a "qemu_open_err" method which does the same as "qemu_open",
but with a "Error **errp" for error reporting. There should be no
behavioural difference for existing callers at this stage.

Signed-off-by: Daniel P. Berrangé 
---
 include/qemu/osdep.h |  1 +
 util/osdep.c | 71 +++-
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 0d26a1b9bd..e41701a308 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -483,6 +483,7 @@ int qemu_madvise(void *addr, size_t len, int advice);
 int qemu_mprotect_rwx(void *addr, size_t size);
 int qemu_mprotect_none(void *addr, size_t size);
 
+int qemu_open_err(const char *name, int flags, Error **errp, ...);
 int qemu_open(const char *name, int flags, ...);
 int qemu_close(int fd);
 int qemu_unlink(const char *name);
diff --git a/util/osdep.c b/util/osdep.c
index 4bdbe81cec..450b3a5da3 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qapi/error.h"
 
 /* Needed early for CONFIG_BSD etc. */
 
@@ -282,7 +283,7 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, 
bool exclusive)
 /*
  * Opens a file with FD_CLOEXEC set
  */
-int qemu_open(const char *name, int flags, ...)
+static int qemu_openv(const char *name, int flags, Error **errp, va_list ap)
 {
 int ret;
 int mode = 0;
@@ -297,24 +298,31 @@ int qemu_open(const char *name, int flags, ...)
 
 fdset_id = qemu_parse_fdset(fdset_id_str);
 if (fdset_id == -1) {
+error_setg(errp, "Unable to parse fdset %s", name);
 errno = EINVAL;
 return -1;
 }
 
 fd = monitor_fdset_get_fd(fdset_id, flags);
 if (fd < 0) {
+error_setg_errno(errp, -fd, "Unable acquire FD for %s flags %x",
+ name, flags);
 errno = -fd;
 return -1;
 }
 
 dupfd = qemu_dup_flags(fd, flags);
 if (dupfd == -1) {
+error_setg_errno(errp, errno, "Unable dup FD for %s flags %x",
+ name, flags);
 return -1;
 }
 
 ret = monitor_fdset_dup_fd_add(fdset_id, dupfd);
 if (ret == -1) {
 close(dupfd);
+error_setg(errp, "Unable save FD for %s flags %x",
+   name, flags);
 errno = EINVAL;
 return -1;
 }
@@ -324,11 +332,7 @@ int qemu_open(const char *name, int flags, ...)
 #endif
 
 if (flags & O_CREAT) {
-va_list ap;
-
-va_start(ap, flags);
 mode = va_arg(ap, int);
-va_end(ap);
 }
 
 #ifdef O_CLOEXEC
@@ -340,25 +344,64 @@ int qemu_open(const char *name, int flags, ...)
 }
 #endif
 
+if (ret == -1) {
+const char *action = "open";
+if (flags & O_CREAT) {
+action = "create";
+}
 #ifdef O_DIRECT
-if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
-int newflags = flags & ~O_DIRECT;
+if (errno == EINVAL && (flags & O_DIRECT)) {
+int newflags = flags & ~O_DIRECT;
 # ifdef O_CLOEXEC
-ret = open(name, newflags | O_CLOEXEC, mode);
+ret = open(name, newflags | O_CLOEXEC, mode);
 # else
-ret = open(name, newflags, mode);
+ret = open(name, newflags, mode);
 # endif
-if (ret != -1) {
-close(ret);
-error_report("file system does not support O_DIRECT");
-errno = EINVAL;
+if (ret != -1) {
+close(ret);
+error_setg(errp, "Unable to %s '%s' flags 0x%x: "
+   "filesystem does not support O_DIRECT",
+   action, name, flags);
+if (!errp) {
+error_report("file system does not support O_DIRECT");
+}
+errno = EINVAL;
+return -1;
+}
 }
-}
 #endif /* O_DIRECT */
+error_setg_errno(errp, errno, "Unable to %s '%s' flags 0x%x",
+ action, name, flags);
+}
+
 
 return ret;
 }
 
+int qemu_open_err(const char *name, int flags, Error **errp, ...)
+{
+va_list ap;
+int rv;
+
+va_start(ap, errp);
+rv = qemu_openv(name, flags, errp, ap);
+va_end(ap);
+
+return rv;
+}
+
+int qemu_open(const char *name, int flags, ...)
+{
+va_list ap;
+int rv;
+
+va_start(ap, flags);
+rv = qemu_openv(name, flags, NULL, ap);
+va_end(ap);
+
+return rv;
+}
+
 int qemu_close(int fd)
 {
 int64_t fdset_id;
-- 
2.26.2




[PATCH 1/3] util: validate whether O_DIRECT is supported after failure

2020-07-01 Thread Daniel P . Berrangé
Currently we suggest that a filesystem may not support O_DIRECT after
seeing an EINVAL. Other things can cause EINVAL though, so it is better
to do an explicit check, and then report a definitive error message.

Signed-off-by: Daniel P. Berrangé 
---
 util/osdep.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/util/osdep.c b/util/osdep.c
index 4829c07ff6..4bdbe81cec 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -342,8 +342,17 @@ int qemu_open(const char *name, int flags, ...)
 
 #ifdef O_DIRECT
 if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
-error_report("file system may not support O_DIRECT");
-errno = EINVAL; /* in case it was clobbered */
+int newflags = flags & ~O_DIRECT;
+# ifdef O_CLOEXEC
+ret = open(name, newflags | O_CLOEXEC, mode);
+# else
+ret = open(name, newflags, mode);
+# endif
+if (ret != -1) {
+close(ret);
+error_report("file system does not support O_DIRECT");
+errno = EINVAL;
+}
 }
 #endif /* O_DIRECT */
 
-- 
2.26.2




[PATCH 0/3] block: improve error reporting for unsupported O_DIRECT

2020-07-01 Thread Daniel P . Berrangé
To repeat the commit message from patch 3...

Currently at startup if using cache=none on a filesystem lacking
O_DIRECT such as tmpfs, at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: file system may not 
support O_DIRECT
qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Could not open 
'/tmp/foo.img': Invalid argument

while at QMP level the hint is missing, so QEMU reports just

  "error": {
  "class": "GenericError",
  "desc": "Could not open '/tmp/foo.img': Invalid argument"
  }

which is close to useless for the end user trying to figure out what
they did wrong

With this change at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Unable to open 
'/tmp/foo.img' flags 0x4000: filesystem does not support O_DIRECT

while at the QMP level QEMU reports a massively more informative

  "error": {
 "class": "GenericError",
 "desc": "Unable to open '/tmp/foo.img' flags 0x4002: filesystem does not 
support O_DIRECT"
  }


qemu_open is used in many more places besides block layer, but
converting those to qemu_open_err is left as an exercise for
other maintainers.

Daniel P. Berrangé (3):
  util: validate whether O_DIRECT is supported after failure
  util: support detailed error reporting for qemu_open
  block: switch to use qemu_open_err for improved errors

 block/file-posix.c   | 10 +++
 include/qemu/osdep.h |  1 +
 util/osdep.c | 70 ++--
 3 files changed, 66 insertions(+), 15 deletions(-)

-- 
2.26.2





Re: [PATCH v2 12/12] block/nvme: Use per-queue AIO context

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:18PM +0200, Philippe Mathieu-Daudé wrote:
> To be able to use multiple queues on the same hardware,
> we need to have each queue able to receive IRQ notifications
> in the correct AIO context.
> The AIO context and the notification handler have to be proper
> to each queue, not to the block driver. Move aio_context and
> irq_notifier from BDRVNVMeState to NVMeQueuePair.

If I understand correctly, after this patch:

1. Each queue pair has an EventNotifier irq_notifier but only the admin
   queuepair's irq_notifier is hooked up to VFIO. This means all
   interrupts come via the admin queuepair.

   (This also means all queuepairs still need to be polled for
   completions when the interrupt fires because we don't know which
   queuepair had a completion event.)

2. AioContexts must be identical across all queuepairs and
   BlockDriverStates. Although they all have their own AioContext
   pointer there is no true support for different AioContexts yet.

   (For example, nvme_cmd_sync() is called with a bs argument but
   AIO_WAIT_WHILE(q->aio_context, ...) uses the queuepair aio_context so
   the assumption is that they match.)

Please confirm and add something equivalent into the commit description
so the assumptions/limitations are clear.


signature.asc
Description: PGP signature


[PATCH 3/3] block: switch to use qemu_open_err for improved errors

2020-07-01 Thread Daniel P . Berrangé
Currently at startup if using cache=none on a filesystem lacking
O_DIRECT such as tmpfs, at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: file system may not 
support O_DIRECT
qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Could not open 
'/tmp/foo.img': Invalid argument

while at QMP level the hint is missing, so QEMU reports just

  "error": {
  "class": "GenericError",
  "desc": "Could not open '/tmp/foo.img': Invalid argument"
  }

which is close to useless for the end user trying to figure out what
they did wrong

With this change at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Unable to open 
'/tmp/foo.img' flags 0x4000: filesystem does not support O_DIRECT

while at the QMP level QEMU reports a massively more informative

  "error": {
 "class": "GenericError",
 "desc": "Unable to open '/tmp/foo.img' flags 0x4002: filesystem does not 
support O_DIRECT"
  }

Signed-off-by: Daniel P. Berrangé 
---
 block/file-posix.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 3ab8f5a0fa..2865b789fb 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -574,11 +574,10 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 raw_parse_flags(bdrv_flags, &s->open_flags, false);
 
 s->fd = -1;
-fd = qemu_open(filename, s->open_flags, 0644);
+fd = qemu_open_err(filename, s->open_flags, errp, 0644);
 ret = fd < 0 ? -errno : 0;
 
 if (ret < 0) {
-error_setg_file_open(errp, -ret, filename);
 if (ret == -EROFS) {
 ret = -EACCES;
 }
@@ -970,9 +969,8 @@ static int raw_reconfigure_getfd(BlockDriverState *bs, int 
flags,
 ret = raw_normalize_devicepath(&normalized_filename, errp);
 if (ret >= 0) {
 assert(!(*open_flags & O_CREAT));
-fd = qemu_open(normalized_filename, *open_flags);
+fd = qemu_open_err(normalized_filename, *open_flags, errp);
 if (fd == -1) {
-error_setg_errno(errp, errno, "Could not reopen file");
 return -1;
 }
 }
@@ -2324,10 +2322,10 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 }
 
 /* Create file */
-fd = qemu_open(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
+fd = qemu_open_err(file_opts->filename, O_RDWR | O_CREAT | O_BINARY,
+   errp, 0644);
 if (fd < 0) {
 result = -errno;
-error_setg_errno(errp, -result, "Could not create file");
 goto out;
 }
 
-- 
2.26.2




Re: [PATCH v2 06/12] block/nvme: Use union of NvmeIdCtrl / NvmeIdNs structures

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:12PM +0200, Philippe Mathieu-Daudé wrote:
> We allocate an unique chunk of memory then use it for two
> different structures. By using an union, we make it clear
> the data is overlapping (and we can remove the casts).
> 
> Suggested-by: Stefan Hajnoczi 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  block/nvme.c | 31 +++
>  1 file changed, 15 insertions(+), 16 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH v2 08/12] block/nvme: Replace qemu_try_blockalign(bs) by qemu_try_memalign(pg_sz)

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:14PM +0200, Philippe Mathieu-Daudé wrote:
> qemu_try_blockalign() is a generic API that call back to the
> block driver to return its page alignment. As we call from
> within the very same driver, we already know to page alignment
> stored in our state. Remove indirections and use the value from
> BDRVNVMeState.
> This change is required to later remove the BlockDriverState
> argument, to make nvme_init_queue() per hardware, and not per
> block driver.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  block/nvme.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH v2 05/12] block/nvme: Rename local variable

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:11PM +0200, Philippe Mathieu-Daudé wrote:
> We are going to modify the code in the next commit. Renaming
> the 'resp' variable to 'id' first makes the next commit easier
> to review. No logical changes.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  block/nvme.c | 19 +--
>  1 file changed, 9 insertions(+), 10 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH v2 02/12] block/nvme: Avoid further processing if trace event not enabled

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:08PM +0200, Philippe Mathieu-Daudé wrote:
> Avoid further processing if TRACE_NVME_SUBMIT_COMMAND_RAW is
> not enabled. This is an untested intend of performance optimization.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  block/nvme.c | 3 +++
>  1 file changed, 3 insertions(+)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH v2 01/12] block/nvme: Replace magic value by SCALE_MS definition

2020-07-01 Thread Stefan Hajnoczi
On Tue, Jun 30, 2020 at 09:13:07PM +0200, Philippe Mathieu-Daudé wrote:
> Use self-explicit SCALE_MS definition instead of magic value.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  block/nvme.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH 1/4] migration: Prevent memleak by ...params_test_apply

2020-07-01 Thread Eric Blake

On 6/30/20 3:45 AM, Max Reitz wrote:

The created structure is not really a proper QAPI object, so we cannot
and will not free its members.  Strings therein should therefore not be
duplicated, or we will leak them.


This seems fragile to me; having to code QAPI usage differently 
depending on whether the containing struct was malloc'd or not (and 
therefore whether someone will call qapi_free_MigrateSetParameters or 
not) looks awkward to maintain.  We have 
visit_type_MigrateSetParameters_members, could that be used as a cleaner 
way to free all members of the struct without freeing the struct itself? 
 Should the QAPI generator start generating qapi_free_FOO_members to 
make such cleanup easier?




Signed-off-by: Max Reitz 
---
  migration/migration.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 481a590f72..47c7da4e55 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1336,12 +1336,12 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
  
  if (params->has_tls_creds) {

  assert(params->tls_creds->type == QTYPE_QSTRING);
-dest->tls_creds = g_strdup(params->tls_creds->u.s);
+dest->tls_creds = params->tls_creds->u.s;
  }
  
  if (params->has_tls_hostname) {

  assert(params->tls_hostname->type == QTYPE_QSTRING);
-dest->tls_hostname = g_strdup(params->tls_hostname->u.s);
+dest->tls_hostname = params->tls_hostname->u.s;
  }
  
  if (params->has_max_bandwidth) {




--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 2/4] migration: Add block-bitmap-mapping parameter

2020-07-01 Thread Vladimir Sementsov-Ogievskiy

30.06.2020 11:45, Max Reitz wrote:

This migration parameter allows mapping block node names and bitmap
names to aliases for the purpose of block dirty bitmap migration.

This way, management tools can use different node and bitmap names on
the source and destination and pass the mapping of how bitmaps are to be
transferred to qemu (on the source, the destination, or even both with
arbitrary aliases in the migration stream).

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Max Reitz 
---
  qapi/migration.json|  83 +++-
  migration/migration.h  |   3 +
  migration/block-dirty-bitmap.c | 372 -
  migration/migration.c  |  29 +++
  4 files changed, 432 insertions(+), 55 deletions(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index d5000558c6..5aeae9bea8 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -507,6 +507,44 @@
'data': [ 'none', 'zlib',
  { 'name': 'zstd', 'if': 'defined(CONFIG_ZSTD)' } ] }
  
+##

+# @BitmapMigrationBitmapAlias:
+#
+# @name: The name of the bitmap.
+#
+# @alias: An alias name for migration (for example the bitmap name on
+# the opposite site).
+#
+# Since: 5.1
+##
+{ 'struct': 'BitmapMigrationBitmapAlias',
+  'data': {
+  'name': 'str',
+  'alias': 'str'
+  } }
+
+##
+# @BitmapMigrationNodeAlias:
+#
+# Maps a block node name and the bitmaps it has to aliases for dirty
+# bitmap migration.
+#
+# @node-name: A block node name.
+#
+# @alias: An alias block node name for migration (for example the
+# node name on the opposite site).
+#
+# @bitmaps: Mappings for the bitmaps on this node.
+#
+# Since: 5.1
+##
+{ 'struct': 'BitmapMigrationNodeAlias',
+  'data': {
+  'node-name': 'str',
+  'alias': 'str',
+  'bitmaps': [ 'BitmapMigrationBitmapAlias' ]


So, we still can't migrate bitmaps from one node to different nodes, but we
also don't know a usecase for it, so it seems OK. But with such scheme we
can select and rename bitmaps in-flight, and Peter said about corresponding
use-case.

I'm OK with this, still, just an idea in my mind:

we could instead just have a list of

BitmapMigrationAlias: {
 node-name
 bitmap-name
 node-alias
 bitmap-alias
}

so, mapping is set for each bitmap in separate.


+  } }
+
  ##
  # @MigrationParameter:
  #
@@ -641,6 +679,18 @@
  #  will consume more CPU.
  #  Defaults to 1. (Since 5.0)
  #
+# @block-bitmap-mapping: Maps block nodes and bitmaps on them to
+#  aliases for the purpose of dirty bitmap migration.  Such
+#  aliases may for example be the corresponding names on the
+#  opposite site.
+#  The mapping must be one-to-one and complete: On the source,


I've recently faced the case where incomplete mapping make sence: shared
migration of read-only bitmaps in backing files: it seems better to not
migrate them through migration channel, they are migrating through
shared storage automatically (yes, we'll pay the time to load them on
destination, but I'm going to improve it by implementing lazy load of
read-only and disabled bitmaps, so this will not be a problem).

So, now I think that it would be good just ignore all the bitmap not
described by mapping


+#  migrating a bitmap from a node when either is not mapped
+#  will result in an error.  On the destination, similarly,
+#  receiving a bitmap (by alias) from a node (by alias) when
+#  either alias is not mapped will result in an error.
+#  By default, all node names and bitmap names are mapped to
+#  themselves. (Since 5.1)


This sentense is unclear, want means "by default" - if the mapping is
not specified at all or if some nodes/bitmaps are not covered. Still,
tha latter will conflict with previous sentencies, so "by default" is
about when mapping is not set at all. I suggest to word it directly:
"If mapping is not set (the command never called, or mapping was
removed".

And, actual behavior without mapping is not as simple: it actually tries
to use blk names if possible and node-names if not, and this veries
from version to version. So, I think not worth to document it in detail,
just note, that without mapping the behavior is not well defined and
tries to use block-device name if possible and node-name otherwise. And
of course direct mapping of bitmap names.


+#
  # Since: 2.4
  ##
  { 'enum': 'MigrationParameter',
@@ -655,7 +705,8 @@
 'multifd-channels',
 'xbzrle-cache-size', 'max-postcopy-bandwidth',
 'max-cpu-throttle', 'multifd-compression',
-   'multifd-zlib-level' ,'multifd-zstd-level' ] }
+   'multifd-zlib-level' ,'multifd-zstd-level',
+   'block-bitmap-mapping' ] }
  
  ##

  # @MigrateSetParameters:
@@ -781,6 +832,18 @@
  #  will consume more CPU.
  #  Defaults to 1. (Since 5.0)
  #
+# @block-bitmap-mapping: Maps block nodes and bitmaps on them to
+#  aliases 

Re: nvme emulation merge process (was: Re: [PATCH 00/10] hw/block/nvme: namespace types and zoned namespaces)

2020-07-01 Thread Keith Busch
On Wed, Jul 01, 2020 at 03:57:27PM +0200, Philippe Mathieu-Daudé wrote:
> On 7/1/20 3:18 PM, Klaus Jensen wrote:
> > We've also seen good patches from Andrzej linger on the list for quite a
> > while, prompting a number of RESENDs. I only recently allocated more
> > time and upped my review game, but I hope that contributors feel that
> > stuff gets reviewed in a timely fashion by now.
> > 
> > Please understand that this is in NO WAY a criticism of Keith who
> > already made it very clear to me that he did not have a lot time to
> > review, but only ack the odd patch.
> > 
> >> Depending on how much time Keith can spend on review in the
> >> near future and how much control he wants to keep over the development,
> >> I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
> >> or as a reviewer. Then I could rely on reviews/acks from either of you
> >> for merging series.
> >>
> > 
> > I would be happy to step up (officially) to help maintain the device
> > with Keith and review on a daily basis, and my position can support
> > this.
> 
> Sounds good to me, but it is up to Keith Busch to accept.

I definitely want to continue at least having the opprotunity to review,
though you may have noticed I am a bit low on time for more thorough
maintenance this project deserves. The recent development pace for nvme
would benefit from having its own tree, so I'm open to either
co-maintenance, or handing off this to others. Please allow me to send a
few queries off-list today to check-in with potentially interested parties.
 
> It would be nice to have at least one developer from WDC listed as
> designated reviewer too.
> 
> Maxim is candidate for designated reviewer but I think he doesn't
> have the time.
> 
> It would also nice to have Andrzej Jakowski listed, if he is interested.



[PATCH] MAINTAINERS: Cover "block/nvme.h" file

2020-07-01 Thread Philippe Mathieu-Daudé
The "block/nvme.h" header is shared by both the NVMe block
driver and the NVMe emulated device. Add the 'F:' entry on
both sections, so all maintainers/reviewers are notified
when it is changed.

Signed-off-by: Philippe Mathieu-Daudé 
---
Cc: Fam Zheng 
Cc: Keith Busch 
Cc: Kevin Wolf 
Cc: Max Reitz 
Cc: Niklas Cassel 
Cc: Klaus Jensen 
Cc: Klaus Jensen 
Cc: Maxim Levitsky 
Cc: Dmitry Fomichev 
Cc: Andrzej Jakowski 
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index dec252f38b..9995cdc805 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1795,6 +1795,7 @@ M: Keith Busch 
 L: qemu-block@nongnu.org
 S: Supported
 F: hw/block/nvme*
+F: include/block/nvme.h
 F: tests/qtest/nvme-test.c
 
 megasas
@@ -2804,6 +2805,7 @@ M: Fam Zheng 
 L: qemu-block@nongnu.org
 S: Supported
 F: block/nvme*
+F: include/block/nvme.h
 
 Bootdevice
 M: Gonglei 
-- 
2.21.3




Re: nvme emulation merge process (was: Re: [PATCH 00/10] hw/block/nvme: namespace types and zoned namespaces)

2020-07-01 Thread Philippe Mathieu-Daudé
On 7/1/20 3:18 PM, Klaus Jensen wrote:
> On Jul  1 12:34, Kevin Wolf wrote:
>> Am 30.06.2020 um 22:36 hat Klaus Jensen geschrieben:
>>> On Jun 30 08:42, Keith Busch wrote:
 On Tue, Jun 30, 2020 at 04:09:46PM +0200, Philippe Mathieu-Daudé wrote:
> What I see doable for the following days is:
> - hw/block/nvme: Fix I/O BAR structure [3]
> - hw/block/nvme: handle transient dma errors
> - hw/block/nvme: bump to v1.3


 These look like sensible patches to rebase future work on, IMO. The 1.3
 updates had been prepared a while ago, at least.
>>>
>>> I think Philippe's "hw/block/nvme: Fix I/O BAR structure" series is a
>>> no-brainer. It just needs to get in asap.
>>
>> I think we need to talk about how nvme patches are supposed to get
>> merged. I'm not familiar with the hardware nor the code, so the model
>> was that I just blindly merge patches that Keith has reviewed/acked,
>> just to spare him the work to prepare a pull request. But obviously, we
>> started doing things this way when there was a lot less activity around
>> the nvme emulation.
>>
>> If we find that this doesn't scale any more, maybe we need to change
>> something.
> 
> Honestly, I do not think the current model has worked very well for some
> time; especially for larger series where I, for one, has felt that my
> work was largely ignored due to a lack of designated reviewers. Things
> only picked up when Beata, Maxim and Philippe started reviewing my
> series - maybe out of pity or because I was bombing the list, I don't
> know ;)

I have no interest in the NVMe device emulation, but one of the first
thing I notice when I look at the wiki the time I wanted to send my
first patch, is the "Return the favor" paragraph:
https://wiki.qemu.org/Contribute/SubmitAPatch#Return_the_favor

 "Peer review only works if everyone chips in a bit of review time.
  If everyone submitted more patches than they reviewed, we would
  have a patch backlog. A good goal is to try to review at least as
  many patches from others as what you submit. Don't worry if you
  don't know the code base as well as a maintainer; it's perfectly
  fine to admit when your review is weak because you are unfamiliar
  with the code."

So as some reviewed my patches, I try to return the favor to the
community, in particular when I see someone is stuck waiting for
review, and the patch topic is some area I can understand.

I don't see that as an "out of pity" reaction.

Note, it is true bomb series scares reviewers. You learned it the
bad way. But you can see, after resending the first part of your
"bomb", even if it took 10 versions, the result is a great
improvement!

> We've also seen good patches from Andrzej linger on the list for quite a
> while, prompting a number of RESENDs. I only recently allocated more
> time and upped my review game, but I hope that contributors feel that
> stuff gets reviewed in a timely fashion by now.
> 
> Please understand that this is in NO WAY a criticism of Keith who
> already made it very clear to me that he did not have a lot time to
> review, but only ack the odd patch.
> 
>> Depending on how much time Keith can spend on review in the
>> near future and how much control he wants to keep over the development,
>> I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
>> or as a reviewer. Then I could rely on reviews/acks from either of you
>> for merging series.
>>
> 
> I would be happy to step up (officially) to help maintain the device
> with Keith and review on a daily basis, and my position can support
> this.

Sounds good to me, but it is up to Keith Busch to accept.

It would be nice to have at least one developer from WDC listed as
designated reviewer too.

Maxim is candidate for designated reviewer but I think he doesn't
have the time.

It would also nice to have Andrzej Jakowski listed, if he is interested.

> 
>> Of course, the patches don't necessarily have to go through my tree
>> either if this only serves to complicate things these days. If sending
>> separate pull requests directly to Peter would make things easier, I
>> certainly wouldn't object.
>>
> 
> I don't think there is any reason to by-pass your tree. I think the
> volume would need to increase even further for that to make sense.
> 




[PATCH v4 02/40] iotests: Fix 051 output after qdev_init_nofail() removal

2020-07-01 Thread Alex Bennée
From: Philippe Mathieu-Daudé 

Commit 96927c744 replaced qdev_init_nofail() call by
isa_realize_and_unref() which has a different error
message. Update the test output accordingly.

Gitlab CI error after merging b77b5b3dc7:
https://gitlab.com/qemu-project/qemu/-/jobs/597414772#L4375

Reported-by: Thomas Huth 
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Alex Bennée 
Reviewed-by: John Snow 
Reviewed-by: Thomas Huth 
Message-Id: <20200616154949.6586-1-phi...@redhat.com>
---
 tests/qemu-iotests/051.pc.out | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out
index 0ea80d35f0e..da8ad871876 100644
--- a/tests/qemu-iotests/051.pc.out
+++ b/tests/qemu-iotests/051.pc.out
@@ -142,7 +142,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
 
 Testing: -drive if=ide
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) QEMU_PROG: Initialization of device ide-hd failed: Device needs media, 
but drive is empty
+(qemu) QEMU_PROG: Device needs media, but drive is empty
 
 Testing: -drive if=virtio
 QEMU X.Y.Z monitor - type 'help' for more information
@@ -214,7 +214,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=ide,readonly=on
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) QEMU_PROG: Initialization of device ide-hd failed: Block node is 
read-only
+(qemu) QEMU_PROG: Block node is read-only
 
 Testing: -drive file=TEST_DIR/t.qcow2,if=virtio,readonly=on
 QEMU X.Y.Z monitor - type 'help' for more information
-- 
2.20.1




Re: [PATCH v9 16/34] qcow2: Add qcow2_cluster_is_allocated()

2020-07-01 Thread Max Reitz
On 28.06.20 13:02, Alberto Garcia wrote:
> This helper function tells us if a cluster is allocated (that is,
> there is an associated host offset for it).
> 
> Signed-off-by: Alberto Garcia 
> Reviewed-by: Eric Blake 
> ---
>  block/qcow2.h | 6 ++
>  1 file changed, 6 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


[PATCH 1/2] iscsi: handle check condition status in retry loop

2020-07-01 Thread Xie Yongji
The handling of check condition was incorrect because
we would only do it after retries exceed maximum.

Fixes: 8c460269aa ("iscsi: base all handling of check condition on 
scsi_sense_to_errno")
Signed-off-by: Xie Yongji 
---
 block/iscsi.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index a8b76979d8..2964c9f8d2 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -266,16 +266,16 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 timer_mod(&iTask->retry_timer,
   qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + retry_time);
 iTask->do_retry = 1;
-}
-} else if (status == SCSI_STATUS_CHECK_CONDITION) {
-int error = iscsi_translate_sense(&task->sense);
-if (error == EAGAIN) {
-error_report("iSCSI CheckCondition: %s",
- iscsi_get_error(iscsi));
-iTask->do_retry = 1;
-} else {
-iTask->err_code = -error;
-iTask->err_str = g_strdup(iscsi_get_error(iscsi));
+} else if (status == SCSI_STATUS_CHECK_CONDITION) {
+int error = iscsi_translate_sense(&task->sense);
+if (error == EAGAIN) {
+error_report("iSCSI CheckCondition: %s",
+ iscsi_get_error(iscsi));
+iTask->do_retry = 1;
+} else {
+iTask->err_code = -error;
+iTask->err_str = g_strdup(iscsi_get_error(iscsi));
+}
 }
 }
 }
-- 
2.11.0




[PATCH 2/2] iscsi: return -EIO when sense fields are meaningless

2020-07-01 Thread Xie Yongji
When an I/O request failed, now we only return correct
value on scsi check condition. We should also have a
default errno such as -EIO in other case.

Signed-off-by: Xie Yongji 
---
 block/iscsi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 2964c9f8d2..387ed872ef 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -241,9 +241,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
status,
 
 iTask->status = status;
 iTask->do_retry = 0;
+iTask->err_code = 0;
 iTask->task = task;
 
 if (status != SCSI_STATUS_GOOD) {
+iTask->err_code = -EIO;
 if (iTask->retries++ < ISCSI_CMD_RETRIES) {
 if (status == SCSI_STATUS_BUSY ||
 status == SCSI_STATUS_TIMEOUT ||
-- 
2.11.0




Re: [PATCH v9 15/34] qcow2: Add qcow2_get_subcluster_range_type()

2020-07-01 Thread Max Reitz
On 28.06.20 13:02, Alberto Garcia wrote:
> There are situations in which we want to know how many contiguous
> subclusters of the same type there are in a given cluster. This can be
> done by simply iterating over the subclusters and repeatedly calling
> qcow2_get_subcluster_type() for each one of them.
> 
> However once we determined the type of a subcluster we can check the
> rest efficiently by counting the number of adjacent ones (or zeroes)
> in the bitmap. This is what this function does.
> 
> Signed-off-by: Alberto Garcia 
> Reviewed-by: Eric Blake 
> ---
>  block/qcow2-cluster.c | 51 +++
>  1 file changed, 51 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: nvme emulation merge process (was: Re: [PATCH 00/10] hw/block/nvme: namespace types and zoned namespaces)

2020-07-01 Thread Maxim Levitsky
On Wed, 2020-07-01 at 15:18 +0200, Klaus Jensen wrote:
> On Jul  1 12:34, Kevin Wolf wrote:
> > Am 30.06.2020 um 22:36 hat Klaus Jensen geschrieben:
> > > On Jun 30 08:42, Keith Busch wrote:
> > > > On Tue, Jun 30, 2020 at 04:09:46PM +0200, Philippe Mathieu-Daudé wrote:
> > > > > What I see doable for the following days is:
> > > > > - hw/block/nvme: Fix I/O BAR structure [3]
> > > > > - hw/block/nvme: handle transient dma errors
> > > > > - hw/block/nvme: bump to v1.3
> > > > 
> > > > These look like sensible patches to rebase future work on, IMO. The 1.3
> > > > updates had been prepared a while ago, at least.
> > > 
> > > I think Philippe's "hw/block/nvme: Fix I/O BAR structure" series is a
> > > no-brainer. It just needs to get in asap.
> > 
> > I think we need to talk about how nvme patches are supposed to get
> > merged. I'm not familiar with the hardware nor the code, so the model
> > was that I just blindly merge patches that Keith has reviewed/acked,
> > just to spare him the work to prepare a pull request. But obviously, we
> > started doing things this way when there was a lot less activity around
> > the nvme emulation.
> > 
> > If we find that this doesn't scale any more, maybe we need to change
> > something.
> 
> Honestly, I do not think the current model has worked very well for some
> time; especially for larger series where I, for one, has felt that my
> work was largely ignored due to a lack of designated reviewers. Things
> only picked up when Beata, Maxim and Philippe started reviewing my
> series - maybe out of pity or because I was bombing the list, I don't
> know ;)
> 
> We've also seen good patches from Andrzej linger on the list for quite a
> while, prompting a number of RESENDs. I only recently allocated more
> time and upped my review game, but I hope that contributors feel that
> stuff gets reviewed in a timely fashion by now.
> 
> Please understand that this is in NO WAY a criticism of Keith who
> already made it very clear to me that he did not have a lot time to
> review, but only ack the odd patch.
> 
> > Depending on how much time Keith can spend on review in the
> > near future and how much control he wants to keep over the development,
> > I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
> > or as a reviewer. Then I could rely on reviews/acks from either of you
> > for merging series.
> > 
> 
> I would be happy to step up (officially) to help maintain the device
> with Keith and review on a daily basis, and my position can support
> this.
> 
> > Of course, the patches don't necessarily have to go through my tree
> > either if this only serves to complicate things these days. If sending
> > separate pull requests directly to Peter would make things easier, I
> > certainly wouldn't object.
> > 
> 
> I don't think there is any reason to by-pass your tree. I think the
> volume would need to increase even further for that to make sense.
> 
It my fault as well - I need to get back to reviewing these.
(I'll review few of them today I hope)

Best regards,
Maxim Levitsky




Re: nvme emulation merge process (was: Re: [PATCH 00/10] hw/block/nvme: namespace types and zoned namespaces)

2020-07-01 Thread Klaus Jensen
On Jul  1 12:34, Kevin Wolf wrote:
> Am 30.06.2020 um 22:36 hat Klaus Jensen geschrieben:
> > On Jun 30 08:42, Keith Busch wrote:
> > > On Tue, Jun 30, 2020 at 04:09:46PM +0200, Philippe Mathieu-Daudé wrote:
> > > > What I see doable for the following days is:
> > > > - hw/block/nvme: Fix I/O BAR structure [3]
> > > > - hw/block/nvme: handle transient dma errors
> > > > - hw/block/nvme: bump to v1.3
> > > 
> > > 
> > > These look like sensible patches to rebase future work on, IMO. The 1.3
> > > updates had been prepared a while ago, at least.
> > 
> > I think Philippe's "hw/block/nvme: Fix I/O BAR structure" series is a
> > no-brainer. It just needs to get in asap.
> 
> I think we need to talk about how nvme patches are supposed to get
> merged. I'm not familiar with the hardware nor the code, so the model
> was that I just blindly merge patches that Keith has reviewed/acked,
> just to spare him the work to prepare a pull request. But obviously, we
> started doing things this way when there was a lot less activity around
> the nvme emulation.
> 
> If we find that this doesn't scale any more, maybe we need to change
> something.

Honestly, I do not think the current model has worked very well for some
time; especially for larger series where I, for one, has felt that my
work was largely ignored due to a lack of designated reviewers. Things
only picked up when Beata, Maxim and Philippe started reviewing my
series - maybe out of pity or because I was bombing the list, I don't
know ;)

We've also seen good patches from Andrzej linger on the list for quite a
while, prompting a number of RESENDs. I only recently allocated more
time and upped my review game, but I hope that contributors feel that
stuff gets reviewed in a timely fashion by now.

Please understand that this is in NO WAY a criticism of Keith who
already made it very clear to me that he did not have a lot time to
review, but only ack the odd patch.

> Depending on how much time Keith can spend on review in the
> near future and how much control he wants to keep over the development,
> I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
> or as a reviewer. Then I could rely on reviews/acks from either of you
> for merging series.
> 

I would be happy to step up (officially) to help maintain the device
with Keith and review on a daily basis, and my position can support
this.

> Of course, the patches don't necessarily have to go through my tree
> either if this only serves to complicate things these days. If sending
> separate pull requests directly to Peter would make things easier, I
> certainly wouldn't object.
> 

I don't think there is any reason to by-pass your tree. I think the
volume would need to increase even further for that to make sense.



Re: [PATCH 0/2] hw/block/nvme: handle transient dma errors

2020-07-01 Thread Philippe Mathieu-Daudé
On 6/29/20 11:34 PM, Klaus Jensen wrote:
> On Jun 29 14:07, no-re...@patchew.org wrote:
>> Patchew URL: 
>> https://patchew.org/QEMU/20200629202053.1223342-1-...@irrelevant.dk/

>> --- /tmp/qemu-test/src/tests/qemu-iotests/040.out   2020-06-29 
>> 20:12:10.0 +
>> +++ /tmp/qemu-test/build/tests/qemu-iotests/040.out.bad 2020-06-29 
>> 20:58:48.288790818 +
>> @@ -1,3 +1,5 @@
>> +WARNING:qemu.machine:qemu received signal 9: 
>> /tmp/qemu-test/build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>  -display none -vga none -chardev 
>> socket,id=mon,path=/tmp/tmp.Jdol0fPScQ/qemu-21749-monitor.sock -mon 
>> chardev=mon,mode=control -qtest 
>> unix:path=/tmp/tmp.Jdol0fPScQ/qemu-21749-qtest.sock -accel qtest -nodefaults 
>> -display none -accel qtest
>> +WARNING:qemu.machine:qemu received signal 9: 
>> /tmp/qemu-test/build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
>>  -display none -vga none -chardev 
>> socket,id=mon,path=/tmp/tmp.Jdol0fPScQ/qemu-21749-monitor.sock -mon 
>> chardev=mon,mode=control -qtest 
>> unix:path=/tmp/tmp.Jdol0fPScQ/qemu-21749-qtest.sock -accel qtest -nodefaults 
>> -display none -accel qtest

Kevin, Max, can iotests/040 be affected by this change?

> 
> 
> Hmm, I can't seem to reproduce this locally and the test succeeded on
> the next series[1] that is based on this.
> 
> Is this a flaky test? Or a bad test runner? I'm of course worried when
> a qcow2 test fails and I touch something else than the nvme device ;)
> 
> 
>   [1]: https://patchew.org/QEMU/20200629203155.1236860-1-...@irrelevant.dk/
> 




Re: [PATCH v9 14/34] qcow2: Add QCow2SubclusterType and qcow2_get_subcluster_type()

2020-07-01 Thread Max Reitz
On 28.06.20 13:02, Alberto Garcia wrote:
> This patch adds QCow2SubclusterType, which is the subcluster-level
> version of QCow2ClusterType. All QCOW2_SUBCLUSTER_* values have the
> the same meaning as their QCOW2_CLUSTER_* equivalents (when they
> exist). See below for details and caveats.
> 
> In images without extended L2 entries clusters are treated as having
> exactly one subcluster so it is possible to replace one data type with
> the other while keeping the exact same semantics.
> 
> With extended L2 entries there are new possible values, and every
> subcluster in the same cluster can obviously have a different
> QCow2SubclusterType so functions need to be adapted to work on the
> subcluster level.
> 
> There are several things that have to be taken into account:
> 
>   a) QCOW2_SUBCLUSTER_COMPRESSED means that the whole cluster is
>  compressed. We do not support compression at the subcluster
>  level.
> 
>   b) There are two different values for unallocated subclusters:
>  QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN which means that the whole
>  cluster is unallocated, and QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC
>  which means that the cluster is allocated but the subcluster is
>  not. The latter can only happen in images with extended L2
>  entries.
> 
>   c) QCOW2_SUBCLUSTER_INVALID is used to detect the cases where an L2
>  entry has a value that violates the specification. The caller is
>  responsible for handling these situations.
> 
>  To prevent compatibility problems with images that have invalid
>  values but are currently being read by QEMU without causing side
>  effects, QCOW2_SUBCLUSTER_INVALID is only returned for images
>  with extended L2 entries.
> 
> qcow2_cluster_to_subcluster_type() is added as a separate function
> from qcow2_get_subcluster_type(), but this is only temporary and both
> will be merged in a subsequent patch.
> 
> Signed-off-by: Alberto Garcia 
> Reviewed-by: Eric Blake 
> ---
>  block/qcow2.h | 126 +-
>  1 file changed, 125 insertions(+), 1 deletion(-)
> 
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 82b86f6cec..3aec6f452a 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h

[...]

> @@ -634,9 +686,11 @@ static inline int64_t 
> qcow2_vm_state_offset(BDRVQcow2State *s)
>  static inline QCow2ClusterType qcow2_get_cluster_type(BlockDriverState *bs,
>uint64_t l2_entry)
>  {
> +BDRVQcow2State *s = bs->opaque;
> +
>  if (l2_entry & QCOW_OFLAG_COMPRESSED) {
>  return QCOW2_CLUSTER_COMPRESSED;
> -} else if (l2_entry & QCOW_OFLAG_ZERO) {
> +} else if ((l2_entry & QCOW_OFLAG_ZERO) && !has_subclusters(s)) {

OK, so now qcow2_get_cluster_type() reports zero clusters to be normal
or unallocated clusters when there are subclusters.  Seems weird to me,
because zero clusters are invalid clusters then.  I preferred just
reporting them as zero clusters and letting the caller deal with it,
because it does mean an error in the image and so it should be reported.

So...

>  if (l2_entry & L2E_OFFSET_MASK) {
>  return QCOW2_CLUSTER_ZERO_ALLOC;
>  }

[...]

> +/*
> + * In an image without subsclusters @l2_bitmap is ignored and
> + * @sc_index must be 0.
> + * Return QCOW2_SUBCLUSTER_INVALID if an invalid l2 entry is detected
> + * (this checks the whole entry and bitmap, not only the bits related
> + * to subcluster @sc_index).
> + */
> +static inline
> +QCow2SubclusterType qcow2_get_subcluster_type(BlockDriverState *bs,
> +  uint64_t l2_entry,
> +  uint64_t l2_bitmap,
> +  unsigned sc_index)
> +{
> +BDRVQcow2State *s = bs->opaque;
> +QCow2ClusterType type = qcow2_get_cluster_type(bs, l2_entry);
> +assert(sc_index < s->subclusters_per_cluster);
> +
> +if (has_subclusters(s)) {
> +switch (type) {
> +case QCOW2_CLUSTER_COMPRESSED:
> +return QCOW2_SUBCLUSTER_COMPRESSED;
> +case QCOW2_CLUSTER_NORMAL:
> +if ((l2_bitmap >> 32) & l2_bitmap) {
> +return QCOW2_SUBCLUSTER_INVALID;
> +} else if (l2_bitmap & QCOW_OFLAG_SUB_ZERO(sc_index)) {
> +return QCOW2_SUBCLUSTER_ZERO_ALLOC;
> +} else if (l2_bitmap & QCOW_OFLAG_SUB_ALLOC(sc_index)) {
> +return QCOW2_SUBCLUSTER_NORMAL;
> +} else {
> +return QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC;
> +}
> +case QCOW2_CLUSTER_UNALLOCATED:
> +if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
> +return QCOW2_SUBCLUSTER_INVALID;
> +} else if (l2_bitmap & QCOW_OFLAG_SUB_ZERO(sc_index)) {
> +return QCOW2_SUBCLUSTER_ZERO_PLAIN;
> +} else {
> +return QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
> + 

Re: [PATCH v9 13/34] qcow2: Update get/set_l2_entry() and add get/set_l2_bitmap()

2020-07-01 Thread Max Reitz
On 28.06.20 13:02, Alberto Garcia wrote:
> Extended L2 entries are 128-bit wide: 64 bits for the entry itself and
> 64 bits for the subcluster allocation bitmap.
> 
> In order to support them correctly get/set_l2_entry() need to be
> updated so they take the entry width into account in order to
> calculate the correct offset.
> 
> This patch also adds the get/set_l2_bitmap() functions that are
> used to access the bitmaps. For convenience we allow calling
> get_l2_bitmap() on images without subclusters. In this case the
> returned value is always 0 and has no meaning.
> 
> Signed-off-by: Alberto Garcia 
> Reviewed-by: Eric Blake 
> ---
>  block/qcow2.h | 21 +
>  1 file changed, 21 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v9 11/34] qcow2: Add offset_into_subcluster() and size_to_subclusters()

2020-07-01 Thread Max Reitz
On 28.06.20 13:02, Alberto Garcia wrote:
> Like offset_into_cluster() and size_to_clusters(), but for
> subclusters.
> 
> Signed-off-by: Alberto Garcia 
> Reviewed-by: Eric Blake 
> ---
>  block/qcow2.h | 10 ++
>  1 file changed, 10 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 1/2] pci: pass along the return value of dma_memory_rw

2020-07-01 Thread Michael S. Tsirkin
On Mon, Jun 29, 2020 at 10:20:52PM +0200, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Some devices might want to know the return value of dma_memory_rw, so
> pass it along instead of ignoring it.
> 
> There are no existing users of the return value, so this patch should be
> safe.
> 
> Signed-off-by: Klaus Jensen 
> Reviewed-by: Philippe Mathieu-Daudé 
> Reviewed-by: Michael S. Tsirkin 
> Acked-by: Keith Busch 

Feel free to merge with patch 2/2.

> ---
>  include/hw/pci/pci.h | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index a4e9c3341615..2347dc36bfb5 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -786,8 +786,7 @@ static inline AddressSpace 
> *pci_get_address_space(PCIDevice *dev)
>  static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
>   void *buf, dma_addr_t len, DMADirection dir)
>  {
> -dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
> -return 0;
> +return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
>  }
>  
>  static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
> -- 
> 2.27.0




Re: [PATCH v2 14/18] hw/block/nvme: Generate zone AENs

2020-07-01 Thread Klaus Jensen
On Jun 18 06:34, Dmitry Fomichev wrote:
> Added an optional Boolean "zone_async_events" property to the driver.
> Once it's turned on, the namespace will be sending "Zone Descriptor
> Changed" asynchronous events to the host in particular situations
> defined by the protocol. In order to clear these AENs, the host needs
> to read the newly added Changed Zones Log.
> 
> Signed-off-by: Dmitry Fomichev 

This was a tough review ;)


  * I don't like the monkey patching of the completion queue path to
handle AERs and it took me way too much time to figure out what was
going on with the extra timer_mod's on the cq->timer.

Please consider taking a look at

  
https://github.com/birkelund/qemu/commit/928a6ead98ba3b0a293d90496c3fa54d51a052a5

which is already reviewed and gets AERs right I think. But if my
v1.3 series are merged, that will be in-tree anyway.

  * Handling the RRL and FRL delays and limits can be handled using a
single timer like I'm doing here in my version of the ZNS
emulation:

  
https://github.com/birkelund/qemu/blob/for-master/nvme/hw/block/nvme-ns.c#L52

This is infinitely more efficient since it removes the need for
continuously kicking the event loop every 10ms. And this patch
*really* needs to get get rid of that polling ;)


More comments inline.


> ---
>  hw/block/nvme.c  | 300 ++-
>  hw/block/nvme.h  |  13 +-
>  include/block/nvme.h |  23 +++-
>  3 files changed, 328 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c3898448c7..b9135a6b1f 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -201,12 +201,66 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, 
> NvmeNamespace *ns)
>  assert(ns->nr_active_zones >= 0);
>  }
>  
> +static bool nvme_complete_async_req(NvmeCtrl *n, NvmeNamespace *ns,
> +enum NvmeAsyncEventType type, uint8_t info)
> +{
> +NvmeAsyncEvent *ae;
> +uint32_t nsid = 0;
> +uint8_t log_page = 0;
> +
> +switch (type) {
> +case NVME_AER_TYPE_ERROR:
> +case NVME_AER_TYPE_SMART:
> +break;
> +case NVME_AER_TYPE_NOTICE:
> +switch (info) {
> +case NVME_AER_NOTICE_ZONE_DESCR_CHANGED:
> +log_page = NVME_LOG_ZONE_CHANGED_LIST;
> +nsid = ns->nsid;
> +if (!(n->ae_cfg & NVME_AEN_CFG_ZONE_DESCR_CHNGD_NOTICES)) {
> +trace_pci_nvme_zone_ae_not_enabled(info, log_page, nsid);
> +return false;
> +}
> +if (ns->aen_pending) {
> +trace_pci_nvme_zone_ae_not_cleared(info, log_page, nsid);
> +return false;
> +}
> +ns->aen_pending = true;
> +}
> +break;
> +case NVME_AER_TYPE_CMDSET_SPECIFIC:
> +case NVME_AER_TYPE_VENDOR_SPECIFIC:
> +break;
> +}
> +
> +ae = g_malloc0(sizeof(*ae));
> +ae->res = type;
> +ae->res |= (info << 8) & 0xff00;
> +ae->res |= (log_page << 16) & 0xff;
> +ae->nsid = nsid;
> +
> +QTAILQ_INSERT_TAIL(&n->async_reqs, ae, entry);
> +timer_mod(n->admin_cq.timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 
> 500);
> +return true;
> +}
> +
> +static inline void nvme_notify_zone_changed(NvmeCtrl *n, NvmeNamespace *ns,
> +NvmeZone *zone)
> +{
> +if (n->ae_cfg) {
> +zone->flags |= NVME_ZFLAGS_AEN_PEND;
> +nvme_complete_async_req(n, ns, NVME_AER_TYPE_NOTICE,
> +NVME_AER_NOTICE_ZONE_DESCR_CHANGED);
> +}
> +}
> +
>  static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone)
>  {
>  assert(zone->flags & NVME_ZFLAGS_SET_RZR);
>  zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  zone->flags &= ~NVME_ZFLAGS_TS_DELAY;
>  zone->d.za |= NVME_ZA_RESET_RECOMMENDED;
> +nvme_notify_zone_changed(n, ns, zone);
>  zone->flags &= ~NVME_ZFLAGS_SET_RZR;
>  trace_pci_nvme_zone_reset_recommended(zone->d.zslba);
>  }
> @@ -215,10 +269,14 @@ static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace 
> *ns,
>  NvmeZone *zone, bool notify)
>  {
>  if (n->params.rrl_usec) {
> -zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY);
> +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY |
> + NVME_ZFLAGS_AEN_PEND);
>  notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED);
>  zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED;
>  zone->tstamp = 0;
> +if (notify) {
> +nvme_notify_zone_changed(n, ns, zone);
> +}
>  }
>  }
>  
> @@ -228,6 +286,7 @@ static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeZone *zone)
>  zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  zone->flags &= ~NVME_ZFLAGS_TS_DELAY;
>  zone->d.za |= NVME_ZA_FINISH_RECOMMENDED;
> +nvme_notify_zone_changed(n, ns, zone);
>  zone->flags &= ~NVME_ZFLAGS_SET_FZR;
>  trace_pci_nv

Re: [PATCH 2/2] iscsi: return -EIO when sense fields are meaningless

2020-07-01 Thread Paolo Bonzini
On 01/07/20 12:54, Xie Yongji wrote:
> When an I/O request failed, now we only return correct
> value on scsi check condition. We should also have a
> default errno such as -EIO in other case.
> 
> Signed-off-by: Xie Yongji 
> ---
>  block/iscsi.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/block/iscsi.c b/block/iscsi.c
> index 2964c9f8d2..387ed872ef 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -241,9 +241,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int 
> status,
>  
>  iTask->status = status;
>  iTask->do_retry = 0;
> +iTask->err_code = 0;
>  iTask->task = task;
>  
>  if (status != SCSI_STATUS_GOOD) {
> +iTask->err_code = -EIO;
>  if (iTask->retries++ < ISCSI_CMD_RETRIES) {
>  if (status == SCSI_STATUS_BUSY ||
>  status == SCSI_STATUS_TIMEOUT ||
> 

Queued both, thanks.

Paolo




Re: [PATCH 1/4] migration: Prevent memleak by ...params_test_apply

2020-07-01 Thread Vladimir Sementsov-Ogievskiy

30.06.2020 11:45, Max Reitz wrote:

The created structure is not really a proper QAPI object, so we cannot
and will not free its members.  Strings therein should therefore not be
duplicated, or we will leak them.

Signed-off-by: Max Reitz


Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir



[PATCH v2 3/5] nbd: make nbd_export_close_all() synchronous

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
Consider nbd_export_close_all(). The call-stack looks like this:
 nbd_export_close_all() -> nbd_export_close -> call client_close() for
each client.

client_close() doesn't guarantee that client is closed: nbd_trip()
keeps reference to it. So, nbd_export_close_all() just reduce
reference counter on export and removes it from the list, but doesn't
guarantee that nbd_trip() finished neither export actually removed.

Let's wait for all exports actually removed.

Without this fix, the following crash is possible:

- export bitmap through internal Qemu NBD server
- connect a client
- shutdown Qemu

On shutdown nbd_export_close_all is called, but it actually don't wait
for nbd_trip() to finish and to release its references. So, export is
not release, and exported bitmap remains busy, and on try to remove the
bitmap (which is part of bdrv_close()) the assertion fails:

bdrv_release_dirty_bitmap_locked: Assertion `!bdrv_dirty_bitmap_busy(bitmap)' 
failed

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

v2: rewritten, try to wait exports directly.

Note: I'm not sure in my understanding of AIO_WAIT_WHILE and related things
and really hope for review.


 nbd/server.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/nbd/server.c b/nbd/server.c
index 20754e9ebc..9d64b00f4b 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -102,6 +102,8 @@ struct NBDExport {
 };
 
 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
+static QTAILQ_HEAD(, NBDExport) closed_exports =
+QTAILQ_HEAD_INITIALIZER(closed_exports);
 
 /* NBDExportMetaContexts represents a list of contexts to be exported,
  * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
@@ -1655,6 +1657,7 @@ void nbd_export_close(NBDExport *exp)
 g_free(exp->name);
 exp->name = NULL;
 QTAILQ_REMOVE(&exports, exp, next);
+QTAILQ_INSERT_TAIL(&closed_exports, exp, next);
 }
 g_free(exp->description);
 exp->description = NULL;
@@ -1717,7 +1720,9 @@ void nbd_export_put(NBDExport *exp)
 g_free(exp->export_bitmap_context);
 }
 
+QTAILQ_REMOVE(&closed_exports, exp, next);
 g_free(exp);
+aio_wait_kick();
 }
 }
 
@@ -1737,6 +1742,9 @@ void nbd_export_close_all(void)
 nbd_export_close(exp);
 aio_context_release(aio_context);
 }
+
+AIO_WAIT_WHILE(NULL, !(QTAILQ_EMPTY(&exports) &&
+   QTAILQ_EMPTY(&closed_exports)));
 }
 
 static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
-- 
2.18.0




[PATCH v2 1/5] iotests: QemuIoInteractive: use qemu_io_args_no_fmt

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
The only user (iotest 205) of QemuIoInteractive provides -f argument,
so it's a bit inefficient to use qemu_io_args, which contains -f too.
And we are going to add one more test, which wants to specify -f by
hand. Let's use qemu_io_args_no_fmt.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
---
 tests/qemu-iotests/iotests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 5ea4c4df8b..efe9958f5e 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -211,7 +211,7 @@ def get_virtio_scsi_device():
 
 class QemuIoInteractive:
 def __init__(self, *args):
-self.args = qemu_io_args + list(args)
+self.args = qemu_io_args_no_fmt + list(args)
 self._p = subprocess.Popen(self.args, stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
-- 
2.18.0




[PATCH v2 5/5] iotests: test shutdown when bitmap is exported through NBD

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
Test shutdown when bitmap is exported through NBD and active client
exists. The previous patch fixes a crash, provoked by this scenario.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/299 | 65 ++
 tests/qemu-iotests/299.out | 10 ++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 76 insertions(+)
 create mode 100644 tests/qemu-iotests/299
 create mode 100644 tests/qemu-iotests/299.out

diff --git a/tests/qemu-iotests/299 b/tests/qemu-iotests/299
new file mode 100644
index 00..e129c7f7cb
--- /dev/null
+++ b/tests/qemu-iotests/299
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+#
+# Test shutdown when bitmap is exported through NBD server
+#
+# Copyright (c) 2020 Virtuozzo International GmbH.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import iotests
+
+# The test is unrelated to formats, restrict it to qcow2 to avoid extra runs
+iotests.script_initialize(
+supported_fmts=['qcow2'],
+)
+
+nbd_sock = iotests.file_path('nbd.sock', base_dir=iotests.sock_dir)
+nbd_uri = 'nbd+unix:///disk?socket=' + nbd_sock
+size = 1024 * 1024
+
+vm = iotests.VM()
+vm.launch()
+
+vm.qmp_log('blockdev-add', **{
+'node-name': 'disk',
+'driver': 'null-co',
+'size': 1024 * 1024,
+})
+
+vm.qmp_log('block-dirty-bitmap-add', **{
+'node': 'disk',
+'name': 'bitmap0'
+})
+
+vm.qmp_log('nbd-server-start', **{
+'addr': {
+'type': 'unix',
+'data': {'path': nbd_sock}
+}
+}, filters=[iotests.filter_qmp_testfiles])
+
+vm.qmp_log('nbd-server-add', **{
+'device': 'disk',
+'writable': True,
+'bitmap': 'bitmap0'
+})
+
+p = iotests.QemuIoInteractive('-f', 'raw', nbd_uri)
+# wait for connection and check it:
+iotests.log(p.cmd('read 0 512').rstrip(), filters=[iotests.filter_qemu_io])
+
+vm.shutdown()
+
+p.close()
diff --git a/tests/qemu-iotests/299.out b/tests/qemu-iotests/299.out
new file mode 100644
index 00..bba4252923
--- /dev/null
+++ b/tests/qemu-iotests/299.out
@@ -0,0 +1,10 @@
+{"execute": "blockdev-add", "arguments": {"driver": "null-co", "node-name": 
"disk", "size": 1048576}}
+{"return": {}}
+{"execute": "block-dirty-bitmap-add", "arguments": {"name": "bitmap0", "node": 
"disk"}}
+{"return": {}}
+{"execute": "nbd-server-start", "arguments": {"addr": {"data": {"path": 
"SOCK_DIR/PID-nbd.sock"}, "type": "unix"}}}
+{"return": {}}
+{"execute": "nbd-server-add", "arguments": {"bitmap": "bitmap0", "device": 
"disk", "writable": true}}
+{"return": {}}
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index d886fa0cb3..250192352c 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -302,3 +302,4 @@
 291 rw quick
 292 rw auto quick
 297 meta
+299 auto quick
-- 
2.18.0




[PATCH v2 4/5] iotests.py: filter_testfiles(): filter SOCK_DIR too

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index ac9d199a1e..31d4b105ca 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -345,8 +345,9 @@ def filter_qmp(qmsg, filter_fn):
 return qmsg
 
 def filter_testfiles(msg):
-prefix = os.path.join(test_dir, "%s-" % (os.getpid()))
-return msg.replace(prefix, 'TEST_DIR/PID-')
+pref1 = os.path.join(test_dir, "%s-" % (os.getpid()))
+pref2 = os.path.join(sock_dir, "%s-" % (os.getpid()))
+return msg.replace(pref1, 'TEST_DIR/PID-').replace(pref2, 'SOCK_DIR/PID-')
 
 def filter_qmp_testfiles(qmsg):
 def _filter(_key, value):
-- 
2.18.0




[PATCH v2 2/5] iotests.py: QemuIoInteractive: print output on failure

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
Make it simpler to debug when qemu-io fails due to wrong arguments or
environment.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
---
 tests/qemu-iotests/iotests.py | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index efe9958f5e..ac9d199a1e 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -216,7 +216,13 @@ class QemuIoInteractive:
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True)
-assert self._p.stdout.read(9) == 'qemu-io> '
+out = self._p.stdout.read(9)
+if out != 'qemu-io> ':
+# Most probably qemu-io just failed to start.
+# Let's collect the whole output and exit.
+out += self._p.stdout.read()
+self._p.wait(timeout=1)
+raise ValueError(out)
 
 def close(self):
 self._p.communicate('q\n')
-- 
2.18.0




[PATCH v2 0/5] Fix crash due to NBD export leak

2020-07-01 Thread Vladimir Sementsov-Ogievskiy
Hi all!

We've faced crash bug, which is reproducing on master branch as well.
The case is described in 03, where fix is suggested.
New iotest in 05 crashes without that fix.

v2:
01: reword commit msg, add Eric's r-b
02: fix type in commit msg, add Eric's r-b
03: rewrite
v1:04 split into 04 (iotests.py improvement) and 05(new test)


Vladimir Sementsov-Ogievskiy (5):
  iotests: QemuIoInteractive: use qemu_io_args_no_fmt
  iotests.py: QemuIoInteractive: print output on failure
  nbd: make nbd_export_close_all() synchronous
  iotests.py: filter_testfiles(): filter SOCK_DIR too
  iotests: test shutdown when bitmap is exported through NBD

 nbd/server.c  |  8 +
 tests/qemu-iotests/299| 65 +++
 tests/qemu-iotests/299.out| 10 ++
 tests/qemu-iotests/group  |  1 +
 tests/qemu-iotests/iotests.py | 15 +---
 5 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 tests/qemu-iotests/299
 create mode 100644 tests/qemu-iotests/299.out

-- 
2.18.0




nvme emulation merge process (was: Re: [PATCH 00/10] hw/block/nvme: namespace types and zoned namespaces)

2020-07-01 Thread Kevin Wolf
Am 30.06.2020 um 22:36 hat Klaus Jensen geschrieben:
> On Jun 30 08:42, Keith Busch wrote:
> > On Tue, Jun 30, 2020 at 04:09:46PM +0200, Philippe Mathieu-Daudé wrote:
> > > What I see doable for the following days is:
> > > - hw/block/nvme: Fix I/O BAR structure [3]
> > > - hw/block/nvme: handle transient dma errors
> > > - hw/block/nvme: bump to v1.3
> > 
> > 
> > These look like sensible patches to rebase future work on, IMO. The 1.3
> > updates had been prepared a while ago, at least.
> 
> I think Philippe's "hw/block/nvme: Fix I/O BAR structure" series is a
> no-brainer. It just needs to get in asap.

I think we need to talk about how nvme patches are supposed to get
merged. I'm not familiar with the hardware nor the code, so the model
was that I just blindly merge patches that Keith has reviewed/acked,
just to spare him the work to prepare a pull request. But obviously, we
started doing things this way when there was a lot less activity around
the nvme emulation.

If we find that this doesn't scale any more, maybe we need to change
something. Depending on how much time Keith can spend on review in the
near future and how much control he wants to keep over the development,
I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
or as a reviewer. Then I could rely on reviews/acks from either of you
for merging series.

Of course, the patches don't necessarily have to go through my tree
either if this only serves to complicate things these days. If sending
separate pull requests directly to Peter would make things easier, I
certainly wouldn't object.

Kevin




Re: [PATCH 2/4] migration: Add block-bitmap-mapping parameter

2020-07-01 Thread Max Reitz
On 30.06.20 12:51, Dr. David Alan Gilbert wrote:
> * Max Reitz (mre...@redhat.com) wrote:
>> This migration parameter allows mapping block node names and bitmap
>> names to aliases for the purpose of block dirty bitmap migration.
>>
>> This way, management tools can use different node and bitmap names on
>> the source and destination and pass the mapping of how bitmaps are to be
>> transferred to qemu (on the source, the destination, or even both with
>> arbitrary aliases in the migration stream).
>>
>> Suggested-by: Vladimir Sementsov-Ogievskiy 
>> Signed-off-by: Max Reitz 
>> ---
>>  qapi/migration.json|  83 +++-
>>  migration/migration.h  |   3 +
>>  migration/block-dirty-bitmap.c | 372 -
>>  migration/migration.c  |  29 +++
>>  4 files changed, 432 insertions(+), 55 deletions(-)
>>
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index d5000558c6..5aeae9bea8 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -507,6 +507,44 @@
>>'data': [ 'none', 'zlib',
>>  { 'name': 'zstd', 'if': 'defined(CONFIG_ZSTD)' } ] }
>>  
>> +##
>> +# @BitmapMigrationBitmapAlias:
>> +#
>> +# @name: The name of the bitmap.
>> +#
>> +# @alias: An alias name for migration (for example the bitmap name on
>> +# the opposite site).
>> +#
>> +# Since: 5.1
>> +##
>> +{ 'struct': 'BitmapMigrationBitmapAlias',
>> +  'data': {
>> +  'name': 'str',
>> +  'alias': 'str'
>> +  } }
>> +
>> +##
>> +# @BitmapMigrationNodeAlias:
>> +#
>> +# Maps a block node name and the bitmaps it has to aliases for dirty
>> +# bitmap migration.
>> +#
>> +# @node-name: A block node name.
>> +#
>> +# @alias: An alias block node name for migration (for example the
>> +# node name on the opposite site).
>> +#
>> +# @bitmaps: Mappings for the bitmaps on this node.
>> +#
>> +# Since: 5.1
>> +##
>> +{ 'struct': 'BitmapMigrationNodeAlias',
>> +  'data': {
>> +  'node-name': 'str',
>> +  'alias': 'str',
>> +  'bitmaps': [ 'BitmapMigrationBitmapAlias' ]
>> +  } }
>> +
>>  ##
>>  # @MigrationParameter:
>>  #
>> @@ -641,6 +679,18 @@
>>  #  will consume more CPU.
>>  #  Defaults to 1. (Since 5.0)
>>  #
>> +# @block-bitmap-mapping: Maps block nodes and bitmaps on them to
>> +#  aliases for the purpose of dirty bitmap migration.  Such
>> +#  aliases may for example be the corresponding names on the
>> +#  opposite site.
>> +#  The mapping must be one-to-one and complete: On the source,
>> +#  migrating a bitmap from a node when either is not mapped
>> +#  will result in an error.  On the destination, similarly,
>> +#  receiving a bitmap (by alias) from a node (by alias) when
>> +#  either alias is not mapped will result in an error.
>> +#  By default, all node names and bitmap names are mapped to
>> +#  themselves. (Since 5.1)
>> +#
>>  # Since: 2.4
>>  ##
>>  { 'enum': 'MigrationParameter',
>> @@ -655,7 +705,8 @@
>> 'multifd-channels',
>> 'xbzrle-cache-size', 'max-postcopy-bandwidth',
>> 'max-cpu-throttle', 'multifd-compression',
>> -   'multifd-zlib-level' ,'multifd-zstd-level' ] }
>> +   'multifd-zlib-level' ,'multifd-zstd-level',
>> +   'block-bitmap-mapping' ] }
>>  
>>  ##
>>  # @MigrateSetParameters:
>> @@ -781,6 +832,18 @@
>>  #  will consume more CPU.
>>  #  Defaults to 1. (Since 5.0)
>>  #
>> +# @block-bitmap-mapping: Maps block nodes and bitmaps on them to
>> +#  aliases for the purpose of dirty bitmap migration.  Such
>> +#  aliases may for example be the corresponding names on the
>> +#  opposite site.
>> +#  The mapping must be one-to-one and complete: On the source,
>> +#  migrating a bitmap from a node when either is not mapped
>> +#  will result in an error.  On the destination, similarly,
>> +#  receiving a bitmap (by alias) from a node (by alias) when
>> +#  either alias is not mapped will result in an error.
>> +#  By default, all node names and bitmap names are mapped to
>> +#  themselves. (Since 5.1)
>> +#
>>  # Since: 2.4
>>  ##
>>  # TODO either fuse back into MigrationParameters, or make
>> @@ -811,7 +874,8 @@
>>  '*max-cpu-throttle': 'int',
>>  '*multifd-compression': 'MultiFDCompression',
>>  '*multifd-zlib-level': 'int',
>> -'*multifd-zstd-level': 'int' } }
>> +'*multifd-zstd-level': 'int',
>> +'*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } }
> 
> That's a hairy type for a migration parameter!
> I'm curious what 'info migrate_parameters' does in hmp or what happens
> if you try and set it?

Oh.  I didn’t know these were accessible via HMP.

As for setting it, that fails an assertion because I thus forgot to
handle it in hmp_migrate_set_parameter().  I think the best thing 

Re: [PATCH 3/4] nbd: make client_close synchronous

2020-07-01 Thread Vladimir Sementsov-Ogievskiy

29.06.2020 16:56, Stefan Hajnoczi wrote:

On Mon, Jun 29, 2020 at 10:55:06AM +0300, Vladimir Sementsov-Ogievskiy wrote:

Also, why in block/io.c we kick the main context, but not bs->aio_context?


 From AIO_WAIT_WHILE():

  * The caller's thread must be the IOThread that owns @ctx or the main loop
  * thread (with @ctx acquired exactly once).  This function cannot be used to
  * wait on conditions between two IOThreads since that could lead to deadlock,
  * go via the main loop instead.

Case 1: IOThread

   while ((cond)) {   \
   aio_poll(ctx_, true);  \
   waited_ = true;\
   }  \

In this case aio_poll() returns after every event loop iteration and we
will re-evaluate cond. Therefore we don't need to be kicked.

Case 2: Main loop thread

In this case we need the kick since we're waiting on the main loop
AioContext, not the IOThread AioContext that is doing the actual work.
aio_wait_kick() schedules a dummy BH to wake up the main loop thread.

There is no harm in scheduling the dummy BH in the main loop thread when
AIO_WAIT_WHILE() is called from an IOThread.

Hope this helps,


Thanks!

Looking at this all again, I think that client->recv_coroutine == NULL is a bad 
marker of finish, as it's not directly bound to _put. I'll try another approach, 
to make nbd_export_close_all() be synchronous instead by waiting for all export to 
be actually freed.


--
Best regards,
Vladimir



Re: [PATCH 34/46] qom: Don't handle impossible object_property_get_link() failure

2020-07-01 Thread Markus Armbruster
Philippe Mathieu-Daudé  writes:

> On 6/25/20 5:09 PM, Markus Armbruster wrote:
>> Philippe Mathieu-Daudé  writes:
>> 
>>> On 6/24/20 6:43 PM, Markus Armbruster wrote:
 Don't handle object_property_get_link() failure that can't happen
 unless the programmer screwed up, pass &error_abort.

 Signed-off-by: Markus Armbruster 
 ---
  hw/arm/bcm2835_peripherals.c |  7 +--
  hw/arm/bcm2836.c |  7 +--
  hw/display/bcm2835_fb.c  |  8 +---
  hw/dma/bcm2835_dma.c |  9 +
  hw/gpio/bcm2835_gpio.c   | 15 ++-
  hw/intc/nios2_iic.c  |  8 +---
  hw/misc/bcm2835_mbox.c   |  9 +
  hw/misc/bcm2835_property.c   | 17 ++---
  hw/usb/hcd-dwc2.c|  9 +
  9 files changed, 11 insertions(+), 78 deletions(-)

 diff --git a/hw/arm/bcm2835_peripherals.c b/hw/arm/bcm2835_peripherals.c
 index 8313410ffe..3c40bda91e 100644
 --- a/hw/arm/bcm2835_peripherals.c
 +++ b/hw/arm/bcm2835_peripherals.c
 @@ -134,12 +134,7 @@ static void bcm2835_peripherals_realize(DeviceState 
 *dev, Error **errp)
  uint64_t ram_size, vcram_size;
  int n;
  
 -obj = object_property_get_link(OBJECT(dev), "ram", &err);
 -if (obj == NULL) {
 -error_setg(errp, "%s: required ram link not found: %s",
 -   __func__, error_get_pretty(err));
 -return;
 -}
 +obj = object_property_get_link(OBJECT(dev), "ram", &error_abort);
>>> [...]
>>>
>>> Should we now add an assert(errp) in object_property_get_link()?
>>> Basically this would force forks to adapt their code when
>>> rebasing.
>> 
>> Functions should not place additional restrictions @errp arguments
>> without a compelling reason.
>
> My compelling reason is you spent quite some time cleaning, then we'll
> merge old patches based on examples previous your cleanup, and either
> you'll have to clean again, or the codebase will get inconsistent again.

We might also merge patches that ignore errors for perfectly sane
reasons.  We'll then debug the crash, and take out the assertion again.

>> What if you want genuinely don't need the
>> error details when object_property_get_link() fails?  Passing null is
>> better than passing &err only to error_free(err).
>
> So what about:
>
> -- >8 --
> --- a/qom/object.c
> +++ b/qom/object.c
> @@ -1372,9 +1372,11 @@ void object_property_set_link(Object *obj, Object
> *value,
>  Object *object_property_get_link(Object *obj, const char *name,
>   Error **errp)
>  {
> -char *str = object_property_get_str(obj, name, errp);
> +char *str;
>  Object *target = NULL;
>
> +assert(errp == NULL || errp == &error_abort || errp == &error_fatal);
> +str = object_property_get_str(obj, name, errp);
>  if (str && *str) {
>  target = object_resolve_path(str, NULL);
>  if (!target) {
> ---

Consider an @obj that comes in two flavours, one with and one without
the link.  Code handles both, but we still want the Error object for
tracing purposes:

linked_obj = object_property_get_link(obj, "some-link-prop", &err);
if (!linked_obj) {
char *obj_name = object_get_canonical_path_component(obj);
trace_frob_get_som_link_prop_failed(obj_name, error_get_pretty(err));
g_free(obj_name);
error_free(err);
}

Such use of Error objects just for tracing exists in the tree already.

>
>> 
>>> Reviewed-by: Philippe Mathieu-Daudé 
>> 
>> Thanks!
>> 




Re: [PATCH 15/46] qemu-option: Tidy up opt_set() not to free arguments on failure

2020-07-01 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 24.06.2020 19:43, Markus Armbruster wrote:
>> opt_set() frees its argument @value on failure.  Slightly unclean;
>> functions ideally do nothing on failure.
>>
>> To tidy this up, move opt_create() from opt_set() into its callers,
>> along with the cleanup.
>
> Hmm, let me think a bit..
>
> So, prior to this patch:
>
> opt_set gets name/value pair and sets the option in opts object, it
> seems absolutely obvious and standard behavior for Map-like object.
>
> The fact that for setting an option we create a QemuOpt object, and
> somehow register it inside opts object is an implementation detail.

You explain behavior on success.  The issue is behavior on failure.

> after the patch:
>
> opt_set gets opt object, which is already registered in opts. So,
> it seems like option is "partly" set already, and opt_set only
> finalize the processing.

Yes, opt_set() becomes a bit of a misnomer: it doesn't add a QemuOpt to
@opts, it validates a QemuOpt against its key's description, if @opts
has descriptions.

Hmm, opt_set() is now almost identical to qemu_opts_validate()'s loop
body.  Perhaps I can de-duplicate.

> And, as opt_set() only finalize the "set" operation, on opt_set
> failure we need additional roll-back of "set" operation first step.
>
> Additional fact, indirectly showing that something is unclear here
> is that we pass "opts" to opt_set twice: as "opts" parameter and
> inside opt: (opt->opts must be the same, assertion won't hurt if
> you decide to keep the patch).

Valid point.

> =
>
> Semantics before the patch seems clearer to me.
>
> To improve the situation around "value", we can just g_strdup it
> in opt_create as well as "name" argument (and use const char*
> type for "value" argument as well)

We don't strdup() there because opts_do_parse() extracts the value with
get_opt_name(), which strdup()s it.  strdup()ing it some more isn't
wrong, I just dislike it.

Let me try the de-duplication, and then we'll see whether you still
dislike the patch.




Re: [PATCH 14/46] qemu-option: Factor out helper opt_create()

2020-07-01 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 24.06.2020 19:43, Markus Armbruster wrote:
>> There is just one use so far.  The next commit will add more.
>>
>> Signed-off-by: Markus Armbruster 
>> ---
>>   util/qemu-option.c | 27 ++-
>>   1 file changed, 18 insertions(+), 9 deletions(-)
>>
>> diff --git a/util/qemu-option.c b/util/qemu-option.c
>> index d9293814b4..3cdf0c0800 100644
>> --- a/util/qemu-option.c
>> +++ b/util/qemu-option.c
>> @@ -502,6 +502,23 @@ int qemu_opt_unset(QemuOpts *opts, const char *name)
>>   }
>>   }
>>   +static QemuOpt *opt_create(QemuOpts *opts, const char *name, char
>> *value,
>> +   bool prepend)
>> +{
>> +QemuOpt *opt = g_malloc0(sizeof(*opt));
>
> I'd prefer g_new0(QemuOpt, 1)

I generally prefer g_new0() over g_malloc0(), too.  But the pattern

lhs = g_malloc0(sizeof(*lhs))

is fine with me, provided sizeof(*lhs) is at least as readable as the
type of *lhs.  Looks like a wash here, so I'm refraining from messing
with the moved code.

> anyway:
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Thanks!




Re: [PATCH 11/46] qemu-option: Make uses of find_desc_by_name() more similar

2020-07-01 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 29.06.2020 12:36, Vladimir Sementsov-Ogievskiy wrote:
>> 24.06.2020 19:43, Markus Armbruster wrote:
>>> This is to make the next commit easier to review.
>>>
>>> Signed-off-by: Markus Armbruster 
>>> ---
>>>   util/qemu-option.c | 32 ++--
>>>   1 file changed, 18 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/util/qemu-option.c b/util/qemu-option.c
>>> index 6119f971a4..9941005c91 100644
>>> --- a/util/qemu-option.c
>>> +++ b/util/qemu-option.c
>>> @@ -270,6 +270,7 @@ static void qemu_opt_del_all(QemuOpts *opts, const char 
>>> *name)
>>>   const char *qemu_opt_get(QemuOpts *opts, const char *name)
>>>   {
>>>   QemuOpt *opt;
>>> +    const QemuOptDesc *desc;
>> Honestly, I don't see how this hunk helps with the following patch, which is 
>> simple anyway.
>> Keeping desc variable scope smaller seems better for me, as well as further 
>> scope of
>> def_val. (Still, keep my r-b if you don't want to change it).
>>
>
> Aha, I see, we have more similar patterns and you want them to look 
> similarly. Still, it's
> better to keep scope of variable smaller. May be a follow-up.

The variable goes away in the next patch.

I don't expect you to read PATCH n+1 before reviewing PATCH n :)




Re: [PATCH 10/46] qemu-option: Check return value instead of @err where convenient

2020-07-01 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 24.06.2020 19:43, Markus Armbruster wrote:
>> Convert uses like
>>
>>  opts = qemu_opts_create(..., &err);
>>  if (err) {
>>  ...
>>  }
>>
>> to
>>
>>  opts = qemu_opts_create(..., &err);
>>  if (!opts) {
>>  ...
>>  }
>>
>> Eliminate error_propagate() that are now unnecessary.  Delete @err
>> that are now unused.
>>
>> Signed-off-by: Markus Armbruster 
>> ---
>>   block/parallels.c  |  4 ++--
>>   blockdev.c |  5 ++---
>>   qdev-monitor.c |  6 ++
>>   util/qemu-config.c | 10 --
>>   util/qemu-option.c | 12 
>>   5 files changed, 14 insertions(+), 23 deletions(-)
>>
>> diff --git a/block/parallels.c b/block/parallels.c
>> index 860dbb80a2..b15c9ac28d 100644
>> --- a/block/parallels.c
>> +++ b/block/parallels.c
>> @@ -823,8 +823,8 @@ static int parallels_open(BlockDriverState *bs, QDict 
>> *options, int flags,
>>   }
>>   }
>>   -opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0,
>> &local_err);
>> -if (local_err != NULL) {
>> +opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, errp);
>> +if (!opts) {
>>   goto fail_options;
>>   }
>
> Honestly, I don't like this hunk. as already complicated code (crossing 
> gotos) becomes more
> complicated (add one more pattern to fail_options path: no-op 
> error_propagate).
>
> At least, we'll need a follow-up patch, refactoring parallels_open() to drop 
> "fail_options"
> label completely.

For what it's worth, this is how it looks at the end of the series:

static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
  Error **errp)
{
BDRVParallelsState *s = bs->opaque;
ParallelsHeader ph;
int ret, size, i;
QemuOpts *opts = NULL;
Error *local_err = NULL;
char *buf;

bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
   BDRV_CHILD_IMAGE, false, errp);
if (!bs->file) {
return -EINVAL;
}

ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
if (ret < 0) {
goto fail;
}

bs->total_sectors = le64_to_cpu(ph.nb_sectors);

if (le32_to_cpu(ph.version) != HEADER_VERSION) {
goto fail_format;
}
if (!memcmp(ph.magic, HEADER_MAGIC, 16)) {
s->off_multiplier = 1;
bs->total_sectors = 0x & bs->total_sectors;
} else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) {
s->off_multiplier = le32_to_cpu(ph.tracks);
} else {
goto fail_format;
}

s->tracks = le32_to_cpu(ph.tracks);
if (s->tracks == 0) {
error_setg(errp, "Invalid image: Zero sectors per track");
ret = -EINVAL;
goto fail;
}
if (s->tracks > INT32_MAX/513) {
error_setg(errp, "Invalid image: Too big cluster");
ret = -EFBIG;
goto fail;
}

s->bat_size = le32_to_cpu(ph.bat_entries);
if (s->bat_size > INT_MAX / sizeof(uint32_t)) {
error_setg(errp, "Catalog too large");
ret = -EFBIG;
goto fail;
}

size = bat_entry_off(s->bat_size);
s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs));
s->header = qemu_try_blockalign(bs->file->bs, s->header_size);
if (s->header == NULL) {
ret = -ENOMEM;
goto fail;
}
s->data_end = le32_to_cpu(ph.data_off);
if (s->data_end == 0) {
s->data_end = ROUND_UP(bat_entry_off(s->bat_size), 
BDRV_SECTOR_SIZE);
}
if (s->data_end < s->header_size) {
/* there is not enough unused space to fit to block align between 
BAT
   and actual data. We can't avoid read-modify-write... */
s->header_size = size;
}

ret = bdrv_pread(bs->file, 0, s->header, s->header_size);
if (ret < 0) {
goto fail;
}
s->bat_bitmap = (uint32_t *)(s->header + 1);

for (i = 0; i < s->bat_size; i++) {
int64_t off = bat2sect(s, i);
if (off >= s->data_end) {
s->data_end = off + s->tracks;
}
}

if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
/* Image was not closed correctly. The check is mandatory */
s->header_unclean = true;
if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
error_setg(errp, "parallels: Image was not closed correctly; "
   "cannot be opened read/write");
ret = -EACCES;
goto fail;
}
}

opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, errp);
if (!opts) {
goto fail_options;
}

if (!qemu_opts_absorb_qdict(opts, options, errp)